From 1b7ae0b673638062fc99a6605df6212b6d35eb06 Mon Sep 17 00:00:00 2001 From: clf <53817093+clingfei@users.noreply.github.com> Date: Fri, 28 Nov 2025 03:17:00 +0800 Subject: [PATCH 01/13] [AArch64] Use umin for x != 0 when +cssc is enabled (#169159) Closes https://github.com/llvm/llvm-project/issues/161584 --- .../Target/AArch64/AArch64ISelLowering.cpp | 7 +- .../test/CodeGen/AArch64/aarch64-isel-umin.ll | 336 ++++++++++++++++++ llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 4 +- 3 files changed, 344 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-isel-umin.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d379a28ea5523..694f625c094d4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11738,7 +11738,12 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } if (LHS.getValueType().isInteger()) { - + if (Subtarget->hasCSSC() && CC == ISD::SETNE && isNullConstant(RHS)) { + SDValue One = DAG.getConstant(1, DL, LHS.getValueType()); + SDValue UMin = DAG.getNode(ISD::UMIN, DL, LHS.getValueType(), LHS, One); + SDValue Res = DAG.getZExtOrTrunc(UMin, DL, VT); + return IsStrict ? 
DAG.getMergeValues({Res, Chain}, DL) : Res; + } simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL); SDValue CCVal; diff --git a/llvm/test/CodeGen/AArch64/aarch64-isel-umin.ll b/llvm/test/CodeGen/AArch64/aarch64-isel-umin.ll new file mode 100644 index 0000000000000..261f2cdad50e0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-isel-umin.ll @@ -0,0 +1,336 @@ +; RUN: llc -mtriple=aarch64-- -o - < %s | FileCheck %s --check-prefix=CHECK-SD +; RUN: llc -mtriple=aarch64-- -mattr=+cssc -o - < %s | FileCheck %s --check-prefix=CHECK-CSSC + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +; auto icmpi64(long x0) { return x0 != 0; } +define i1 @icmpi64(i64 noundef %0) { +; CHECK-SD-LABEL: icmpi64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp x0, #0 +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: icmpi64: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: umin x0, x0, #1 +; CHECK-CSSC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-CSSC-NEXT: ret +; +entry: + %2 = icmp ne i64 %0, 0 + ret i1 %2 +} + +; auto icmpi32(int x0) { return x0 != 0; } +define i1 @icmpi32(i32 noundef %0) { +; CHECK-SD-LABEL: icmpi32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: icmpi32: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: umin w0, w0, #1 +; CHECK-CSSC-NEXT: ret +; +entry: + %2 = icmp ne i32 %0, 0 + ret i1 %2 +} + +; auto icmpi16(short x0) { return x0 != 0; } +define i1 @icmpi16(i16 noundef %0) { +; CHECK-SD-LABEL: icmpi16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: tst w0, #0xffff +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: icmpi16: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: and w8, w0, #0xffff +; CHECK-CSSC-NEXT: umin w0, w8, #1 +; CHECK-CSSC-NEXT: ret +; +entry: + %2 = icmp ne i16 %0, 0 + ret i1 %2 +} + +; auto icmpi8(char 
x0) { return x0 != 0; } +define i1 @icmpi8(i8 noundef %0) { +; CHECK-SD-LABEL: icmpi8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: tst w0, #0xff +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: icmpi8: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: and w8, w0, #0xff +; CHECK-CSSC-NEXT: umin w0, w8, #1 +; CHECK-CSSC-NEXT: ret +; +entry: + %2 = icmp ne i8 %0, 0 + ret i1 %2 +} + +; unsigned long icmpi64i8(char x0) { return x0 != 0; } +define i64 @icmpi64i8(i8 noundef %0) { +; CHECK-SD-LABEL: icmpi64i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: tst w0, #0xff +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: icmpi64i8: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: and w8, w0, #0xff +; CHECK-CSSC-NEXT: umin w0, w8, #1 +; CHECK-CSSC-NEXT: ret +; +entry: + %1 = icmp ne i8 %0, 0 + %2 = zext i1 %1 to i64 + ret i64 %2 +} + +; unsigned long setcc_i8_i64(char x0) { return x0 != 0; } +define i8 @setcc_i8_i64(i64 %x) { +; CHECK-SD-LABEL: setcc_i8_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp x0, #0 +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: setcc_i8_i64: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: umin x0, x0, #1 +; CHECK-CSSC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-CSSC-NEXT: ret +; +entry: + %cmp = icmp ne i64 %x, 0 + %conv = zext i1 %cmp to i8 + ret i8 %conv +} + +; short setcc_i16_i32(int x0) { return x0 != 0; } +define i16 @setcc_i16_i32(i32 %x) { +; CHECK-SD-LABEL: setcc_i16_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: setcc_i16_i32: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: umin w0, w0, #1 +; CHECK-CSSC-NEXT: ret +; +entry: + %cmp = icmp ne i32 %x, 0 + %conv = zext i1 %cmp to i16 + ret i16 %conv +} + +; int setcc_i32_i64(unsigned long x0) { return x0 != 0; } +define i32 @setcc_i32_i64(i64 %x) { +; 
CHECK-SD-LABEL: setcc_i32_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp x0, #0 +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: setcc_i32_i64: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: umin x0, x0, #1 +; CHECK-CSSC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-CSSC-NEXT: ret +; +entry: + %cmp = icmp ne i64 %x, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; unsigned long setcc_i64_i64(unsigned long x0) { return x0 != 0; } +define i64 @setcc_i64_i64(i64 %x) { +; CHECK-SD-LABEL: setcc_i64_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp x0, #0 +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: setcc_i64_i64: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: umin x0, x0, #1 +; CHECK-CSSC-NEXT: ret +; +entry: + %cmp = icmp ne i64 %x, 0 + %conv = zext i1 %cmp to i64 + ret i64 %conv +} + +define <2 x i1> @setcc_v2i1_v2i64(<2 x i64> %x) { +; CHECK-SD-LABEL: setcc_v2i1_v2i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmtst v0.2d, v0.2d, v0.2d +; CHECK-SD-NEXT: xtn v0.2s, v0.2d +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: setcc_v2i1_v2i64: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: cmtst v0.2d, v0.2d, v0.2d +; CHECK-CSSC-NEXT: xtn v0.2s, v0.2d +; CHECK-CSSC-NEXT: ret +; +entry: + %cmp = icmp ne <2 x i64> %x, zeroinitializer + ret <2 x i1> %cmp +} + +define <4 x i1> @setcc_v4i1_v4i32(<4 x i32> %x) { +; CHECK-SD-LABEL: setcc_v4i1_v4i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmtst v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: setcc_v4i1_v4i32: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: cmtst v0.4s, v0.4s, v0.4s +; CHECK-CSSC-NEXT: xtn v0.4h, v0.4s +; CHECK-CSSC-NEXT: ret +; +entry: + %cmp = icmp ne <4 x i32> %x, zeroinitializer + ret <4 x i1> %cmp +} + +define <8 x i1> @setcc_v8i1_v8i16(<8 x i16> %x) { +; CHECK-SD-LABEL: setcc_v8i1_v8i16: +; CHECK-SD: // %bb.0: // 
%entry +; CHECK-SD-NEXT: cmtst v0.8h, v0.8h, v0.8h +; CHECK-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: setcc_v8i1_v8i16: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: cmtst v0.8h, v0.8h, v0.8h +; CHECK-CSSC-NEXT: xtn v0.8b, v0.8h +; CHECK-CSSC-NEXT: ret +; +entry: + %cmp = icmp ne <8 x i16> %x, zeroinitializer + ret <8 x i1> %cmp +} + +define <16 x i1> @setcc_v16i1_v16i8(<16 x i8> %x) { +; CHECK-SD-LABEL: setcc_v16i1_v16i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmtst v0.16b, v0.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: setcc_v16i1_v16i8: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: cmtst v0.16b, v0.16b, v0.16b +; CHECK-CSSC-NEXT: ret +; +entry: + %cmp = icmp ne <16 x i8> %x, zeroinitializer + ret <16 x i1> %cmp +} + +define <2 x i8> @setcc_v2i8_v2i64(<2 x i64> %x) { +; CHECK-SD-LABEL: setcc_v2i8_v2i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmtst v0.2d, v0.2d, v0.2d +; CHECK-SD-NEXT: movi v1.2s, #1 +; CHECK-SD-NEXT: xtn v0.2s, v0.2d +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: setcc_v2i8_v2i64: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: cmtst v0.2d, v0.2d, v0.2d +; CHECK-CSSC-NEXT: movi v1.2s, #1 +; CHECK-CSSC-NEXT: xtn v0.2s, v0.2d +; CHECK-CSSC-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-CSSC-NEXT: ret +; +entry: + %cmp = icmp ne <2 x i64> %x, zeroinitializer + %conv = zext <2 x i1> %cmp to <2 x i8> + ret <2 x i8> %conv +} + +define <4 x i16> @setcc_v4i16_v4i32(<4 x i32> %x) { +; CHECK-SD-LABEL: setcc_v4i16_v4i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmtst v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: movi v1.4h, #1 +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: setcc_v4i16_v4i32: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: cmtst v0.4s, v0.4s, v0.4s +; CHECK-CSSC-NEXT: movi v1.4h, #1 +; CHECK-CSSC-NEXT: xtn v0.4h, v0.4s +; 
CHECK-CSSC-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-CSSC-NEXT: ret +; +entry: + %cmp = icmp ne <4 x i32> %x, zeroinitializer + %conv = zext <4 x i1> %cmp to <4 x i16> + ret <4 x i16> %conv +} + +define <4 x i32> @setcc_v4i32_v4i32(<4 x i32> %x) { +; CHECK-SD-LABEL: setcc_v4i32_v4i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v1.4s, #1 +; CHECK-SD-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: bic v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: setcc_v4i32_v4i32: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: movi v1.4s, #1 +; CHECK-CSSC-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-CSSC-NEXT: bic v0.16b, v1.16b, v0.16b +; CHECK-CSSC-NEXT: ret +; +entry: + %cmp = icmp ne <4 x i32> %x, zeroinitializer + %conv = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %conv +} + +; auto icmpi128(int128 x0) { return x0 != 0; } +define i1 @icmpi128(i128 noundef %0) { +; CHECK-SD-LABEL: icmpi128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: orr x8, x0, x1 +; CHECK-SD-NEXT: cmp x8, #0 +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-CSSC-LABEL: icmpi128: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: orr x8, x0, x1 +; CHECK-CSSC-NEXT: umin x0, x8, #1 +; CHECK-CSSC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-CSSC-NEXT: ret +; +entry: + %2 = icmp ne i128 %0, 0 + ret i1 %2 +} diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll index d06e42f5405ef..3d6cc814d157d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll @@ -414,8 +414,8 @@ define i1 @ctpop32_ne_one_nonzero(i32 %x) { ; CHECK-CSSC-LABEL: ctpop32_ne_one_nonzero: ; CHECK-CSSC: // %bb.0: // %entry ; CHECK-CSSC-NEXT: sub w8, w0, #1 -; CHECK-CSSC-NEXT: tst w0, w8 -; CHECK-CSSC-NEXT: cset w0, ne +; CHECK-CSSC-NEXT: and w8, w0, w8 +; CHECK-CSSC-NEXT: umin w0, w8, #1 ; CHECK-CSSC-NEXT: ret ; ; CHECK-BE-LABEL: ctpop32_ne_one_nonzero: From 
ee45ba2ff49db3cc5d31f2600a3e7fad81d70cb1 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 27 Nov 2025 19:18:00 +0000 Subject: [PATCH 02/13] [AArch64] Use SVE for fixed-length bf16 operations with +sve-b16b16 (#169329) This can avoid the promotion bf16 -> f32 -> bf16 round trip (or costly expansions). --- .../Target/AArch64/AArch64ISelLowering.cpp | 8 +- .../AArch64/fixed-length-bf16-arith.ll | 936 ++++++++++++++++++ .../CodeGen/AArch64/sve-indexed-arithmetic.ll | 103 +- 3 files changed, 960 insertions(+), 87 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 694f625c094d4..a1400f4cecdac 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1783,9 +1783,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); + } - if (Subtarget->hasSVEB16B16() && - Subtarget->isNonStreamingSVEorSME2Available()) { + if (Subtarget->hasSVEB16B16() && + Subtarget->isNonStreamingSVEorSME2Available()) { + // Note: Use SVE for bfloat16 operations when +sve-b16b16 is available. 
+ for (auto VT : {MVT::v4bf16, MVT::v8bf16, MVT::nxv2bf16, MVT::nxv4bf16, + MVT::nxv8bf16}) { setOperationAction(ISD::FADD, VT, Custom); setOperationAction(ISD::FMA, VT, Custom); setOperationAction(ISD::FMAXIMUM, VT, Custom); diff --git a/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll b/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll new file mode 100644 index 0000000000000..e6344b9eb89dc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll @@ -0,0 +1,936 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16 +; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=CHECK,B16B16 + +target triple = "aarch64-unknown-linux-gnu" + +; +; FABS +; + +define <4 x bfloat> @fabs_v4bf16(<4 x bfloat> %a) { +; CHECK-LABEL: fabs_v4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.4h, #128, lsl #8 +; CHECK-NEXT: ret + %res = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> %a) + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fabs_v8bf16(<8 x bfloat> %a) { +; CHECK-LABEL: fabs_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.8h, #128, lsl #8 +; CHECK-NEXT: ret + %res = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> %a) + ret <8 x bfloat> %res +} + +; +; FADD +; + +define <4 x bfloat> @fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; NOB16B16-LABEL: fadd_v4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v0.4s, v0.4h, #16 +; NOB16B16-NEXT: fadd v0.4s, v0.4s, v1.4s +; NOB16B16-NEXT: bfcvtn v0.4h, v0.4s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fadd_v4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl4 +; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0 +; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1 +; B16B16-NEXT: bfadd z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0 +; B16B16-NEXT: ret + %res = fadd <4 x bfloat> 
%a, %b + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { +; NOB16B16-LABEL: fadd_v8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: shll v2.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v3.4s, v0.4h, #16 +; NOB16B16-NEXT: shll2 v1.4s, v1.8h, #16 +; NOB16B16-NEXT: shll2 v0.4s, v0.8h, #16 +; NOB16B16-NEXT: fadd v2.4s, v3.4s, v2.4s +; NOB16B16-NEXT: fadd v1.4s, v0.4s, v1.4s +; NOB16B16-NEXT: bfcvtn v0.4h, v2.4s +; NOB16B16-NEXT: bfcvtn2 v0.8h, v1.4s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fadd_v8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl8 +; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0 +; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1 +; B16B16-NEXT: bfadd z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0 +; B16B16-NEXT: ret + %res = fadd <8 x bfloat> %a, %b + ret <8 x bfloat> %res +} + +; +; FDIV +; + +define <4 x bfloat> @fdiv_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; CHECK-LABEL: fdiv_v4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s +; CHECK-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-NEXT: ret + %res = fdiv <4 x bfloat> %a, %b + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fdiv_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: fdiv_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-NEXT: fdiv v2.4s, v3.4s, v2.4s +; CHECK-NEXT: fdiv v1.4s, v0.4s, v1.4s +; CHECK-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-NEXT: ret + %res = fdiv <8 x bfloat> %a, %b + ret <8 x bfloat> %res +} + +; +; FMAX +; + +define <4 x bfloat> @fmax_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; NOB16B16-LABEL: fmax_v4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOB16B16-NEXT: // kill: def $d0 killed $d0 
def $q0 +; NOB16B16-NEXT: mov h2, v1.h[1] +; NOB16B16-NEXT: mov h3, v0.h[1] +; NOB16B16-NEXT: mov h4, v1.h[2] +; NOB16B16-NEXT: shll v5.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v6.4s, v0.4h, #16 +; NOB16B16-NEXT: mov h7, v0.h[2] +; NOB16B16-NEXT: mov h1, v1.h[3] +; NOB16B16-NEXT: shll v2.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: shll v4.4s, v4.4h, #16 +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: fmax s2, s3, s2 +; NOB16B16-NEXT: fmax s3, s6, s5 +; NOB16B16-NEXT: shll v5.4s, v7.4h, #16 +; NOB16B16-NEXT: mov h6, v0.h[3] +; NOB16B16-NEXT: fmax s4, s5, s4 +; NOB16B16-NEXT: bfcvt h2, s2 +; NOB16B16-NEXT: bfcvt h0, s3 +; NOB16B16-NEXT: shll v3.4s, v6.4h, #16 +; NOB16B16-NEXT: mov v0.h[1], v2.h[0] +; NOB16B16-NEXT: bfcvt h2, s4 +; NOB16B16-NEXT: fmax s1, s3, s1 +; NOB16B16-NEXT: mov v0.h[2], v2.h[0] +; NOB16B16-NEXT: bfcvt h1, s1 +; NOB16B16-NEXT: mov v0.h[3], v1.h[0] +; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmax_v4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl4 +; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0 +; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1 +; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0 +; B16B16-NEXT: ret + %res = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fmax_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { +; NOB16B16-LABEL: fmax_v8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: mov h2, v1.h[1] +; NOB16B16-NEXT: mov h3, v0.h[1] +; NOB16B16-NEXT: shll v4.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v0.4h, #16 +; NOB16B16-NEXT: mov h6, v1.h[2] +; NOB16B16-NEXT: mov h7, v0.h[2] +; NOB16B16-NEXT: mov h16, v1.h[3] +; NOB16B16-NEXT: shll v2.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: fmax s4, s5, s4 +; NOB16B16-NEXT: mov h5, v0.h[3] +; NOB16B16-NEXT: shll v6.4s, v6.4h, #16 +; 
NOB16B16-NEXT: shll v7.4s, v7.4h, #16 +; NOB16B16-NEXT: fmax s3, s3, s2 +; NOB16B16-NEXT: bfcvt h2, s4 +; NOB16B16-NEXT: fmax s4, s7, s6 +; NOB16B16-NEXT: shll v6.4s, v16.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v5.4h, #16 +; NOB16B16-NEXT: mov h7, v1.h[4] +; NOB16B16-NEXT: mov h16, v0.h[4] +; NOB16B16-NEXT: bfcvt h3, s3 +; NOB16B16-NEXT: fmax s5, s5, s6 +; NOB16B16-NEXT: bfcvt h4, s4 +; NOB16B16-NEXT: mov h6, v0.h[5] +; NOB16B16-NEXT: shll v7.4s, v7.4h, #16 +; NOB16B16-NEXT: shll v16.4s, v16.4h, #16 +; NOB16B16-NEXT: mov v2.h[1], v3.h[0] +; NOB16B16-NEXT: mov h3, v1.h[5] +; NOB16B16-NEXT: bfcvt h5, s5 +; NOB16B16-NEXT: fmax s7, s16, s7 +; NOB16B16-NEXT: mov h16, v0.h[6] +; NOB16B16-NEXT: shll v6.4s, v6.4h, #16 +; NOB16B16-NEXT: mov h0, v0.h[7] +; NOB16B16-NEXT: mov v2.h[2], v4.h[0] +; NOB16B16-NEXT: mov h4, v1.h[6] +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: mov h1, v1.h[7] +; NOB16B16-NEXT: shll v0.4s, v0.4h, #16 +; NOB16B16-NEXT: fmax s3, s6, s3 +; NOB16B16-NEXT: shll v6.4s, v16.4h, #16 +; NOB16B16-NEXT: mov v2.h[3], v5.h[0] +; NOB16B16-NEXT: bfcvt h5, s7 +; NOB16B16-NEXT: shll v4.4s, v4.4h, #16 +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: fmax s4, s6, s4 +; NOB16B16-NEXT: bfcvt h3, s3 +; NOB16B16-NEXT: mov v2.h[4], v5.h[0] +; NOB16B16-NEXT: fmax s0, s0, s1 +; NOB16B16-NEXT: mov v2.h[5], v3.h[0] +; NOB16B16-NEXT: bfcvt h3, s4 +; NOB16B16-NEXT: bfcvt h0, s0 +; NOB16B16-NEXT: mov v2.h[6], v3.h[0] +; NOB16B16-NEXT: mov v2.h[7], v0.h[0] +; NOB16B16-NEXT: mov v0.16b, v2.16b +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmax_v8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl8 +; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0 +; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1 +; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0 +; B16B16-NEXT: ret + %res = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) + ret <8 x bfloat> %res +} + +; +; FMAXNM +; + +define <4 x 
bfloat> @fmaxnm_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; NOB16B16-LABEL: fmaxnm_v4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOB16B16-NEXT: mov h2, v1.h[1] +; NOB16B16-NEXT: mov h3, v0.h[1] +; NOB16B16-NEXT: mov h4, v1.h[2] +; NOB16B16-NEXT: shll v5.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v6.4s, v0.4h, #16 +; NOB16B16-NEXT: mov h7, v0.h[2] +; NOB16B16-NEXT: mov h1, v1.h[3] +; NOB16B16-NEXT: shll v2.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: shll v4.4s, v4.4h, #16 +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: fmaxnm s2, s3, s2 +; NOB16B16-NEXT: fmaxnm s3, s6, s5 +; NOB16B16-NEXT: shll v5.4s, v7.4h, #16 +; NOB16B16-NEXT: mov h6, v0.h[3] +; NOB16B16-NEXT: fmaxnm s4, s5, s4 +; NOB16B16-NEXT: bfcvt h2, s2 +; NOB16B16-NEXT: bfcvt h0, s3 +; NOB16B16-NEXT: shll v3.4s, v6.4h, #16 +; NOB16B16-NEXT: mov v0.h[1], v2.h[0] +; NOB16B16-NEXT: bfcvt h2, s4 +; NOB16B16-NEXT: fmaxnm s1, s3, s1 +; NOB16B16-NEXT: mov v0.h[2], v2.h[0] +; NOB16B16-NEXT: bfcvt h1, s1 +; NOB16B16-NEXT: mov v0.h[3], v1.h[0] +; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmaxnm_v4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl4 +; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0 +; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1 +; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0 +; B16B16-NEXT: ret + %res = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fmaxnm_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { +; NOB16B16-LABEL: fmaxnm_v8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: mov h2, v1.h[1] +; NOB16B16-NEXT: mov h3, v0.h[1] +; NOB16B16-NEXT: shll v4.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v0.4h, #16 +; NOB16B16-NEXT: mov h6, v1.h[2] +; NOB16B16-NEXT: mov h7, v0.h[2] +; 
NOB16B16-NEXT: mov h16, v1.h[3] +; NOB16B16-NEXT: shll v2.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: fmaxnm s4, s5, s4 +; NOB16B16-NEXT: mov h5, v0.h[3] +; NOB16B16-NEXT: shll v6.4s, v6.4h, #16 +; NOB16B16-NEXT: shll v7.4s, v7.4h, #16 +; NOB16B16-NEXT: fmaxnm s3, s3, s2 +; NOB16B16-NEXT: bfcvt h2, s4 +; NOB16B16-NEXT: fmaxnm s4, s7, s6 +; NOB16B16-NEXT: shll v6.4s, v16.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v5.4h, #16 +; NOB16B16-NEXT: mov h7, v1.h[4] +; NOB16B16-NEXT: mov h16, v0.h[4] +; NOB16B16-NEXT: bfcvt h3, s3 +; NOB16B16-NEXT: fmaxnm s5, s5, s6 +; NOB16B16-NEXT: bfcvt h4, s4 +; NOB16B16-NEXT: mov h6, v0.h[5] +; NOB16B16-NEXT: shll v7.4s, v7.4h, #16 +; NOB16B16-NEXT: shll v16.4s, v16.4h, #16 +; NOB16B16-NEXT: mov v2.h[1], v3.h[0] +; NOB16B16-NEXT: mov h3, v1.h[5] +; NOB16B16-NEXT: bfcvt h5, s5 +; NOB16B16-NEXT: fmaxnm s7, s16, s7 +; NOB16B16-NEXT: mov h16, v0.h[6] +; NOB16B16-NEXT: shll v6.4s, v6.4h, #16 +; NOB16B16-NEXT: mov h0, v0.h[7] +; NOB16B16-NEXT: mov v2.h[2], v4.h[0] +; NOB16B16-NEXT: mov h4, v1.h[6] +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: mov h1, v1.h[7] +; NOB16B16-NEXT: shll v0.4s, v0.4h, #16 +; NOB16B16-NEXT: fmaxnm s3, s6, s3 +; NOB16B16-NEXT: shll v6.4s, v16.4h, #16 +; NOB16B16-NEXT: mov v2.h[3], v5.h[0] +; NOB16B16-NEXT: bfcvt h5, s7 +; NOB16B16-NEXT: shll v4.4s, v4.4h, #16 +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: fmaxnm s4, s6, s4 +; NOB16B16-NEXT: bfcvt h3, s3 +; NOB16B16-NEXT: mov v2.h[4], v5.h[0] +; NOB16B16-NEXT: fmaxnm s0, s0, s1 +; NOB16B16-NEXT: mov v2.h[5], v3.h[0] +; NOB16B16-NEXT: bfcvt h3, s4 +; NOB16B16-NEXT: bfcvt h0, s0 +; NOB16B16-NEXT: mov v2.h[6], v3.h[0] +; NOB16B16-NEXT: mov v2.h[7], v0.h[0] +; NOB16B16-NEXT: mov v0.16b, v2.16b +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmaxnm_v8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl8 +; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0 +; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1 +; B16B16-NEXT: 
bfmaxnm z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0 +; B16B16-NEXT: ret + %res = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) + ret <8 x bfloat> %res +} + +; +; FMIN +; + +define <4 x bfloat> @fmin_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; NOB16B16-LABEL: fmin_v4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOB16B16-NEXT: mov h2, v1.h[1] +; NOB16B16-NEXT: mov h3, v0.h[1] +; NOB16B16-NEXT: mov h4, v1.h[2] +; NOB16B16-NEXT: shll v5.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v6.4s, v0.4h, #16 +; NOB16B16-NEXT: mov h7, v0.h[2] +; NOB16B16-NEXT: mov h1, v1.h[3] +; NOB16B16-NEXT: shll v2.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: shll v4.4s, v4.4h, #16 +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: fmin s2, s3, s2 +; NOB16B16-NEXT: fmin s3, s6, s5 +; NOB16B16-NEXT: shll v5.4s, v7.4h, #16 +; NOB16B16-NEXT: mov h6, v0.h[3] +; NOB16B16-NEXT: fmin s4, s5, s4 +; NOB16B16-NEXT: bfcvt h2, s2 +; NOB16B16-NEXT: bfcvt h0, s3 +; NOB16B16-NEXT: shll v3.4s, v6.4h, #16 +; NOB16B16-NEXT: mov v0.h[1], v2.h[0] +; NOB16B16-NEXT: bfcvt h2, s4 +; NOB16B16-NEXT: fmin s1, s3, s1 +; NOB16B16-NEXT: mov v0.h[2], v2.h[0] +; NOB16B16-NEXT: bfcvt h1, s1 +; NOB16B16-NEXT: mov v0.h[3], v1.h[0] +; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmin_v4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl4 +; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0 +; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1 +; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0 +; B16B16-NEXT: ret + %res = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fmin_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { +; NOB16B16-LABEL: fmin_v8bf16: +; NOB16B16: // %bb.0: 
+; NOB16B16-NEXT: mov h2, v1.h[1] +; NOB16B16-NEXT: mov h3, v0.h[1] +; NOB16B16-NEXT: shll v4.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v0.4h, #16 +; NOB16B16-NEXT: mov h6, v1.h[2] +; NOB16B16-NEXT: mov h7, v0.h[2] +; NOB16B16-NEXT: mov h16, v1.h[3] +; NOB16B16-NEXT: shll v2.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: fmin s4, s5, s4 +; NOB16B16-NEXT: mov h5, v0.h[3] +; NOB16B16-NEXT: shll v6.4s, v6.4h, #16 +; NOB16B16-NEXT: shll v7.4s, v7.4h, #16 +; NOB16B16-NEXT: fmin s3, s3, s2 +; NOB16B16-NEXT: bfcvt h2, s4 +; NOB16B16-NEXT: fmin s4, s7, s6 +; NOB16B16-NEXT: shll v6.4s, v16.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v5.4h, #16 +; NOB16B16-NEXT: mov h7, v1.h[4] +; NOB16B16-NEXT: mov h16, v0.h[4] +; NOB16B16-NEXT: bfcvt h3, s3 +; NOB16B16-NEXT: fmin s5, s5, s6 +; NOB16B16-NEXT: bfcvt h4, s4 +; NOB16B16-NEXT: mov h6, v0.h[5] +; NOB16B16-NEXT: shll v7.4s, v7.4h, #16 +; NOB16B16-NEXT: shll v16.4s, v16.4h, #16 +; NOB16B16-NEXT: mov v2.h[1], v3.h[0] +; NOB16B16-NEXT: mov h3, v1.h[5] +; NOB16B16-NEXT: bfcvt h5, s5 +; NOB16B16-NEXT: fmin s7, s16, s7 +; NOB16B16-NEXT: mov h16, v0.h[6] +; NOB16B16-NEXT: shll v6.4s, v6.4h, #16 +; NOB16B16-NEXT: mov h0, v0.h[7] +; NOB16B16-NEXT: mov v2.h[2], v4.h[0] +; NOB16B16-NEXT: mov h4, v1.h[6] +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: mov h1, v1.h[7] +; NOB16B16-NEXT: shll v0.4s, v0.4h, #16 +; NOB16B16-NEXT: fmin s3, s6, s3 +; NOB16B16-NEXT: shll v6.4s, v16.4h, #16 +; NOB16B16-NEXT: mov v2.h[3], v5.h[0] +; NOB16B16-NEXT: bfcvt h5, s7 +; NOB16B16-NEXT: shll v4.4s, v4.4h, #16 +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: fmin s4, s6, s4 +; NOB16B16-NEXT: bfcvt h3, s3 +; NOB16B16-NEXT: mov v2.h[4], v5.h[0] +; NOB16B16-NEXT: fmin s0, s0, s1 +; NOB16B16-NEXT: mov v2.h[5], v3.h[0] +; NOB16B16-NEXT: bfcvt h3, s4 +; NOB16B16-NEXT: bfcvt h0, s0 +; NOB16B16-NEXT: mov v2.h[6], v3.h[0] +; NOB16B16-NEXT: mov v2.h[7], v0.h[0] +; NOB16B16-NEXT: mov v0.16b, v2.16b +; NOB16B16-NEXT: ret +; +; 
B16B16-LABEL: fmin_v8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl8 +; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0 +; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1 +; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0 +; B16B16-NEXT: ret + %res = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) + ret <8 x bfloat> %res +} + +; +; FMINNM +; + +define <4 x bfloat> @fminnm_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; NOB16B16-LABEL: fminnm_v4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOB16B16-NEXT: mov h2, v1.h[1] +; NOB16B16-NEXT: mov h3, v0.h[1] +; NOB16B16-NEXT: mov h4, v1.h[2] +; NOB16B16-NEXT: shll v5.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v6.4s, v0.4h, #16 +; NOB16B16-NEXT: mov h7, v0.h[2] +; NOB16B16-NEXT: mov h1, v1.h[3] +; NOB16B16-NEXT: shll v2.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: shll v4.4s, v4.4h, #16 +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: fminnm s2, s3, s2 +; NOB16B16-NEXT: fminnm s3, s6, s5 +; NOB16B16-NEXT: shll v5.4s, v7.4h, #16 +; NOB16B16-NEXT: mov h6, v0.h[3] +; NOB16B16-NEXT: fminnm s4, s5, s4 +; NOB16B16-NEXT: bfcvt h2, s2 +; NOB16B16-NEXT: bfcvt h0, s3 +; NOB16B16-NEXT: shll v3.4s, v6.4h, #16 +; NOB16B16-NEXT: mov v0.h[1], v2.h[0] +; NOB16B16-NEXT: bfcvt h2, s4 +; NOB16B16-NEXT: fminnm s1, s3, s1 +; NOB16B16-NEXT: mov v0.h[2], v2.h[0] +; NOB16B16-NEXT: bfcvt h1, s1 +; NOB16B16-NEXT: mov v0.h[3], v1.h[0] +; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fminnm_v4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl4 +; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0 +; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1 +; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0 +; B16B16-NEXT: ret + %res = call <4 x 
bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fminnm_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { +; NOB16B16-LABEL: fminnm_v8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: mov h2, v1.h[1] +; NOB16B16-NEXT: mov h3, v0.h[1] +; NOB16B16-NEXT: shll v4.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v0.4h, #16 +; NOB16B16-NEXT: mov h6, v1.h[2] +; NOB16B16-NEXT: mov h7, v0.h[2] +; NOB16B16-NEXT: mov h16, v1.h[3] +; NOB16B16-NEXT: shll v2.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: fminnm s4, s5, s4 +; NOB16B16-NEXT: mov h5, v0.h[3] +; NOB16B16-NEXT: shll v6.4s, v6.4h, #16 +; NOB16B16-NEXT: shll v7.4s, v7.4h, #16 +; NOB16B16-NEXT: fminnm s3, s3, s2 +; NOB16B16-NEXT: bfcvt h2, s4 +; NOB16B16-NEXT: fminnm s4, s7, s6 +; NOB16B16-NEXT: shll v6.4s, v16.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v5.4h, #16 +; NOB16B16-NEXT: mov h7, v1.h[4] +; NOB16B16-NEXT: mov h16, v0.h[4] +; NOB16B16-NEXT: bfcvt h3, s3 +; NOB16B16-NEXT: fminnm s5, s5, s6 +; NOB16B16-NEXT: bfcvt h4, s4 +; NOB16B16-NEXT: mov h6, v0.h[5] +; NOB16B16-NEXT: shll v7.4s, v7.4h, #16 +; NOB16B16-NEXT: shll v16.4s, v16.4h, #16 +; NOB16B16-NEXT: mov v2.h[1], v3.h[0] +; NOB16B16-NEXT: mov h3, v1.h[5] +; NOB16B16-NEXT: bfcvt h5, s5 +; NOB16B16-NEXT: fminnm s7, s16, s7 +; NOB16B16-NEXT: mov h16, v0.h[6] +; NOB16B16-NEXT: shll v6.4s, v6.4h, #16 +; NOB16B16-NEXT: mov h0, v0.h[7] +; NOB16B16-NEXT: mov v2.h[2], v4.h[0] +; NOB16B16-NEXT: mov h4, v1.h[6] +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: mov h1, v1.h[7] +; NOB16B16-NEXT: shll v0.4s, v0.4h, #16 +; NOB16B16-NEXT: fminnm s3, s6, s3 +; NOB16B16-NEXT: shll v6.4s, v16.4h, #16 +; NOB16B16-NEXT: mov v2.h[3], v5.h[0] +; NOB16B16-NEXT: bfcvt h5, s7 +; NOB16B16-NEXT: shll v4.4s, v4.4h, #16 +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: fminnm s4, s6, s4 +; NOB16B16-NEXT: bfcvt h3, s3 +; NOB16B16-NEXT: mov v2.h[4], v5.h[0] +; NOB16B16-NEXT: fminnm s0, s0, s1 +; 
NOB16B16-NEXT: mov v2.h[5], v3.h[0] +; NOB16B16-NEXT: bfcvt h3, s4 +; NOB16B16-NEXT: bfcvt h0, s0 +; NOB16B16-NEXT: mov v2.h[6], v3.h[0] +; NOB16B16-NEXT: mov v2.h[7], v0.h[0] +; NOB16B16-NEXT: mov v0.16b, v2.16b +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fminnm_v8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl8 +; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0 +; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1 +; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0 +; B16B16-NEXT: ret + %res = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) + ret <8 x bfloat> %res +} + +; +; FMLA +; + +define <4 x bfloat> @fmla_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) { +; NOB16B16-LABEL: fmla_v4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: // kill: def $d2 killed $d2 def $q2 +; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOB16B16-NEXT: mov h3, v2.h[1] +; NOB16B16-NEXT: mov h4, v1.h[1] +; NOB16B16-NEXT: mov h5, v0.h[1] +; NOB16B16-NEXT: shll v6.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v7.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v16.4s, v0.4h, #16 +; NOB16B16-NEXT: mov h17, v2.h[2] +; NOB16B16-NEXT: mov h18, v1.h[2] +; NOB16B16-NEXT: mov h19, v0.h[2] +; NOB16B16-NEXT: mov h2, v2.h[3] +; NOB16B16-NEXT: mov h1, v1.h[3] +; NOB16B16-NEXT: fmadd s6, s16, s7, s6 +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: shll v4.4s, v4.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v5.4h, #16 +; NOB16B16-NEXT: mov h16, v0.h[3] +; NOB16B16-NEXT: shll v7.4s, v19.4h, #16 +; NOB16B16-NEXT: shll v2.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: fmadd s3, s5, s4, s3 +; NOB16B16-NEXT: shll v4.4s, v17.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v18.4h, #16 +; NOB16B16-NEXT: bfcvt h0, s6 +; NOB16B16-NEXT: fmadd s4, s7, s5, s4 +; NOB16B16-NEXT: shll v5.4s, v16.4h, #16 +; NOB16B16-NEXT: bfcvt h3, s3 +; NOB16B16-NEXT: fmadd s1, s5, 
s1, s2 +; NOB16B16-NEXT: mov v0.h[1], v3.h[0] +; NOB16B16-NEXT: bfcvt h3, s4 +; NOB16B16-NEXT: bfcvt h1, s1 +; NOB16B16-NEXT: mov v0.h[2], v3.h[0] +; NOB16B16-NEXT: mov v0.h[3], v1.h[0] +; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmla_v4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl4 +; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0 +; B16B16-NEXT: // kill: def $d2 killed $d2 def $z2 +; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1 +; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0 +; B16B16-NEXT: ret + %res = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fmla_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { +; NOB16B16-LABEL: fmla_v8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: mov h3, v2.h[1] +; NOB16B16-NEXT: mov h4, v1.h[1] +; NOB16B16-NEXT: mov h5, v0.h[1] +; NOB16B16-NEXT: shll v6.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v7.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v16.4s, v0.4h, #16 +; NOB16B16-NEXT: mov h17, v2.h[2] +; NOB16B16-NEXT: mov h18, v1.h[2] +; NOB16B16-NEXT: mov h19, v0.h[2] +; NOB16B16-NEXT: mov h20, v2.h[3] +; NOB16B16-NEXT: mov h21, v1.h[3] +; NOB16B16-NEXT: fmadd s6, s16, s7, s6 +; NOB16B16-NEXT: shll v3.4s, v3.4h, #16 +; NOB16B16-NEXT: shll v4.4s, v4.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v5.4h, #16 +; NOB16B16-NEXT: mov h7, v0.h[3] +; NOB16B16-NEXT: shll v16.4s, v19.4h, #16 +; NOB16B16-NEXT: mov h19, v0.h[4] +; NOB16B16-NEXT: fmadd s4, s5, s4, s3 +; NOB16B16-NEXT: shll v3.4s, v17.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v18.4h, #16 +; NOB16B16-NEXT: mov h17, v2.h[4] +; NOB16B16-NEXT: mov h18, v1.h[4] +; NOB16B16-NEXT: shll v7.4s, v7.4h, #16 +; NOB16B16-NEXT: shll v19.4s, v19.4h, #16 +; NOB16B16-NEXT: fmadd s5, s16, s5, s3 +; NOB16B16-NEXT: bfcvt h3, s6 +; NOB16B16-NEXT: shll v6.4s, v20.4h, #16 +; NOB16B16-NEXT: bfcvt h4, s4 +; 
NOB16B16-NEXT: shll v16.4s, v21.4h, #16 +; NOB16B16-NEXT: shll v17.4s, v17.4h, #16 +; NOB16B16-NEXT: shll v18.4s, v18.4h, #16 +; NOB16B16-NEXT: fmadd s6, s7, s16, s6 +; NOB16B16-NEXT: bfcvt h5, s5 +; NOB16B16-NEXT: mov h7, v1.h[5] +; NOB16B16-NEXT: mov v3.h[1], v4.h[0] +; NOB16B16-NEXT: mov h4, v2.h[5] +; NOB16B16-NEXT: mov h16, v0.h[5] +; NOB16B16-NEXT: fmadd s17, s19, s18, s17 +; NOB16B16-NEXT: mov h18, v2.h[6] +; NOB16B16-NEXT: mov h19, v1.h[6] +; NOB16B16-NEXT: mov h2, v2.h[7] +; NOB16B16-NEXT: mov h1, v1.h[7] +; NOB16B16-NEXT: bfcvt h6, s6 +; NOB16B16-NEXT: shll v7.4s, v7.4h, #16 +; NOB16B16-NEXT: mov v3.h[2], v5.h[0] +; NOB16B16-NEXT: mov h5, v0.h[6] +; NOB16B16-NEXT: shll v4.4s, v4.4h, #16 +; NOB16B16-NEXT: shll v16.4s, v16.4h, #16 +; NOB16B16-NEXT: mov h0, v0.h[7] +; NOB16B16-NEXT: shll v2.4s, v2.4h, #16 +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: fmadd s4, s16, s7, s4 +; NOB16B16-NEXT: mov v3.h[3], v6.h[0] +; NOB16B16-NEXT: bfcvt h6, s17 +; NOB16B16-NEXT: shll v7.4s, v18.4h, #16 +; NOB16B16-NEXT: shll v16.4s, v19.4h, #16 +; NOB16B16-NEXT: shll v5.4s, v5.4h, #16 +; NOB16B16-NEXT: shll v0.4s, v0.4h, #16 +; NOB16B16-NEXT: fmadd s5, s5, s16, s7 +; NOB16B16-NEXT: mov v3.h[4], v6.h[0] +; NOB16B16-NEXT: bfcvt h4, s4 +; NOB16B16-NEXT: fmadd s0, s0, s1, s2 +; NOB16B16-NEXT: mov v3.h[5], v4.h[0] +; NOB16B16-NEXT: bfcvt h4, s5 +; NOB16B16-NEXT: bfcvt h0, s0 +; NOB16B16-NEXT: mov v3.h[6], v4.h[0] +; NOB16B16-NEXT: mov v3.h[7], v0.h[0] +; NOB16B16-NEXT: mov v0.16b, v3.16b +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmla_v8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl8 +; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0 +; B16B16-NEXT: // kill: def $q2 killed $q2 def $z2 +; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1 +; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0 +; B16B16-NEXT: ret + %res = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) + ret <8 x 
bfloat> %res +} + +; +; FMUL +; + +define <4 x bfloat> @fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; NOB16B16-LABEL: fmul_v4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v0.4s, v0.4h, #16 +; NOB16B16-NEXT: fmul v0.4s, v0.4s, v1.4s +; NOB16B16-NEXT: bfcvtn v0.4h, v0.4s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmul_v4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl4 +; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0 +; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1 +; B16B16-NEXT: bfmul z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0 +; B16B16-NEXT: ret + %res = fmul <4 x bfloat> %a, %b + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { +; NOB16B16-LABEL: fmul_v8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: shll v2.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v3.4s, v0.4h, #16 +; NOB16B16-NEXT: shll2 v1.4s, v1.8h, #16 +; NOB16B16-NEXT: shll2 v0.4s, v0.8h, #16 +; NOB16B16-NEXT: fmul v2.4s, v3.4s, v2.4s +; NOB16B16-NEXT: fmul v1.4s, v0.4s, v1.4s +; NOB16B16-NEXT: bfcvtn v0.4h, v2.4s +; NOB16B16-NEXT: bfcvtn2 v0.8h, v1.4s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmul_v8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl8 +; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0 +; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1 +; B16B16-NEXT: bfmul z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0 +; B16B16-NEXT: ret + %res = fmul <8 x bfloat> %a, %b + ret <8 x bfloat> %res +} + +; +; FNEG +; + +define <4 x bfloat> @fneg_v4bf16(<4 x bfloat> %a) { +; CHECK-LABEL: fneg_v4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.4h, #128, lsl #8 +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %res = fneg <4 x bfloat> %a + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fneg_v8bf16(<8 x bfloat> %a) { +; CHECK-LABEL: fneg_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.8h, #128, lsl #8 +; CHECK-NEXT: 
eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %res = fneg <8 x bfloat> %a + ret <8 x bfloat> %res +} + +; +; FSQRT +; + +define <4 x bfloat> @fsqrt_v4bf16(<4 x bfloat> %a) { +; CHECK-LABEL: fsqrt_v4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h1, v0.h[1] +; CHECK-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-NEXT: mov h3, v0.h[2] +; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: fsqrt s2, s2 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fsqrt s1, s1 +; CHECK-NEXT: bfcvt h1, s1 +; CHECK-NEXT: fsqrt s3, s3 +; CHECK-NEXT: fsqrt s4, s0 +; CHECK-NEXT: bfcvt h0, s2 +; CHECK-NEXT: mov v0.h[1], v1.h[0] +; CHECK-NEXT: bfcvt h1, s3 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: bfcvt h1, s4 +; CHECK-NEXT: mov v0.h[3], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %res = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> %a) + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fsqrt_v8bf16(<8 x bfloat> %a) { +; CHECK-LABEL: fsqrt_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov h1, v0.h[1] +; CHECK-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-NEXT: mov h3, v0.h[2] +; CHECK-NEXT: mov h4, v0.h[3] +; CHECK-NEXT: mov h5, v0.h[4] +; CHECK-NEXT: mov h6, v0.h[5] +; CHECK-NEXT: mov h7, v0.h[6] +; CHECK-NEXT: mov h0, v0.h[7] +; CHECK-NEXT: fsqrt s2, s2 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-NEXT: shll v16.4s, v0.4h, #16 +; CHECK-NEXT: bfcvt h0, s2 +; CHECK-NEXT: fsqrt s1, s1 +; CHECK-NEXT: bfcvt h1, s1 +; CHECK-NEXT: mov v0.h[1], v1.h[0] +; CHECK-NEXT: fsqrt s3, s3 +; CHECK-NEXT: bfcvt h1, s3 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: fsqrt s4, s4 +; CHECK-NEXT: bfcvt h1, s4 +; CHECK-NEXT: mov v0.h[3], v1.h[0] +; CHECK-NEXT: fsqrt 
s5, s5 +; CHECK-NEXT: bfcvt h1, s5 +; CHECK-NEXT: mov v0.h[4], v1.h[0] +; CHECK-NEXT: fsqrt s6, s6 +; CHECK-NEXT: bfcvt h1, s6 +; CHECK-NEXT: mov v0.h[5], v1.h[0] +; CHECK-NEXT: fsqrt s7, s7 +; CHECK-NEXT: bfcvt h1, s7 +; CHECK-NEXT: mov v0.h[6], v1.h[0] +; CHECK-NEXT: fsqrt s2, s16 +; CHECK-NEXT: bfcvt h1, s2 +; CHECK-NEXT: mov v0.h[7], v1.h[0] +; CHECK-NEXT: ret + %res = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> %a) + ret <8 x bfloat> %res +} + +; +; FSUB +; + +define <4 x bfloat> @fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; NOB16B16-LABEL: fsub_v4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: shll v1.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v0.4s, v0.4h, #16 +; NOB16B16-NEXT: fsub v0.4s, v0.4s, v1.4s +; NOB16B16-NEXT: bfcvtn v0.4h, v0.4s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fsub_v4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl4 +; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0 +; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1 +; B16B16-NEXT: bfsub z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0 +; B16B16-NEXT: ret + %res = fsub <4 x bfloat> %a, %b + ret <4 x bfloat> %res +} + +define <8 x bfloat> @fsub_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { +; NOB16B16-LABEL: fsub_v8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: shll v2.4s, v1.4h, #16 +; NOB16B16-NEXT: shll v3.4s, v0.4h, #16 +; NOB16B16-NEXT: shll2 v1.4s, v1.8h, #16 +; NOB16B16-NEXT: shll2 v0.4s, v0.8h, #16 +; NOB16B16-NEXT: fsub v2.4s, v3.4s, v2.4s +; NOB16B16-NEXT: fsub v1.4s, v0.4s, v1.4s +; NOB16B16-NEXT: bfcvtn v0.4h, v2.4s +; NOB16B16-NEXT: bfcvtn2 v0.8h, v1.4s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fsub_v8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h, vl8 +; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0 +; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1 +; B16B16-NEXT: bfsub z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0 +; B16B16-NEXT: ret + %res = fsub <8 x bfloat> %a, %b + ret <8 x bfloat> 
%res +} diff --git a/llvm/test/CodeGen/AArch64/sve-indexed-arithmetic.ll b/llvm/test/CodeGen/AArch64/sve-indexed-arithmetic.ll index bcf5063bdda04..eafa44a35d024 100644 --- a/llvm/test/CodeGen/AArch64/sve-indexed-arithmetic.ll +++ b/llvm/test/CodeGen/AArch64/sve-indexed-arithmetic.ll @@ -22,26 +22,13 @@ define void @fmul_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fmul_indexed_bf16_256b: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q2, q3, [x1] ; CHECK-NEXT: dup v0.8h, v0.h[2] ; CHECK-NEXT: dup v1.8h, v1.h[2] -; CHECK-NEXT: shll v4.4s, v2.4h, #16 -; CHECK-NEXT: shll v6.4s, v3.4h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 -; CHECK-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-NEXT: shll v7.4s, v1.4h, #16 -; CHECK-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-NEXT: fmul v4.4s, v4.4s, v5.4s -; CHECK-NEXT: fmul v5.4s, v6.4s, v7.4s -; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s -; CHECK-NEXT: bfcvtn v2.4h, v4.4s -; CHECK-NEXT: bfcvtn v3.4h, v5.4s -; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s -; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: bfmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %ld.a = load <16 x bfloat>, ptr %a %ld.b = load <16 x bfloat>, ptr %b @@ -124,43 +111,16 @@ define void @fmla_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fmla_indexed_bf16_256b: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q4, q5, [x2] ; CHECK-NEXT: dup v0.8h, v0.h[2] ; CHECK-NEXT: dup v1.8h, v1.h[2] -; CHECK-NEXT: shll v4.4s, v2.4h, #16 -; CHECK-NEXT: shll v6.4s, v3.4h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 -; CHECK-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-NEXT: shll v7.4s, v1.4h, 
#16 -; CHECK-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-NEXT: fmul v4.4s, v4.4s, v5.4s -; CHECK-NEXT: fmul v5.4s, v6.4s, v7.4s -; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s -; CHECK-NEXT: bfcvtn v2.4h, v4.4s -; CHECK-NEXT: bfcvtn v3.4h, v5.4s -; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s -; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s -; CHECK-NEXT: ldp q0, q1, [x2] -; CHECK-NEXT: shll v4.4s, v0.4h, #16 -; CHECK-NEXT: shll v5.4s, v2.4h, #16 -; CHECK-NEXT: shll v6.4s, v1.4h, #16 -; CHECK-NEXT: shll v7.4s, v3.4h, #16 -; CHECK-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 -; CHECK-NEXT: fadd v4.4s, v5.4s, v4.4s -; CHECK-NEXT: fadd v5.4s, v7.4s, v6.4s -; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s -; CHECK-NEXT: fadd v1.4s, v3.4s, v1.4s -; CHECK-NEXT: bfcvtn v2.4h, v4.4s -; CHECK-NEXT: bfcvtn v3.4h, v5.4s -; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s -; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: bfmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z4.h +; CHECK-NEXT: bfadd z1.h, p0/m, z1.h, z5.h +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %ld.a = load <16 x bfloat>, ptr %a %ld.b = load <16 x bfloat>, ptr %b @@ -251,43 +211,16 @@ define void @fmls_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fmls_indexed_bf16_256b: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q4, q5, [x2] ; CHECK-NEXT: dup v0.8h, v0.h[2] ; CHECK-NEXT: dup v1.8h, v1.h[2] -; CHECK-NEXT: shll v4.4s, v2.4h, #16 -; CHECK-NEXT: shll v6.4s, v3.4h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 -; CHECK-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-NEXT: shll v7.4s, v1.4h, #16 -; CHECK-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v1.4s, v1.8h, #16 
-; CHECK-NEXT: fmul v4.4s, v4.4s, v5.4s -; CHECK-NEXT: fmul v5.4s, v6.4s, v7.4s -; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s -; CHECK-NEXT: bfcvtn v2.4h, v4.4s -; CHECK-NEXT: bfcvtn v3.4h, v5.4s -; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s -; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s -; CHECK-NEXT: ldp q0, q1, [x2] -; CHECK-NEXT: shll v4.4s, v0.4h, #16 -; CHECK-NEXT: shll v5.4s, v2.4h, #16 -; CHECK-NEXT: shll v6.4s, v1.4h, #16 -; CHECK-NEXT: shll v7.4s, v3.4h, #16 -; CHECK-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 -; CHECK-NEXT: fsub v4.4s, v4.4s, v5.4s -; CHECK-NEXT: fsub v5.4s, v6.4s, v7.4s -; CHECK-NEXT: fsub v0.4s, v0.4s, v2.4s -; CHECK-NEXT: fsub v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bfcvtn v2.4h, v4.4s -; CHECK-NEXT: bfcvtn v3.4h, v5.4s -; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s -; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: bfmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z4.h +; CHECK-NEXT: bfsub z1.h, p0/m, z1.h, z5.h +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %ld.a = load <16 x bfloat>, ptr %a %ld.b = load <16 x bfloat>, ptr %b From 965c3d760de35753d97d41c6b69582cecff8819d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lukas=20D=C3=B6llerer?= Date: Thu, 27 Nov 2025 20:21:09 +0100 Subject: [PATCH 03/13] [lld][WebAssembly] Fix SEGFAULT when importing wrapped symbol (#169656) When wrapping a symbol `foo` via `-wrap=foo`, we create the symbol `__wrap_foo` that replaces all mentions of `foo`. This feature was implemented for wasm-ld in commit a5ca34e. So far, no valid signature has been attached to the undefined symbol, leading to a nullptr dereference in the logic for creating the import section. This change adds the correct signature to the wrapped symbol, enabling the generation of an import for it. 
--- lld/test/wasm/wrap_import.s | 32 ++++++++++++++++++++++++++++++++ lld/wasm/Driver.cpp | 8 +++++--- lld/wasm/SyntheticSections.cpp | 4 +++- 3 files changed, 40 insertions(+), 4 deletions(-) create mode 100644 lld/test/wasm/wrap_import.s diff --git a/lld/test/wasm/wrap_import.s b/lld/test/wasm/wrap_import.s new file mode 100644 index 0000000000000..ce3b6f57f10c4 --- /dev/null +++ b/lld/test/wasm/wrap_import.s @@ -0,0 +1,32 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o +# RUN: wasm-ld -wrap nosuchsym -wrap foo -allow-undefined -o %t.wasm %t.o +# RUN: obj2yaml %t.wasm | FileCheck %s + +.globl foo +.globl _start + +foo: + .functype foo () -> () + end_function + +_start: + .functype _start () -> () + call foo + end_function + +# CHECK: - Type: IMPORT +# CHECK-NEXT: Imports: +# CHECK-NEXT: - Module: env +# CHECK-NEXT: Field: __wrap_foo +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT SigIndex: 0 + +# CHECK: - Type: CODE +# CHECK-NEXT: Functions: +# CHECK-NEXT: Index: 1 + +# CHECK: FunctionNames: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Name: __wrap_foo +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: Name: _start diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index fac166587cb9b..97e50783985a8 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -1173,9 +1173,10 @@ struct WrappedSymbol { Symbol *wrap; }; -static Symbol *addUndefined(StringRef name) { +static Symbol *addUndefined(StringRef name, + const WasmSignature *signature = nullptr) { return symtab->addUndefinedFunction(name, std::nullopt, std::nullopt, - WASM_SYMBOL_UNDEFINED, nullptr, nullptr, + WASM_SYMBOL_UNDEFINED, nullptr, signature, false); } @@ -1198,7 +1199,8 @@ static std::vector addWrappedSymbols(opt::InputArgList &args) { continue; Symbol *real = addUndefined(saver().save("__real_" + name)); - Symbol *wrap = addUndefined(saver().save("__wrap_" + name)); + Symbol *wrap = + addUndefined(saver().save("__wrap_" + name), sym->getSignature()); v.push_back({sym, real, wrap}); 
// We want to tell LTO not to inline symbols to be overwritten diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp index 399a5084e6595..5e7b9c229f3ed 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -196,7 +196,9 @@ void ImportSection::addImport(Symbol *sym) { StringRef module = sym->importModule.value_or(defaultModule); StringRef name = sym->importName.value_or(sym->getName()); if (auto *f = dyn_cast(sym)) { - ImportKey key(*(f->getSignature()), module, name); + const WasmSignature *sig = f->getSignature(); + assert(sig && "imported functions must have a signature"); + ImportKey key(*sig, module, name); auto entry = importedFunctions.try_emplace(key, numImportedFunctions); if (entry.second) { importedSymbols.emplace_back(sym); From a1f30c24ea2cf7d3acdd0f6eed19f737ae26b0d2 Mon Sep 17 00:00:00 2001 From: "Deric C." Date: Thu, 27 Nov 2025 12:02:15 -0800 Subject: [PATCH 04/13] [NFC] [DirectX] Update DirectX codegen test `CBufferAccess/gep-ce-two-uses.ll` due to changes to ReplaceConstant (#169848) Fixes an LLVM DirectX codegen test after it broke due to #169141 The CBuffer loads and GEPs are no longer duplicated when there are two or more accesses within the same basic block. This PR removes the duplicate check for CBuffer load and GEP from the original test function `@f` and adds a new test function `@g` which places duplicate CBuffer loads into separate basic blocks. 
--- .../DirectX/CBufferAccess/gep-ce-two-uses.ll | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll index 4eda6353f47ed..97896e598de90 100644 --- a/llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll +++ b/llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll @@ -21,6 +21,22 @@ entry: %a1 = load float, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a1, i32 16), align 4 store float %a1, ptr %dst, align 32 + %a2 = load float, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a1, i32 16), align 4 + store float %a2, ptr %dst, align 32 + + ret void +} + +; CHECK: define void @g +define void @g(ptr %dst) { +entry: + ; CHECK: [[PTR:%.*]] = call ptr addrspace(2) @llvm.dx.resource.getpointer.{{.*}}(target("dx.CBuffer", %__cblayout_CB) {{%.*}}, i32 0) + ; CHECK: getelementptr inbounds nuw i8, ptr addrspace(2) [[PTR]], i32 16 + %a1 = load float, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a1, i32 16), align 4 + store float %a1, ptr %dst, align 32 + br label %next + +next: ; CHECK: [[PTR:%.*]] = call ptr addrspace(2) @llvm.dx.resource.getpointer.{{.*}}(target("dx.CBuffer", %__cblayout_CB) {{%.*}}, i32 0) ; CHECK: getelementptr inbounds nuw i8, ptr addrspace(2) [[PTR]], i32 16 %a2 = load float, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a1, i32 16), align 4 From d39f5243f8df23392e1c493f7d607cd0074222b9 Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Thu, 27 Nov 2025 15:22:00 -0500 Subject: [PATCH 05/13] [PowerPC] Implement paddis (#161572) --- .../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 4 ++ .../PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 9 ++++ .../PowerPC/MCTargetDesc/PPCFixupKinds.h | 6 +++ .../PowerPC/MCTargetDesc/PPCInstPrinter.cpp | 11 +++++ .../PowerPC/MCTargetDesc/PPCInstPrinter.h | 2 + .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 1 + 
llvm/lib/Target/PowerPC/PPCInstrFuture.td | 44 +++++++++++++++++++ llvm/lib/Target/PowerPC/PPCRegisterInfo.td | 19 ++++++++ .../PowerPC/ppc-encoding-ISAFuture.txt | 6 +++ .../PowerPC/ppc64le-encoding-ISAFuture.txt | 6 +++ llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s | 12 +++++ llvm/test/MC/PowerPC/ppc64-errors.s | 5 +++ 12 files changed, 125 insertions(+) diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 561a9c51b9cc2..b07f95018ca90 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -365,6 +365,10 @@ struct PPCOperand : public MCParsedAsmOperand { bool isS16ImmX4() const { return isExtImm<16>(/*Signed*/ true, 4); } bool isS16ImmX16() const { return isExtImm<16>(/*Signed*/ true, 16); } bool isS17Imm() const { return isExtImm<17>(/*Signed*/ true, 1); } + bool isS32Imm() const { + // TODO: Is ContextImmediate needed? + return Kind == Expression || isSImm<32>(); + } bool isS34Imm() const { // Once the PC-Rel ABI is finalized, evaluate whether a 34-bit // ContextImmediate is needed. 
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 04b886ae74993..558351b515a2e 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -47,6 +47,9 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { case PPC::fixup_ppc_half16ds: case PPC::fixup_ppc_half16dq: return Value & 0xfffc; + case PPC::fixup_ppc_pcrel32: + case PPC::fixup_ppc_imm32: + return Value & 0xffffffff; case PPC::fixup_ppc_pcrel34: case PPC::fixup_ppc_imm34: return Value & 0x3ffffffff; @@ -71,6 +74,8 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case PPC::fixup_ppc_br24abs: case PPC::fixup_ppc_br24_notoc: return 4; + case PPC::fixup_ppc_pcrel32: + case PPC::fixup_ppc_imm32: case PPC::fixup_ppc_pcrel34: case PPC::fixup_ppc_imm34: case FK_Data_8: @@ -154,6 +159,8 @@ MCFixupKindInfo PPCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"fixup_ppc_brcond14abs", 16, 14, 0}, {"fixup_ppc_half16", 0, 16, 0}, {"fixup_ppc_half16ds", 0, 14, 0}, + {"fixup_ppc_pcrel32", 0, 32, 0}, + {"fixup_ppc_imm32", 0, 32, 0}, {"fixup_ppc_pcrel34", 0, 34, 0}, {"fixup_ppc_imm34", 0, 34, 0}, {"fixup_ppc_nofixup", 0, 0, 0}}; @@ -166,6 +173,8 @@ MCFixupKindInfo PPCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"fixup_ppc_brcond14abs", 2, 14, 0}, {"fixup_ppc_half16", 0, 16, 0}, {"fixup_ppc_half16ds", 2, 14, 0}, + {"fixup_ppc_pcrel32", 0, 32, 0}, + {"fixup_ppc_imm32", 0, 32, 0}, {"fixup_ppc_pcrel34", 0, 34, 0}, {"fixup_ppc_imm34", 0, 34, 0}, {"fixup_ppc_nofixup", 0, 0, 0}}; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index df0c666f5b113..4164b697649cd 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -40,6 +40,12 @@ enum Fixups { /// instrs like 'std'. 
fixup_ppc_half16ds, + // A 32-bit fixup corresponding to PC-relative paddis. + fixup_ppc_pcrel32, + + // A 32-bit fixup corresponding to Non-PC-relative paddis. + fixup_ppc_imm32, + // A 34-bit fixup corresponding to PC-relative paddi. fixup_ppc_pcrel34, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index a2f981e861511..46d6093be3c17 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -430,6 +430,17 @@ void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo, printOperand(MI, OpNo, STI, O); } +void PPCInstPrinter::printS32ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).isImm()) { + long long Value = MI->getOperand(OpNo).getImm(); + assert(isInt<32>(Value) && "Invalid s32imm argument!"); + O << (long long)Value; + } else + printOperand(MI, OpNo, STI, O); +} + void PPCInstPrinter::printS34ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h index 01ff6255f2a03..2fbd06c5a96cf 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h @@ -80,6 +80,8 @@ class PPCInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printS16ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printS32ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printS34ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printU16ImmOperand(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp 
b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 81d8e94b660d7..b28304b07e1a3 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -46,6 +46,7 @@ static void addFixup(SmallVectorImpl &Fixups, uint32_t Offset, case PPC::fixup_ppc_br24_notoc: case PPC::fixup_ppc_brcond14: case PPC::fixup_ppc_pcrel34: + case PPC::fixup_ppc_pcrel32: PCRel = true; } Fixups.push_back(MCFixup::create(Offset, Value, Kind, PCRel)); diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index e417ffe6d3677..39e6f4f139c11 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -312,9 +312,41 @@ class 8RR_XX4Form_XTABC6_P opcode, dag OOL, dag IOL, string asmstr, let Inst{63} = XT{5}; } +class MLS_DForm_R_SI32_RTA5 opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<5> RT; + bits<5> RA; + bits<32> SI; + + let Pattern = pattern; + + // The prefix. + let Inst{6...7} = 2; + let Inst{8} = 0; + let Inst{11} = PCRel; + let Inst{16...31} = SI{31...16}; + + // The instruction. 
+ let Inst{38...42} = RT; + let Inst{43...47} = RA; + let Inst{48...63} = SI{15...0}; +} + +multiclass MLS_DForm_R_SI32_RTA5_p opcode, dag OOL, dag IOL, + dag PCRel_IOL, string asmstr, + InstrItinClass itin> { + def NAME : MLS_DForm_R_SI32_RTA5; + def pc : MLS_DForm_R_SI32_RTA5, + isPCRel; +} + //-------------------------- Instruction definitions -------------------------// // Predicate combinations available: // [IsISAFuture] +// [IsISAFuture, PrefixInstrs] // [HasVSX, IsISAFuture] // [HasVSX, PrefixInstrs, IsISAFuture] @@ -346,6 +378,18 @@ let Predicates = [IsISAFuture] in { } } +let Predicates = [IsISAFuture, PrefixInstrs] in { + defm PADDIS : MLS_DForm_R_SI32_RTA5_p<15, (outs gprc:$RT), + (ins gprc_nor0:$RA, s32imm:$SI), + (ins immZero:$RA, s32imm_pcrel:$SI), + "paddis $RT, $RA, $SI", IIC_LdStLFD>; + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + defm PADDIS8 : MLS_DForm_R_SI32_RTA5_p<15, (outs g8rc:$RT), + (ins g8rc_nox0:$RA, s32imm:$SI), + (ins immZero:$RA, s32imm_pcrel:$SI), + "paddis $RT, $RA, $SI", IIC_LdStLFD>; +} + let Predicates = [HasVSX, IsISAFuture] in { let mayLoad = 1 in { def LXVRL : XX1Form_memOp<31, 525, (outs vsrc:$XT), diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index d6b13680a057e..e23914a050359 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -808,6 +808,25 @@ def s17imm64 : Operand { let DecoderMethod = "decodeSImmOperand<16>"; let OperandType = "OPERAND_IMMEDIATE"; } +def PPCS32ImmAsmOperand : AsmOperandClass { + let Name = "S32Imm"; + let PredicateMethod = "isS32Imm"; + let RenderMethod = "addImmOperands"; +} +def s32imm : Operand { + let PrintMethod = "printS32ImmOperand"; + let EncoderMethod = "getImmEncoding"; + let ParserMatchClass = PPCS32ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<32>"; + let OperandType = "OPERAND_IMMEDIATE"; +} +def s32imm_pcrel : Operand { + let PrintMethod = "printS32ImmOperand"; + 
let EncoderMethod = "getImmEncoding"; + let ParserMatchClass = PPCS32ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<32>"; + let OperandType = "OPERAND_IMMEDIATE"; +} def PPCS34ImmAsmOperand : AsmOperandClass { let Name = "S34Imm"; let PredicateMethod = "isS34Imm"; diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index b27a50d93f5b9..1024c6b546c4a 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -298,6 +298,12 @@ #CHECK: mtlpl 3, 4 0x7c,0x80,0x1a,0x26 +#CHECK: paddis 10, 12, 1000000000, 0 +0x06,0x00,0x3b,0x9a,0x3d,0x4c,0xca,0x00 + +#CHECK: paddis 10, 0, 1000000000, 1 +0x06,0x10,0x3b,0x9a,0x3d,0x40,0xca,0x00 + #CHECK: xxmulmul 8, 3, 4, 2 0xed,0x03,0x22,0x08 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index 72662d9736740..bda8d1e69442f 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -292,6 +292,12 @@ #CHECK: mtlpl 3, 4 0x26,0x1a,0x80,0x7c +#CHECK: paddis 10, 12, 1000000000, 0 +0x9a,0x3b,0x00,0x06,0x00,0xca,0x4c,0x3d + +#CHECK: paddis 10, 0, 1000000000, 1 +0x9a,0x3b,0x10,0x06,0x00,0xca,0x40,0x3d + #CHECK: xxmulmul 8, 3, 4, 2 0x08,0x22,0x03,0xed diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index ab72649fc3404..eb616a15500f1 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -419,6 +419,18 @@ #CHECK-BE: mtlpl 3, 4 # encoding: [0x7c,0x80,0x1a,0x26] #CHECK-LE: mtlpl 3, 4 # encoding: [0x26,0x1a,0x80,0x7c] + paddis 10, 12, 1000000000, 0 +#CHECK-BE: paddis 10, 12, 1000000000, 0 # encoding: [0x06,0x00,0x3b,0x9a, +#CHECK-BE-SAME: 0x3d,0x4c,0xca,0x00] +#CHECK-LE: 
paddis 10, 12, 1000000000, 0 # encoding: [0x9a,0x3b,0x00,0x06, +#CHECK-LE-SAME: 0x00,0xca,0x4c,0x3d] + + paddis 10, 0, 1000000000, 1 +#CHECK-BE: paddis 10, 0, 1000000000, 1 # encoding: [0x06,0x10,0x3b,0x9a, +#CHECK-BE-SAME: 0x3d,0x40,0xca,0x00] +#CHECK-LE: paddis 10, 0, 1000000000, 1 # encoding: [0x9a,0x3b,0x10,0x06, +#CHECK-LE-SAME: 0x00,0xca,0x40,0x3d] + xxmulmul 8, 3, 4, 2 #CHECK-BE: xxmulmul 8, 3, 4, 2 # encoding: [0xed,0x03,0x22,0x08] #CHECK-LE: xxmulmul 8, 3, 4, 2 # encoding: [0x08,0x22,0x03,0xed] diff --git a/llvm/test/MC/PowerPC/ppc64-errors.s b/llvm/test/MC/PowerPC/ppc64-errors.s index 8598174300e42..4d4da58f650fe 100644 --- a/llvm/test/MC/PowerPC/ppc64-errors.s +++ b/llvm/test/MC/PowerPC/ppc64-errors.s @@ -4,6 +4,11 @@ # RUN: not llvm-mc -triple powerpc64le-unknown-unknown < %s 2> %t # RUN: FileCheck < %t %s +# From ISAFuture + +# CHECK: error: invalid operand for instruction +paddis 10, 5, 1000000000, 1 + # From ISA31 # CHECK: error: invalid operand for instruction From ad605bdad7bb36bc74b9fa5f8b3786081dac4ec6 Mon Sep 17 00:00:00 2001 From: Alexey Moksyakov Date: Thu, 27 Nov 2025 23:48:10 +0300 Subject: [PATCH 06/13] [bolt][aarch64] Change indirect call instrumentation snippet Indirect call instrumentation snippet uses x16 register in exit handler to go to destination target __bolt_instr_ind_call_handler_func: msr nzcv, x1 ldp x0, x1, [sp], llvm#16 ldr x16, [sp], llvm#16 ldp x0, x1, [sp], llvm#16 br x16 <----- This patch adds the instrumentation snippet by calling instrumentation runtime library through indirect call instruction and adding the wrapper to store/load target value and the register for original indirect instruction. Example: mov x16, foo indirectCall: adrp x8, Label add x8, x8, #:lo12:Label blr x8 Before: Instrumented indirect call: stp x0, x1, [sp, #-16]! mov x0, x8 movk x1, #0x0, lsl llvm#48 movk x1, #0x0, lsl llvm#32 movk x1, #0x0, lsl llvm#16 movk x1, #0x0 stp x0, x1, [sp, #-16]!
adrp x0, __bolt_instr_ind_call_handler_func add x0, x0, #:lo12:__bolt_instr_ind_call_handler_func blr x0 __bolt_instr_ind_call_handler: (exit snippet) msr nzcv, x1 ldp x0, x1, [sp], llvm#16 ldr x16, [sp], llvm#16 ldp x0, x1, [sp], llvm#16 br x16 <- overwrites the original value in X16 __bolt_instr_ind_call_handler_func: (entry snippet) stp x0, x1, [sp, #-16]! mrs x1, nzcv adrp x0, __bolt_instr_ind_call_handler add x0, x0, #:lo12:__bolt_instr_ind_call_handler ldr x0, [x0] cmp x0, #0x0 b.eq __bolt_instr_ind_call_handler str x30, [sp, #-16]! blr x0 <--- runtime lib store/load all regs ldr x30, [sp], llvm#16 b __bolt_instr_ind_call_handler _________________________________________________________________________ After: mov x16, foo indirectCall: adrp x8, Label add x8, x8, #:lo12:Label blr x8 Instrumented indirect call: stp x0, x1, [sp, #-16]! mov x0, x8 movk x1, #0x0, lsl llvm#48 movk x1, #0x0, lsl llvm#32 movk x1, #0x0, lsl llvm#16 movk x1, #0x0 stp x0, x30, [sp, #-16]! adrp x8, __bolt_instr_ind_call_handler_func add x8, x8, #:lo12:__bolt_instr_ind_call_handler_func blr x8 <--- call trampoline instr lib ldp x0, x30, [sp], llvm#16 mov x8, x0 <---- restore original target ldp x0, x1, [sp], llvm#16 blr x8 <--- original indirect call instruction // don't touch regs besides x0, x1 __bolt_instr_ind_call_handler: (exit snippet) ret <---- return to original function with indirect call __bolt_instr_ind_call_handler_func: (entry snippet) adrp x0, __bolt_instr_ind_call_handler add x0, x0, #:lo12:__bolt_instr_ind_call_handler ldr x0, [x0] cmp x0, #0x0 b.eq __bolt_instr_ind_call_handler str x30, [sp, #-16]!
blr x0 <--- runtime lib store/load all regs ldr x30, [sp], llvm#16 b __bolt_instr_ind_call_handler --- bolt/include/bolt/Core/MCPlusBuilder.h | 5 + bolt/lib/Passes/Instrumentation.cpp | 9 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 141 +++++++++++------- bolt/runtime/instr.cpp | 10 +- bolt/runtime/sys_aarch64.h | 6 +- .../AArch64/instrumentation-ind-call.c | 56 ++++++- 6 files changed, 164 insertions(+), 63 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index a318ef0b6bd68..c8f4e2aa8c580 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -538,6 +538,11 @@ class MCPlusBuilder { llvm_unreachable("not implemented"); } + virtual void createDirectBranch(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) { + llvm_unreachable("not implemented"); + } + virtual MCPhysReg getX86R11() const { llvm_unreachable("not implemented"); } virtual unsigned getShortBranchOpcode(unsigned Opcode) const { diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp index 150461b020f06..10479f35d8f9d 100644 --- a/bolt/lib/Passes/Instrumentation.cpp +++ b/bolt/lib/Passes/Instrumentation.cpp @@ -305,9 +305,12 @@ void Instrumentation::instrumentIndirectTarget(BinaryBasicBlock &BB, : IndCallHandlerExitBBFunction->getSymbol(), IndCallSiteID, &*BC.Ctx); - Iter = BB.eraseInstruction(Iter); - Iter = insertInstructions(CounterInstrs, BB, Iter); - --Iter; + if (!BC.isAArch64()) { + Iter = BB.eraseInstruction(Iter); + Iter = insertInstructions(CounterInstrs, BB, Iter); + --Iter; + } else + Iter = insertInstructions(CounterInstrs, BB, Iter); } bool Instrumentation::instrumentOneTarget( diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index af87d5c12b5ce..dc7644fbabdcf 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -48,14 
+48,14 @@ static cl::opt NoLSEAtomics( namespace { -static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) { +[[maybe_unused]] static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) { Inst.setOpcode(AArch64::MRS); Inst.clear(); Inst.addOperand(MCOperand::createReg(RegName)); Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); } -static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) { +[[maybe_unused]] static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) { Inst.setOpcode(AArch64::MSR); Inst.clear(); Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); @@ -2114,6 +2114,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { convertJmpToTailCall(Inst); } + void createDirectBranch(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { + Inst.setOpcode(AArch64::B); + Inst.clear(); + Inst.addOperand(MCOperand::createExpr(getTargetExprFor( + Inst, MCSymbolRefExpr::create(Target, *Ctx), *Ctx, 0))); + } + bool analyzeBranch(InstructionIterator Begin, InstructionIterator End, const MCSymbol *&TBB, const MCSymbol *&FBB, MCInst *&CondBranch, @@ -2471,21 +2479,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { } InstructionListType createInstrumentedIndCallHandlerExitBB() const override { - InstructionListType Insts(5); // Code sequence for instrumented indirect call handler: - // msr nzcv, x1 - // ldp x0, x1, [sp], #16 - // ldr x16, [sp], #16 - // ldp x0, x1, [sp], #16 - // br x16 - setSystemFlag(Insts[0], AArch64::X1); - createPopRegisters(Insts[1], AArch64::X0, AArch64::X1); - // Here we load address of the next function which should be called in the - // original binary to X16 register. Writing to X16 is permitted without - // needing to restore. 
- loadReg(Insts[2], AArch64::X16, AArch64::SP); - createPopRegisters(Insts[3], AArch64::X0, AArch64::X1); - createIndirectBranch(Insts[4], AArch64::X16, 0); + // ret + + InstructionListType Insts; + + Insts.emplace_back(); + createReturn(Insts.back()); + return Insts; } @@ -2561,39 +2562,59 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { MCSymbol *HandlerFuncAddr, int CallSiteID, MCContext *Ctx) override { - InstructionListType Insts; // Code sequence used to enter indirect call instrumentation helper: - // stp x0, x1, [sp, #-16]! createPushRegisters - // mov target x0 convertIndirectCallToLoad -> orr x0 target xzr + // stp x0, x1, [sp, #-16]! createPushRegisters (1) + // mov target, x0 convertIndirectCallToLoad -> orr x0 target xzr // mov x1 CallSiteID createLoadImmediate -> // movk x1, #0x0, lsl #48 // movk x1, #0x0, lsl #32 // movk x1, #0x0, lsl #16 // movk x1, #0x0 - // stp x0, x1, [sp, #-16]! - // bl *HandlerFuncAddr createIndirectCall -> + // stp x0, x30, [sp, #-16]! (2) // adr x0 *HandlerFuncAddr -> adrp + add - // blr x0 + // blr x0 (__bolt_instr_ind_call_handler_func) + // ldp x0, x30, [sp], #16 (2) + // mov x0, target ; move target address to used register + // ldp x0, x1, [sp], #16 (1) + + InstructionListType Insts; Insts.emplace_back(); - createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); + createPushRegisters(Insts.back(), getIntArgRegister(0), + getIntArgRegister(1)); Insts.emplace_back(CallInst); - convertIndirectCallToLoad(Insts.back(), AArch64::X0); + convertIndirectCallToLoad(Insts.back(), getIntArgRegister(0)); InstructionListType LoadImm = createLoadImmediate(getIntArgRegister(1), CallSiteID); Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end()); Insts.emplace_back(); - createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); + createPushRegisters(Insts.back(), getIntArgRegister(0), AArch64::LR); Insts.resize(Insts.size() + 2); - InstructionListType Addr = - materializeAddress(HandlerFuncAddr, Ctx, AArch64::X0); + 
InstructionListType Addr = materializeAddress( + HandlerFuncAddr, Ctx, CallInst.getOperand(0).getReg()); assert(Addr.size() == 2 && "Invalid Addr size"); std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size()); + Insts.emplace_back(); - createIndirectCallInst(Insts.back(), isTailCall(CallInst), AArch64::X0); + createIndirectCallInst(Insts.back(), false, + CallInst.getOperand(0).getReg()); - // Carry over metadata including tail call marker if present. - stripAnnotations(Insts.back()); - moveAnnotations(std::move(CallInst), Insts.back()); + Insts.emplace_back(); + createPopRegisters(Insts.back(), getIntArgRegister(0), AArch64::LR); + + // move x0 to indirect call register + Insts.emplace_back(); + Insts.back().setOpcode(AArch64::ORRXrs); + Insts.back().insert(Insts.back().begin(), + MCOperand::createReg(CallInst.getOperand(0).getReg())); + Insts.back().insert(Insts.back().begin() + 1, + MCOperand::createReg(AArch64::XZR)); + Insts.back().insert(Insts.back().begin() + 2, + MCOperand::createReg(getIntArgRegister(0))); + Insts.back().insert(Insts.back().begin() + 3, MCOperand::createImm(0)); + + Insts.emplace_back(); + createPopRegisters(Insts.back(), getIntArgRegister(0), + getIntArgRegister(1)); return Insts; } @@ -2602,12 +2623,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline, const MCSymbol *IndCallHandler, MCContext *Ctx) override { - // Code sequence used to check whether InstrTampoline was initialized + // Code sequence used to check whether InstrTrampoline was initialized // and call it if so, returns via IndCallHandler - // stp x0, x1, [sp, #-16]! - // mrs x1, nzcv - // adr x0, InstrTrampoline -> adrp + add - // ldr x0, [x0] + // adrp x0, InstrTrampoline + // ldr x0, [x0, #lo12:InstrTrampoline] // subs x0, x0, #0x0 // b.eq IndCallHandler // str x30, [sp, #-16]! 
@@ -2615,30 +2634,42 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // ldr x30, [sp], #16 // b IndCallHandler InstructionListType Insts; + + // load handler address + MCInst InstAdrp; + InstAdrp.setOpcode(AArch64::ADRP); + InstAdrp.addOperand(MCOperand::createReg(getIntArgRegister(0))); + InstAdrp.addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(InstAdrp, /* OpNum */ 1, InstrTrampoline, + /* Addend */ 0, Ctx, ELF::R_AARCH64_ADR_GOT_PAGE); + Insts.emplace_back(InstAdrp); + + MCInst InstLoad; + InstLoad.setOpcode(AArch64::LDRXui); + InstLoad.addOperand(MCOperand::createReg(getIntArgRegister(0))); + InstLoad.addOperand(MCOperand::createReg(getIntArgRegister(0))); + InstLoad.addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(InstLoad, /* OpNum */ 2, InstrTrampoline, + /* Addend */ 0, Ctx, ELF::R_AARCH64_LD64_GOT_LO12_NC); + Insts.emplace_back(InstLoad); + + InstructionListType CmpJmp = + createCmpJE(getIntArgRegister(0), 0, IndCallHandler, Ctx); + Insts.insert(Insts.end(), CmpJmp.begin(), CmpJmp.end()); + Insts.emplace_back(); - createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); - Insts.emplace_back(); - getSystemFlag(Insts.back(), getIntArgRegister(1)); - Insts.emplace_back(); - Insts.emplace_back(); - InstructionListType Addr = - materializeAddress(InstrTrampoline, Ctx, AArch64::X0); - std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size()); - assert(Addr.size() == 2 && "Invalid Addr size"); - Insts.emplace_back(); - loadReg(Insts.back(), AArch64::X0, AArch64::X0); - InstructionListType cmpJmp = - createCmpJE(AArch64::X0, 0, IndCallHandler, Ctx); - Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end()); - Insts.emplace_back(); - storeReg(Insts.back(), AArch64::LR, AArch64::SP); + storeReg(Insts.back(), AArch64::LR, getSpRegister(/*Size*/ 8)); + Insts.emplace_back(); Insts.back().setOpcode(AArch64::BLR); - Insts.back().addOperand(MCOperand::createReg(AArch64::X0)); + 
Insts.back().addOperand(MCOperand::createReg(getIntArgRegister(0))); + Insts.emplace_back(); - loadReg(Insts.back(), AArch64::LR, AArch64::SP); + loadReg(Insts.back(), AArch64::LR, getSpRegister(/*Size*/ 8)); + Insts.emplace_back(); - createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true); + createDirectBranch(Insts.back(), IndCallHandler, Ctx); + return Insts; } diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp index f586db2b0f9ba..634ade6bdd407 100644 --- a/bolt/runtime/instr.cpp +++ b/bolt/runtime/instr.cpp @@ -1691,9 +1691,12 @@ instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) { extern "C" __attribute((naked)) void __bolt_instr_indirect_call() { #if defined(__aarch64__) + // the target address is placed on stack + // the identifier of the indirect call site is placed in X1 register + // clang-format off __asm__ __volatile__(SAVE_ALL - "ldp x0, x1, [sp, #288]\n" + "ldr x0, [sp, #272]\n" "bl instrumentIndirectCall\n" RESTORE_ALL "ret\n" @@ -1728,9 +1731,12 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call() extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall() { #if defined(__aarch64__) + // the target address is placed on stack + // the identifier of the indirect call site is placed in X1 register + // clang-format off __asm__ __volatile__(SAVE_ALL - "ldp x0, x1, [sp, #288]\n" + "ldr x0, [sp, #272]\n" "bl instrumentIndirectCall\n" RESTORE_ALL "ret\n" diff --git a/bolt/runtime/sys_aarch64.h b/bolt/runtime/sys_aarch64.h index b1d04f9d558e0..9cb8e022f58df 100644 --- a/bolt/runtime/sys_aarch64.h +++ b/bolt/runtime/sys_aarch64.h @@ -18,10 +18,12 @@ "stp x24, x25, [sp, #-16]!\n" \ "stp x26, x27, [sp, #-16]!\n" \ "stp x28, x29, [sp, #-16]!\n" \ - "str x30, [sp,#-16]!\n" + "mrs x29, nzcv\n" \ + "stp x29, x30, [sp, #-16]!\n" // Mirrors SAVE_ALL #define RESTORE_ALL \ - "ldr x30, [sp], #16\n" \ + "ldp x29, x30, [sp], #16\n" \ + "msr nzcv, x29\n" \ "ldp x28, x29, [sp], #16\n" \ "ldp x26, x27, [sp], 
#16\n" \ "ldp x24, x25, [sp], #16\n" \ diff --git a/bolt/test/runtime/AArch64/instrumentation-ind-call.c b/bolt/test/runtime/AArch64/instrumentation-ind-call.c index f9056da333b4e..eddecba4d8b52 100644 --- a/bolt/test/runtime/AArch64/instrumentation-ind-call.c +++ b/bolt/test/runtime/AArch64/instrumentation-ind-call.c @@ -15,9 +15,63 @@ int main() { REQUIRES: system-linux,bolt-runtime RUN: %clang %cflags %s -o %t.exe -Wl,-q -no-pie -fpie +RUN: llvm-objdump --disassemble-symbols=main %t.exe \ +RUN: | FileCheck %s --check-prefix=CHECKINDIRECTREG + +CHECKINDIRECTREG: mov w0, #0xa +CHECKINDIRECTREG-NEXT: mov w1, #0x14 +CHECKINDIRECTREG-NEXT: blr x8 RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \ -RUN: -o %t.instrumented +RUN: -o %t.instrumented \ +RUN: | FileCheck %s --check-prefix=CHECK-INSTR-LOG + +CHECK-INSTR-LOG: BOLT-INSTRUMENTER: Number of indirect call site descriptors: 1 + +RUN: llvm-objdump --disassemble-symbols=main %t.instrumented \ +RUN: | FileCheck %s --check-prefix=CHECK-INSTR-INDIRECTREG + +RUN: llvm-objdump --disassemble-symbols=__bolt_instr_ind_call_handler \ +RUN: %t.instrumented | FileCheck %s --check-prefix=CHECK-INSTR-INDIR-CALL +RUN: llvm-objdump --disassemble-symbols=__bolt_instr_ind_call_handler_func \ +RUN: %t.instrumented | FileCheck %s --check-prefix=CHECK-INSTR-INDIR-CALL-FUNC + +CHECK-INSTR-INDIRECTREG: mov w0, #0xa +CHECK-INSTR-INDIRECTREG-NEXT: mov w1, #0x14 +// store current values +CHECK-INSTR-INDIRECTREG-NEXT: stp x0, x1, {{.*}} +// store the indirect target address in x0 +CHECK-INSTR-INDIRECTREG-NEXT: mov x0, x8 +// load callsite id into x1 +CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}} +CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}} +CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}} +CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}} +CHECK-INSTR-INDIRECTREG-NEXT: stp x0, x30, {{.*}} +CHECK-INSTR-INDIRECTREG-NEXT: adrp x8, {{.*}} +CHECK-INSTR-INDIRECTREG-NEXT: add x8, {{.*}} +// call instrumentation library handler 
function +CHECK-INSTR-INDIRECTREG-NEXT: blr x8 +// restore registers saved before +CHECK-INSTR-INDIRECTREG-NEXT: ldp x0, x30, {{.*}} +CHECK-INSTR-INDIRECTREG-NEXT: mov x8, x0 +CHECK-INSTR-INDIRECTREG-NEXT: ldp x0, x1, {{.*}} +// original indirect call instruction +CHECK-INSTR-INDIRECTREG-NEXT: blr x8 + + +CHECK-INSTR-INDIR-CALL: __bolt_instr_ind_call_handler>: +CHECK-INSTR-INDIR-CALL-NEXT: ret + +CHECK-INSTR-INDIR-CALL-FUNC: __bolt_instr_ind_call_handler_func>: +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: adrp x0 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: ldr x0 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: cmp x0, #0x0 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: b.eq{{.*}}__bolt_instr_ind_call_handler +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: str x30 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: blr x0 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: ldr x30 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: b{{.*}}__bolt_instr_ind_call_handler # Instrumented program needs to finish returning zero RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT From 2e655c23deec6076f0198fafbe5c2b1deb6d5b98 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Thu, 27 Nov 2025 22:07:39 +0100 Subject: [PATCH 07/13] [CIR] Upstream TryCallOp (#165303) Upstream TryCall Op as a prerequisite for Try Catch work Issue https://github.com/llvm/llvm-project/issues/154992 --- .../clang/CIR/Dialect/IR/CIRDialect.td | 1 + clang/include/clang/CIR/Dialect/IR/CIROps.td | 92 +++++++++++++++- clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 104 ++++++++++++++++-- clang/test/CIR/IR/try-call.cir | 35 ++++++ 4 files changed, 224 insertions(+), 8 deletions(-) create mode 100644 clang/test/CIR/IR/try-call.cir diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td index e91537186df59..34df9af7fc06d 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td @@ -44,6 +44,7 @@ def CIR_Dialect : Dialect { static llvm::StringRef getModuleLevelAsmAttrName() { return 
"cir.module_asm"; } static llvm::StringRef getGlobalCtorsAttrName() { return "cir.global_ctors"; } static llvm::StringRef getGlobalDtorsAttrName() { return "cir.global_dtors"; } + static llvm::StringRef getOperandSegmentSizesAttrName() { return "operandSegmentSizes"; } void registerAttributes(); void registerTypes(); diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 777b49434f119..5f5fab6f12300 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -2728,7 +2728,7 @@ def CIR_LLVMIntrinsicCallOp : CIR_Op<"call_llvm_intrinsic"> { } //===----------------------------------------------------------------------===// -// CallOp +// CallOp and TryCallOp //===----------------------------------------------------------------------===// def CIR_SideEffect : CIR_I32EnumAttr< @@ -2855,6 +2855,96 @@ def CIR_CallOp : CIR_CallOpBase<"call", [NoRegionArguments]> { ]; } +def CIR_TryCallOp : CIR_CallOpBase<"try_call",[ + Terminator +]> { + let summary = "try_call operation"; + let description = [{ + Similar to `cir.call` but requires two destination blocks, + one which is used if the call returns without throwing an + exception (the "normal" destination) and another which is used + if an exception is thrown (the "unwind" destination). + + This operation is used only after the CFG flattening pass. + + Example: + + ```mlir + // Before CFG flattening + cir.try { + %call = cir.call @division(%a, %b) : () -> !s32i + cir.yield + } catch all { + cir.yield + } + + // After CFG flattening + %call = cir.try_call @division(%a, %b) ^normalDest, ^unwindDest + : (f32, f32) -> f32 + ^normalDest: + cir.br ^afterTryBlock + ^unwindDest: + %exception_ptr, %type_id = cir.eh.inflight_exception + cir.br ^catchHandlerBlock(%exception_ptr : !cir.ptr) + ^catchHandlerBlock: + ...
+ ``` + }]; + + let arguments = commonArgs; + let results = (outs Optional:$result); + let successors = (successor + AnySuccessor:$normalDest, + AnySuccessor:$unwindDest + ); + + let skipDefaultBuilders = 1; + let hasLLVMLowering = false; + + let builders = [ + OpBuilder<(ins "mlir::SymbolRefAttr":$callee, + "mlir::Type":$resType, + "mlir::Block *":$normalDest, + "mlir::Block *":$unwindDest, + CArg<"mlir::ValueRange", "{}">:$callOperands, + CArg<"SideEffect", "SideEffect::All">:$sideEffect), [{ + $_state.addOperands(callOperands); + + if (callee) + $_state.addAttribute("callee", callee); + if (resType && !isa(resType)) + $_state.addTypes(resType); + + $_state.addAttribute("side_effect", + SideEffectAttr::get($_builder.getContext(), sideEffect)); + + // Handle branches + $_state.addSuccessors(normalDest); + $_state.addSuccessors(unwindDest); + }]>, + OpBuilder<(ins "mlir::Value":$ind_target, + "FuncType":$fn_type, + "mlir::Block *":$normalDest, + "mlir::Block *":$unwindDest, + CArg<"mlir::ValueRange", "{}">:$callOperands, + CArg<"SideEffect", "SideEffect::All">:$sideEffect), [{ + ::llvm::SmallVector finalCallOperands({ind_target}); + finalCallOperands.append(callOperands.begin(), callOperands.end()); + $_state.addOperands(finalCallOperands); + + if (!fn_type.hasVoidReturn()) + $_state.addTypes(fn_type.getReturnType()); + + $_state.addAttribute("side_effect", + SideEffectAttr::get($_builder.getContext(), sideEffect)); + + // Handle branches + $_state.addSuccessors(normalDest); + $_state.addSuccessors(unwindDest); + }]> + ]; +} + //===----------------------------------------------------------------------===// // AwaitOp //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index f1bacff7fc691..d505ca141d383 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -721,8 +721,28 @@ unsigned 
cir::CallOp::getNumArgOperands() { return this->getOperation()->getNumOperands(); } +static mlir::ParseResult +parseTryCallDestinations(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + mlir::Block *normalDestSuccessor; + if (parser.parseSuccessor(normalDestSuccessor)) + return mlir::failure(); + + if (parser.parseComma()) + return mlir::failure(); + + mlir::Block *unwindDestSuccessor; + if (parser.parseSuccessor(unwindDestSuccessor)) + return mlir::failure(); + + result.addSuccessors(normalDestSuccessor); + result.addSuccessors(unwindDestSuccessor); + return mlir::success(); +} + static mlir::ParseResult parseCallCommon(mlir::OpAsmParser &parser, - mlir::OperationState &result) { + mlir::OperationState &result, + bool hasDestinationBlocks = false) { llvm::SmallVector ops; llvm::SMLoc opsLoc; mlir::FlatSymbolRefAttr calleeAttr; @@ -749,6 +769,11 @@ static mlir::ParseResult parseCallCommon(mlir::OpAsmParser &parser, if (parser.parseRParen()) return mlir::failure(); + if (hasDestinationBlocks && + parseTryCallDestinations(parser, result).failed()) { + return ::mlir::failure(); + } + if (parser.parseOptionalKeyword("nothrow").succeeded()) result.addAttribute(CIRDialect::getNoThrowAttrName(), mlir::UnitAttr::get(parser.getContext())); @@ -788,7 +813,9 @@ static void printCallCommon(mlir::Operation *op, mlir::FlatSymbolRefAttr calleeSym, mlir::Value indirectCallee, mlir::OpAsmPrinter &printer, bool isNothrow, - cir::SideEffect sideEffect) { + cir::SideEffect sideEffect, + mlir::Block *normalDest = nullptr, + mlir::Block *unwindDest = nullptr) { printer << ' '; auto callLikeOp = mlir::cast(op); @@ -802,8 +829,18 @@ static void printCallCommon(mlir::Operation *op, assert(indirectCallee); printer << indirectCallee; } + printer << "(" << ops << ")"; + if (normalDest) { + assert(unwindDest && "expected two successors"); + auto tryCall = cast(op); + printer << ' ' << tryCall.getNormalDest(); + printer << ","; + printer << ' '; + printer << 
tryCall.getUnwindDest(); + } + if (isNothrow) printer << " nothrow"; @@ -813,11 +850,11 @@ static void printCallCommon(mlir::Operation *op, printer << ")"; } - printer.printOptionalAttrDict(op->getAttrs(), - {CIRDialect::getCalleeAttrName(), - CIRDialect::getNoThrowAttrName(), - CIRDialect::getSideEffectAttrName()}); - + llvm::SmallVector<::llvm::StringRef> elidedAttrs = { + CIRDialect::getCalleeAttrName(), CIRDialect::getNoThrowAttrName(), + CIRDialect::getSideEffectAttrName(), + CIRDialect::getOperandSegmentSizesAttrName()}; + printer.printOptionalAttrDict(op->getAttrs(), elidedAttrs); printer << " : "; printer.printFunctionalType(op->getOperands().getTypes(), op->getResultTypes()); @@ -898,6 +935,59 @@ cir::CallOp::verifySymbolUses(SymbolTableCollection &symbolTable) { return verifyCallCommInSymbolUses(*this, symbolTable); } +//===----------------------------------------------------------------------===// +// TryCallOp +//===----------------------------------------------------------------------===// + +mlir::OperandRange cir::TryCallOp::getArgOperands() { + if (isIndirect()) + return getArgs().drop_front(1); + return getArgs(); +} + +mlir::MutableOperandRange cir::TryCallOp::getArgOperandsMutable() { + mlir::MutableOperandRange args = getArgsMutable(); + if (isIndirect()) + return args.slice(1, args.size() - 1); + return args; +} + +mlir::Value cir::TryCallOp::getIndirectCall() { + assert(isIndirect()); + return getOperand(0); +} + +/// Return the operand at index 'i'. +Value cir::TryCallOp::getArgOperand(unsigned i) { + if (isIndirect()) + ++i; + return getOperand(i); +} + +/// Return the number of operands. 
+unsigned cir::TryCallOp::getNumArgOperands() { + if (isIndirect()) + return this->getOperation()->getNumOperands() - 1; + return this->getOperation()->getNumOperands(); +} + +LogicalResult +cir::TryCallOp::verifySymbolUses(SymbolTableCollection &symbolTable) { + return verifyCallCommInSymbolUses(*this, symbolTable); +} + +mlir::ParseResult cir::TryCallOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + return parseCallCommon(parser, result, /*hasDestinationBlocks=*/true); +} + +void cir::TryCallOp::print(::mlir::OpAsmPrinter &p) { + mlir::Value indirectCallee = isIndirect() ? getIndirectCall() : nullptr; + cir::SideEffect sideEffect = getSideEffect(); + printCallCommon(*this, getCalleeAttr(), indirectCallee, p, getNothrow(), + sideEffect, getNormalDest(), getUnwindDest()); +} + //===----------------------------------------------------------------------===// // ReturnOp //===----------------------------------------------------------------------===// diff --git a/clang/test/CIR/IR/try-call.cir b/clang/test/CIR/IR/try-call.cir new file mode 100644 index 0000000000000..39db43aee40c1 --- /dev/null +++ b/clang/test/CIR/IR/try-call.cir @@ -0,0 +1,35 @@ +// RUN: cir-opt %s --verify-roundtrip | FileCheck %s + +!s32i = !cir.int + +module { + +cir.func private @division(%a: !s32i, %b: !s32i) -> !s32i + +cir.func @flatten_structure_with_try_call_op() { + %a = cir.const #cir.int<1> : !s32i + %b = cir.const #cir.int<2> : !s32i + %3 = cir.try_call @division(%a, %b) ^normal, ^unwind : (!s32i, !s32i) -> !s32i + ^normal: + cir.br ^end + ^unwind: + cir.br ^end + ^end: + cir.return +} + +// CHECK: cir.func private @division(!s32i, !s32i) -> !s32i + +// CHECK: cir.func @flatten_structure_with_try_call_op() { +// CHECK-NEXT: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CHECK-NEXT: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// CHECK-NEXT: %[[CALL:.*]] = cir.try_call @division(%[[CONST_1]], %[[CONST_2]]) ^[[NORMAL:.*]], ^[[UNWIND:.*]] : (!s32i, !s32i) -> 
!s32i +// CHECK-NEXT: ^[[NORMAL]]: +// CHECK-NEXT: cir.br ^[[END:.*]] +// CHECK-NEXT: ^[[UNWIND]]: +// CHECK-NEXT: cir.br ^[[END:.*]] +// CHECK-NEXT: ^[[END]]: +// CHECK-NEXT: cir.return +// CHECK-NEXT: } + +} From 8f36135aea5dfbb5f090ca9ad055094c9913f735 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 27 Nov 2025 21:23:08 +0000 Subject: [PATCH 08/13] [VPlan] Add m_Intrinsic matcher that takes a variable intrinsic ID (NFC) Add a variant of m_Intrinsic that matches a variable runtime ID. --- .../Vectorize/VPlanConstruction.cpp | 20 +++++-------------- .../Transforms/Vectorize/VPlanPatternMatch.h | 5 +++++ 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 5fbd61a929fe2..92969c8ed9ec0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -845,23 +845,13 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { if (!MinMaxR) return nullptr; - auto *RepR = dyn_cast(MinMaxR); - if (!isa(MinMaxR) && - !(RepR && isa(RepR->getUnderlyingInstr()))) + // Check that MinMaxR is a VPWidenIntrinsicRecipe or VPReplicateRecipe + // with an intrinsic that matches the reduction kind. + Intrinsic::ID ExpectedIntrinsicID = + getMinMaxReductionIntrinsicOp(RedPhiR->getRecurrenceKind()); + if (!match(MinMaxR, m_Intrinsic(ExpectedIntrinsicID))) return nullptr; -#ifndef NDEBUG - Intrinsic::ID RdxIntrinsicId = - RedPhiR->getRecurrenceKind() == RecurKind::FMaxNum ? 
Intrinsic::maxnum - : Intrinsic::minnum; - assert(((isa(MinMaxR) && - cast(MinMaxR)->getVectorIntrinsicID() == - RdxIntrinsicId) || - (RepR && cast(RepR->getUnderlyingInstr()) - ->getIntrinsicID() == RdxIntrinsicId)) && - "Intrinsic did not match recurrence kind"); -#endif - if (MinMaxR->getOperand(0) == RedPhiR) return MinMaxR->getOperand(1); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 79275e7bffdd0..07dfe31eea46d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -856,6 +856,11 @@ template inline IntrinsicID_match m_Intrinsic() { return IntrinsicID_match(IntrID); } +/// Match intrinsic calls with a runtime intrinsic ID. +inline IntrinsicID_match m_Intrinsic(Intrinsic::ID IntrID) { + return IntrinsicID_match(IntrID); +} + template inline typename m_Intrinsic_Ty::Ty m_Intrinsic(const T0 &Op0) { return m_CombineAnd(m_Intrinsic(), m_Argument<0>(Op0)); From 07d14cb6d3e0319b4f95bfaca1502c4a8dc02910 Mon Sep 17 00:00:00 2001 From: owenca Date: Thu, 27 Nov 2025 13:51:27 -0800 Subject: [PATCH 09/13] [clang-format][NFC] Remove the parameter of parseRequires...() (#169773) --- clang/lib/Format/UnwrappedLineParser.cpp | 87 +++++++++--------------- clang/lib/Format/UnwrappedLineParser.h | 4 +- 2 files changed, 36 insertions(+), 55 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 50edca43ebb92..19c83d3910902 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2363,12 +2363,9 @@ bool UnwrappedLineParser::tryToParseLambda() { Arrow = FormatTok; nextToken(); break; - case tok::kw_requires: { - auto *RequiresToken = FormatTok; - nextToken(); - parseRequiresClause(RequiresToken); + case tok::kw_requires: + parseRequiresClause(); break; - } case tok::equal: if (!InTemplateParameterList) return true; @@ -2580,12 +2577,9 
@@ bool UnwrappedLineParser::parseBracedList(bool IsAngleBracket, bool IsEnum) { if (IsEnum && !Style.AllowShortEnumsOnASingleLine) addUnwrappedLine(); break; - case tok::kw_requires: { - auto *RequiresToken = FormatTok; - nextToken(); - parseRequiresExpression(RequiresToken); + case tok::kw_requires: + parseRequiresExpression(); break; - } default: nextToken(); break; @@ -2727,12 +2721,9 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType, else nextToken(); break; - case tok::kw_requires: { - auto RequiresToken = FormatTok; - nextToken(); - parseRequiresExpression(RequiresToken); + case tok::kw_requires: + parseRequiresExpression(); break; - } case tok::ampamp: if (AmpAmpTokenType != TT_Unknown) FormatTok->setFinalizedType(AmpAmpTokenType); @@ -3467,23 +3458,20 @@ void UnwrappedLineParser::parseAccessSpecifier() { /// \returns true if it parsed a clause. bool UnwrappedLineParser::parseRequires(bool SeenEqual) { assert(FormatTok->is(tok::kw_requires) && "'requires' expected"); - auto RequiresToken = FormatTok; // We try to guess if it is a requires clause, or a requires expression. For - // that we first consume the keyword and check the next token. - nextToken(); - - switch (FormatTok->Tok.getKind()) { + // that we first check the next token. + switch (Tokens->peekNextToken(/*SkipComment=*/true)->Tok.getKind()) { case tok::l_brace: // This can only be an expression, never a clause. - parseRequiresExpression(RequiresToken); + parseRequiresExpression(); return false; case tok::l_paren: // Clauses and expression can start with a paren, it's unclear what we have. break; default: // All other tokens can only be a clause. - parseRequiresClause(RequiresToken); + parseRequiresClause(); return true; } @@ -3494,13 +3482,13 @@ bool UnwrappedLineParser::parseRequires(bool SeenEqual) { // requires (C && ... // But first let's look behind. 
- auto *PreviousNonComment = RequiresToken->getPreviousNonComment(); + auto *PreviousNonComment = FormatTok->getPreviousNonComment(); if (!PreviousNonComment || PreviousNonComment->is(TT_RequiresExpressionLBrace)) { // If there is no token, or an expression left brace, we are a requires // clause within a requires expression. - parseRequiresClause(RequiresToken); + parseRequiresClause(); return true; } @@ -3512,7 +3500,7 @@ bool UnwrappedLineParser::parseRequires(bool SeenEqual) { case tok::star: case tok::amp: // This is a requires clause. - parseRequiresClause(RequiresToken); + parseRequiresClause(); return true; case tok::ampamp: { // This can be either: @@ -3523,7 +3511,7 @@ bool UnwrappedLineParser::parseRequires(bool SeenEqual) { // void member(...) const && requires (C ... auto PrevPrev = PreviousNonComment->getPreviousNonComment(); if ((PrevPrev && PrevPrev->is(tok::kw_const)) || !SeenEqual) { - parseRequiresClause(RequiresToken); + parseRequiresClause(); return true; } break; @@ -3531,11 +3519,11 @@ bool UnwrappedLineParser::parseRequires(bool SeenEqual) { default: if (PreviousNonComment->isTypeOrIdentifier(LangOpts)) { // This is a requires clause. - parseRequiresClause(RequiresToken); + parseRequiresClause(); return true; } // It's an expression. 
- parseRequiresExpression(RequiresToken); + parseRequiresExpression(); return false; } @@ -3564,7 +3552,7 @@ bool UnwrappedLineParser::parseRequires(bool SeenEqual) { case tok::comma: if (OpenAngles == 0) { FormatTok = Tokens->setPosition(StoredPosition); - parseRequiresExpression(RequiresToken); + parseRequiresExpression(); return false; } break; @@ -3579,7 +3567,7 @@ bool UnwrappedLineParser::parseRequires(bool SeenEqual) { case tok::identifier: if (FoundType && !LastWasColonColon && OpenAngles == 0) { FormatTok = Tokens->setPosition(StoredPosition); - parseRequiresExpression(RequiresToken); + parseRequiresExpression(); return false; } FoundType = true; @@ -3594,7 +3582,7 @@ bool UnwrappedLineParser::parseRequires(bool SeenEqual) { default: if (NextToken->isTypeName(LangOpts)) { FormatTok = Tokens->setPosition(StoredPosition); - parseRequiresExpression(RequiresToken); + parseRequiresExpression(); return false; } break; @@ -3602,31 +3590,29 @@ bool UnwrappedLineParser::parseRequires(bool SeenEqual) { } // This seems to be a complicated expression, just assume it's a clause. FormatTok = Tokens->setPosition(StoredPosition); - parseRequiresClause(RequiresToken); + parseRequiresClause(); return true; } /// Parses a requires clause. -/// \param RequiresToken The requires keyword token, which starts this clause. -/// \pre We need to be on the next token after the requires keyword. /// \sa parseRequiresExpression /// /// Returns if it either has finished parsing the clause, or it detects, that /// the clause is incorrect. 
-void UnwrappedLineParser::parseRequiresClause(FormatToken *RequiresToken) { - assert(FormatTok->getPreviousNonComment() == RequiresToken); - assert(RequiresToken->is(tok::kw_requires) && "'requires' expected"); +void UnwrappedLineParser::parseRequiresClause() { + assert(FormatTok->is(tok::kw_requires) && "'requires' expected"); // If there is no previous token, we are within a requires expression, // otherwise we will always have the template or function declaration in front // of it. bool InRequiresExpression = - !RequiresToken->Previous || - RequiresToken->Previous->is(TT_RequiresExpressionLBrace); + !FormatTok->Previous || + FormatTok->Previous->is(TT_RequiresExpressionLBrace); - RequiresToken->setFinalizedType(InRequiresExpression - ? TT_RequiresClauseInARequiresExpression - : TT_RequiresClause); + FormatTok->setFinalizedType(InRequiresExpression + ? TT_RequiresClauseInARequiresExpression + : TT_RequiresClause); + nextToken(); // NOTE: parseConstraintExpression is only ever called from this function. // It could be inlined into here. @@ -3637,17 +3623,15 @@ void UnwrappedLineParser::parseRequiresClause(FormatToken *RequiresToken) { } /// Parses a requires expression. -/// \param RequiresToken The requires keyword token, which starts this clause. -/// \pre We need to be on the next token after the requires keyword. /// \sa parseRequiresClause /// /// Returns if it either has finished parsing the expression, or it detects, /// that the expression is incorrect. 
-void UnwrappedLineParser::parseRequiresExpression(FormatToken *RequiresToken) { - assert(FormatTok->getPreviousNonComment() == RequiresToken); - assert(RequiresToken->is(tok::kw_requires) && "'requires' expected"); +void UnwrappedLineParser::parseRequiresExpression() { + assert(FormatTok->is(tok::kw_requires) && "'requires' expected"); - RequiresToken->setFinalizedType(TT_RequiresExpression); + FormatTok->setFinalizedType(TT_RequiresExpression); + nextToken(); if (FormatTok->is(tok::l_paren)) { FormatTok->setFinalizedType(TT_RequiresExpressionLParen); @@ -3687,12 +3671,9 @@ void UnwrappedLineParser::parseConstraintExpression() { bool LambdaThisTimeAllowed = std::exchange(LambdaNextTimeAllowed, false); switch (FormatTok->Tok.getKind()) { - case tok::kw_requires: { - auto RequiresToken = FormatTok; - nextToken(); - parseRequiresExpression(RequiresToken); + case tok::kw_requires: + parseRequiresExpression(); break; - } case tok::l_paren: if (!TopLevelParensAllowed) diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h index 0161a5063ad40..86022d9b316c6 100644 --- a/clang/lib/Format/UnwrappedLineParser.h +++ b/clang/lib/Format/UnwrappedLineParser.h @@ -169,8 +169,8 @@ class UnwrappedLineParser { bool parseEnum(); bool parseStructLike(); bool parseRequires(bool SeenEqual); - void parseRequiresClause(FormatToken *RequiresToken); - void parseRequiresExpression(FormatToken *RequiresToken); + void parseRequiresClause(); + void parseRequiresExpression(); void parseConstraintExpression(); void parseCppExportBlock(); void parseNamespaceOrExportBlock(unsigned AddLevels); From 8459508227dd30ce32f870fbc9109fe95d51f4db Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 27 Nov 2025 21:52:15 +0000 Subject: [PATCH 10/13] [VPlan] Handle scalar VPWidenPointerInd in convertToConcreteRecipes. 
(#169338) In some cases, VPWidenPointerInductions become only used by scalars after legalizeAndOptimizeInductions was already run, for example due to some VPlan optimizations. Move the code to scalarize VPWidenPointerInductions to a helper and use it if needed. This fixes a crash after #148274 in the added test case. Fixes https://github.com/llvm/llvm-project/issues/169780 --- .../Transforms/Vectorize/VPlanTransforms.cpp | 38 +++++-- .../LoopVectorize/RISCV/pointer-induction.ll | 100 ++++++++++++++++++ 2 files changed, 128 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 4b7f90118374b..67aca48bcaf8f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -703,6 +703,23 @@ static SmallVector collectUsersRecursively(VPValue *V) { return Users.takeVector(); } +/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd +/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only +/// generates scalar values. +static VPValue * +scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, + VPlan &Plan, VPBuilder &Builder) { + const InductionDescriptor &ID = PtrIV->getInductionDescriptor(); + VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0); + VPValue *StepV = PtrIV->getOperand(1); + VPScalarIVStepsRecipe *Steps = createScalarIVSteps( + Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr, + nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder); + + return Builder.createPtrAdd(PtrIV->getStartValue(), Steps, + PtrIV->getDebugLoc(), "next.gep"); +} + /// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as /// VPWidenPointerInductionRecipe will generate vectors only. 
If some users @@ -755,16 +772,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF())) continue; - const InductionDescriptor &ID = PtrIV->getInductionDescriptor(); - VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0); - VPValue *StepV = PtrIV->getOperand(1); - VPScalarIVStepsRecipe *Steps = createScalarIVSteps( - Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr, - nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder); - - VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps, - PtrIV->getDebugLoc(), "next.gep"); - + VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder); PtrIV->replaceAllUsesWith(PtrAdd); continue; } @@ -3575,6 +3583,16 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { } if (auto *WidenIVR = dyn_cast(&R)) { + // If the recipe only generates scalars, scalarize it instead of + // expanding it. + if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) { + VPBuilder Builder(WidenIVR); + VPValue *PtrAdd = + scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder); + WidenIVR->replaceAllUsesWith(PtrAdd); + ToRemove.push_back(WidenIVR); + continue; + } expandVPWidenPointerInduction(WidenIVR, TypeInfo); ToRemove.push_back(WidenIVR); continue; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll index fa710cb8d65b1..164a5cd1ae3c0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll @@ -61,4 +61,104 @@ exit: ret void } +define i1 @scalarize_ptr_induction(ptr %start, ptr %end, ptr noalias %dst, i1 %c) #1 { +; CHECK-LABEL: define i1 @scalarize_ptr_induction( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]], ptr noalias [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[START5:%.*]] = ptrtoint ptr 
[[START]] to i64 +; CHECK-NEXT: [[END4:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END4]], -12 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START5]] +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 12 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[END1]], -12 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], [[START2]] +; CHECK-NEXT: [[TMP8:%.*]] = udiv i64 [[TMP7]], 12 +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 12 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 8 +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP10]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[START]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[DST]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, ptr [[END]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP3]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: 
[[TMP14:%.*]] = mul [[TMP13]], splat (i64 12) +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP14]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, [[VECTOR_GEP]], i64 4 +; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.vp.gather.nxv2i32.nxv2p0( align 4 [[TMP12]], splat (i1 true), i32 [[TMP11]]), !alias.scope [[META3:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = zext [[TMP18]] to +; CHECK-NEXT: [[TMP20:%.*]] = mul [[TMP19]], splat (i64 -7070675565921424023) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP20]], splat (i64 -4) +; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[TMP21]], align 1 [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP11]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr nusw i8, [[VECTOR_GEP]], i64 12 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq [[TMP16]], [[BROADCAST_SPLAT7]] +; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP26]] +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 12, [[TMP26]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[TMP26]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 2 +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement [[TMP17]], i64 [[TMP29]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, 
ptr [[PTR_IV]], i64 4 +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[EXT:%.*]] = zext i32 [[L]] to i64 +; CHECK-NEXT: [[UNUSED:%.*]] = load i32, ptr [[PTR_IV]], align 4 +; CHECK-NEXT: [[MUL1:%.*]] = mul i64 [[EXT]], -7070675565921424023 +; CHECK-NEXT: [[MUL2:%.*]] = add i64 [[MUL1]], -4 +; CHECK-NEXT: store i64 [[MUL2]], ptr [[DST]], align 1 +; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr nusw i8, ptr [[PTR_IV]], i64 12 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 true, i1 false +; CHECK-NEXT: br i1 [[OR_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[CMP_LCSSA:%.*]] = phi i1 [ [[CMP]], %[[LOOP]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i1 [[CMP_LCSSA]] +; +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop ] + %gep = getelementptr i8, ptr %ptr.iv, i64 4 + %l = load i32, ptr %gep, align 4 + %ext = zext i32 %l to i64 + %unused = load i32, ptr %ptr.iv, align 4 + %mul1 = mul i64 %ext, -7070675565921424023 + %mul2 = add i64 %mul1, -4 + store i64 %mul2, ptr %dst, align 1 + %ptr.iv.next = getelementptr nusw i8, ptr %ptr.iv, i64 12 + %cmp = icmp eq ptr %ptr.iv.next, %end + %or.cond = select i1 %cmp, i1 true, i1 false + br i1 %or.cond, label %exit, label %loop + +exit: + ret i1 %cmp +} + attributes #0 = { "target-features"="+v" } +attributes #1 = { "target-cpu"="sifive-p670" } From 06c8ee61ab80305be88380e6aa2f1b2fe32f859d Mon Sep 17 00:00:00 2001 From: "Deric C." Date: Thu, 27 Nov 2025 13:58:47 -0800 Subject: [PATCH 11/13] [NFC] [DirectX] Make DirectX codegen test `CBufferAccess/gep-ce-two-uses.ll` more strict (#169855) Continuation of PR #169848 to address PR comments. This PR makes the test more strict by adding CHECKs to ensure the loads are indeed using the same or different GEPs. 
--- .../DirectX/CBufferAccess/gep-ce-two-uses.ll | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll index 97896e598de90..8fba0a4187e81 100644 --- a/llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll +++ b/llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll @@ -17,7 +17,8 @@ define void @f(ptr %dst) { entry: ; CHECK: [[PTR:%.*]] = call ptr addrspace(2) @llvm.dx.resource.getpointer.{{.*}}(target("dx.CBuffer", %__cblayout_CB) {{%.*}}, i32 0) - ; CHECK: getelementptr inbounds nuw i8, ptr addrspace(2) [[PTR]], i32 16 + ; CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(2) [[PTR]], i32 16 + ; CHECK-COUNT-2: load float, ptr addrspace(2) [[GEP]] %a1 = load float, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a1, i32 16), align 4 store float %a1, ptr %dst, align 32 @@ -30,15 +31,17 @@ entry: ; CHECK: define void @g define void @g(ptr %dst) { entry: - ; CHECK: [[PTR:%.*]] = call ptr addrspace(2) @llvm.dx.resource.getpointer.{{.*}}(target("dx.CBuffer", %__cblayout_CB) {{%.*}}, i32 0) - ; CHECK: getelementptr inbounds nuw i8, ptr addrspace(2) [[PTR]], i32 16 + ; CHECK: [[PTR1:%.*]] = call ptr addrspace(2) @llvm.dx.resource.getpointer.{{.*}}(target("dx.CBuffer", %__cblayout_CB) {{%.*}}, i32 0) + ; CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(2) [[PTR1]], i32 16 + ; CHECK: load float, ptr addrspace(2) [[GEP1]] %a1 = load float, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a1, i32 16), align 4 store float %a1, ptr %dst, align 32 br label %next next: - ; CHECK: [[PTR:%.*]] = call ptr addrspace(2) @llvm.dx.resource.getpointer.{{.*}}(target("dx.CBuffer", %__cblayout_CB) {{%.*}}, i32 0) - ; CHECK: getelementptr inbounds nuw i8, ptr addrspace(2) [[PTR]], i32 16 + ; CHECK: [[PTR2:%.*]] = call ptr addrspace(2) 
@llvm.dx.resource.getpointer.{{.*}}(target("dx.CBuffer", %__cblayout_CB) {{%.*}}, i32 0) + ; CHECK: [[GEP2:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(2) [[PTR2]], i32 16 + ; CHECK: load float, ptr addrspace(2) [[GEP2]] %a2 = load float, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a1, i32 16), align 4 store float %a2, ptr %dst, align 32 From 583fba35247d8ffc87e0df9f9eb49ab02ea3bb8e Mon Sep 17 00:00:00 2001 From: actink Date: Fri, 28 Nov 2025 08:54:47 +0800 Subject: [PATCH 12/13] [InstCombine] fold icmp of select with invertible shl (#147182) Proof: https://alive2.llvm.org/ce/z/a5fzlJ Closes https://github.com/llvm/llvm-project/issues/146642 --------- Co-authored-by: Yingwei Zheng --- .../InstCombine/InstCombineCompares.cpp | 32 ++++- .../Transforms/InstCombine/icmp-select.ll | 117 ++++++++++++++++++ 2 files changed, 147 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index cf6e7315114dc..33eee8e059486 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -5886,6 +5886,12 @@ static void collectOffsetOp(Value *V, SmallVectorImpl &Offsets, Offsets.emplace_back(Instruction::Xor, Inst->getOperand(1)); Offsets.emplace_back(Instruction::Xor, Inst->getOperand(0)); break; + case Instruction::Shl: + if (Inst->hasNoSignedWrap()) + Offsets.emplace_back(Instruction::AShr, Inst->getOperand(1)); + if (Inst->hasNoUnsignedWrap()) + Offsets.emplace_back(Instruction::LShr, Inst->getOperand(1)); + break; case Instruction::Select: if (AllowRecursion) { collectOffsetOp(Inst->getOperand(1), Offsets, /*AllowRecursion=*/false); @@ -5942,9 +5948,31 @@ static Instruction *foldICmpEqualityWithOffset(ICmpInst &I, collectOffsetOp(Op1, OffsetOps, /*AllowRecursion=*/true); auto ApplyOffsetImpl = [&](Value *V, unsigned BinOpc, Value *RHS) -> Value * { + switch (BinOpc) { + // V = 
shl nsw X, RHS => X = ashr V, RHS + case Instruction::AShr: { + const APInt *CV, *CRHS; + if (!(match(V, m_APInt(CV)) && match(RHS, m_APInt(CRHS)) && + CV->ashr(*CRHS).shl(*CRHS) == *CV) && + !match(V, m_NSWShl(m_Value(), m_Specific(RHS)))) + return nullptr; + break; + } + // V = shl nuw X, RHS => X = lshr V, RHS + case Instruction::LShr: { + const APInt *CV, *CRHS; + if (!(match(V, m_APInt(CV)) && match(RHS, m_APInt(CRHS)) && + CV->lshr(*CRHS).shl(*CRHS) == *CV) && + !match(V, m_NUWShl(m_Value(), m_Specific(RHS)))) + return nullptr; + break; + } + default: + break; + } + Value *Simplified = simplifyBinOp(BinOpc, V, RHS, SQ); - // Avoid infinite loops by checking if RHS is an identity for the BinOp. - if (!Simplified || Simplified == V) + if (!Simplified) return nullptr; // Reject constant expressions as they don't simplify things. if (isa(Simplified) && !match(Simplified, m_ImmConstant())) diff --git a/llvm/test/Transforms/InstCombine/icmp-select.ll b/llvm/test/Transforms/InstCombine/icmp-select.ll index c6c0ba385a6fd..c29527a3c3c5e 100644 --- a/llvm/test/Transforms/InstCombine/icmp-select.ll +++ b/llvm/test/Transforms/InstCombine/icmp-select.ll @@ -835,3 +835,120 @@ define i1 @discr_eq_constantexpr(ptr %p) { %cmp = icmp eq i64 %sub, -1 ret i1 %cmp } + +define i1 @shl_nsw_eq_simplify_zero_to_self(i8 %a, i1 %cond) { +; CHECK-LABEL: @shl_nsw_eq_simplify_zero_to_self( +; CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[COND:%.*]] to i8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A:%.*]], [[TMP1]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %a, 3 + %sel = select i1 %cond, i8 8, i8 0 + %cmp = icmp eq i8 %shl, %sel + ret i1 %cmp +} + +define i1 @shl_nsw_eq(i8 %a, i1 %cond) { +; CHECK-LABEL: @shl_nsw_eq( +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[COND:%.*]], i8 1, i8 -15 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A:%.*]], [[TMP1]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %a, 3 + %sel = select i1 %cond, i8 8, i8 -120 + %cmp = icmp eq i8 %shl, %sel + ret i1 %cmp +} + 
+define i1 @shl_nuw_eq(i8 %a, i1 %cond) { +; CHECK-LABEL: @shl_nuw_eq( +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[COND:%.*]], i8 1, i8 17 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A:%.*]], [[TMP1]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nuw i8 %a, 3 + %sel = select i1 %cond, i8 8, i8 -120 + %cmp = icmp eq i8 %shl, %sel + ret i1 %cmp +} + +define i1 @shl_nsw_failed_to_simplify(i8 %a, i1 %cond) { +; CHECK-LABEL: @shl_nsw_failed_to_simplify( +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[A:%.*]], 1 +; CHECK-NEXT: [[NOT_COND:%.*]] = xor i1 [[COND:%.*]], true +; CHECK-NEXT: [[CMP:%.*]] = select i1 [[NOT_COND]], i1 [[CMP1]], i1 false +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %a, 4 + %sel = select i1 %cond, i8 8, i8 16 + %cmp = icmp eq i8 %shl, %sel + ret i1 %cmp +} + +define i1 @shl_nuw_failed_to_simplify(i8 %a, i1 %cond) { +; CHECK-LABEL: @shl_nuw_failed_to_simplify( +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[A:%.*]], 4 +; CHECK-NEXT: [[NOT_COND:%.*]] = xor i1 [[COND:%.*]], true +; CHECK-NEXT: [[CMP:%.*]] = select i1 [[NOT_COND]], i1 [[CMP1]], i1 false +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nuw i8 %a, 3 + %sel = select i1 %cond, i8 -1, i8 32 + %cmp = icmp eq i8 %shl, %sel + ret i1 %cmp +} + +define i1 @shl_failed_to_simplify(i8 %a, i1 %cond) { +; CHECK-LABEL: @shl_failed_to_simplify( +; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[A:%.*]], 3 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i8 8, i8 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[SHL]], [[SEL]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl i8 %a, 3 + %sel = select i1 %cond, i8 8, i8 32 + %cmp = icmp eq i8 %shl, %sel + ret i1 %cmp +} + +define i1 @shl_nuw_ne(i8 %a, i8 %b, i8 %c, i1 %cond) { +; CHECK-LABEL: @shl_nuw_ne( +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[COND:%.*]], i8 [[B:%.*]], i8 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl_a = shl nuw i8 %a, 3 + %shl_b = shl nuw i8 %b, 3 + %sel = select i1 %cond, i8 %shl_b, i8 32 + %cmp = icmp 
ne i8 %sel, %shl_a + ret i1 %cmp +} + +define i1 @shl_const_phi_failed_to_simplify(i64 %indvars, i32 %conv) { +; CHECK-LABEL: @shl_const_phi_failed_to_simplify( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_SLT:%.*]] = icmp slt i64 [[INDVARS:%.*]], 1 +; CHECK-NEXT: br i1 [[CMP_SLT]], label [[END:%.*]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[CONST_PHI:%.*]] = phi i32 [ 0, [[THEN]] ], [ 65535, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SHL_NUW:%.*]] = shl nuw i32 [[CONV:%.*]], 31 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_SLT]], i32 [[CONST_PHI]], i32 [[SHL_NUW]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[SEL]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %cmp_slt = icmp slt i64 %indvars, 1 + br i1 %cmp_slt, label %end, label %then + +then: + br label %end + +end: + %const_phi = phi i32 [ 0, %then ], [ 65535, %entry ] + %shl_nuw = shl nuw i32 %conv, 31 + %sel = select i1 %cmp_slt, i32 %const_phi, i32 %shl_nuw + %cmp = icmp eq i32 %sel, 0 + ret i1 %cmp +} From aa2a7f4eb1b130e9e41d9a87bffcaa47876a97fc Mon Sep 17 00:00:00 2001 From: "Mr. Walls" Date: Thu, 27 Nov 2025 17:14:05 -0800 Subject: [PATCH 13/13] [libc++] Fix the locale base API on Linux with musl (#167980) This pull request addresses an issue encountered when building **libcxx** with certain configurations (`-D_LIBCPP_HAS_MUSL_LIBC` & `-D__linux__`) that lack the `_GNU_SOURCE` definition. Specifically, this issue arises if the system **musl libc** is built with `_BSD_SOURCE` instead of `_GNU_SOURCE`. 
The resultant configuration leads to problems with the "Strtonum functions" in the file [libcxx/include/__locale_dir/support/linux.h](https://github.com/llvm/llvm-project/tree/master/libcxx/include/__locale_dir/support/linux.h), affecting the following functions: - `__strtof` - `__strtod` - `__strtold` **Error messages displayed include**: ```console error: no member named 'strtof_l' in the global namespace ``` ```console error: no member named 'strtod_l' in the global namespace ``` ```console error: no member named 'strtold_l' in the global namespace ``` For more insight, relevant code can be accessed [here](https://github.com/llvm/llvm-project/blob/79cd1b7a25cdbf42c7234999ae9bc51db30af1f0/libcxx/include/__locale_dir/support/linux.h#L85-L95). --- libcxx/include/__locale_dir/support/linux.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/libcxx/include/__locale_dir/support/linux.h b/libcxx/include/__locale_dir/support/linux.h index 1a589be49bf1d..deb657d4faced 100644 --- a/libcxx/include/__locale_dir/support/linux.h +++ b/libcxx/include/__locale_dir/support/linux.h @@ -83,15 +83,30 @@ inline _LIBCPP_HIDE_FROM_ABI __lconv_t* __localeconv(__locale_t& __loc) { // Strtonum functions // inline _LIBCPP_HIDE_FROM_ABI float __strtof(const char* __nptr, char** __endptr, __locale_t __loc) { +#if !_LIBCPP_HAS_MUSL_LIBC || defined(_GNU_SOURCE) return ::strtof_l(__nptr, __endptr, __loc); +#else + (void)__loc; + return ::strtof(__nptr, __endptr); +#endif } inline _LIBCPP_HIDE_FROM_ABI double __strtod(const char* __nptr, char** __endptr, __locale_t __loc) { +#if !_LIBCPP_HAS_MUSL_LIBC || defined(_GNU_SOURCE) return ::strtod_l(__nptr, __endptr, __loc); +#else + (void)__loc; + return ::strtod(__nptr, __endptr); +#endif } inline _LIBCPP_HIDE_FROM_ABI long double __strtold(const char* __nptr, char** __endptr, __locale_t __loc) { +#if !_LIBCPP_HAS_MUSL_LIBC || defined(_GNU_SOURCE) return ::strtold_l(__nptr, __endptr, __loc); +#else + (void)__loc; + return 
::strtold(__nptr, __endptr); +#endif } //