diff --git a/clang/lib/Index/IndexingAction.cpp b/clang/lib/Index/IndexingAction.cpp index 41f1008a475fe..47e0ab3a01886 100644 --- a/clang/lib/Index/IndexingAction.cpp +++ b/clang/lib/Index/IndexingAction.cpp @@ -23,7 +23,37 @@ using namespace clang::index; namespace { -class IndexASTConsumer : public ASTConsumer { +class IndexPPCallbacks final : public PPCallbacks { + std::shared_ptr IndexCtx; + +public: + IndexPPCallbacks(std::shared_ptr IndexCtx) + : IndexCtx(std::move(IndexCtx)) {} + + void MacroExpands(const Token &MacroNameTok, const MacroDefinition &MD, + SourceRange Range, const MacroArgs *Args) override { + IndexCtx->handleMacroReference(*MacroNameTok.getIdentifierInfo(), + Range.getBegin(), *MD.getMacroInfo()); + } + + void MacroDefined(const Token &MacroNameTok, + const MacroDirective *MD) override { + IndexCtx->handleMacroDefined(*MacroNameTok.getIdentifierInfo(), + MacroNameTok.getLocation(), + *MD->getMacroInfo()); + } + + void MacroUndefined(const Token &MacroNameTok, const MacroDefinition &MD, + const MacroDirective *Undef) override { + if (!MD.getMacroInfo()) // Ignore noop #undef. + return; + IndexCtx->handleMacroUndefined(*MacroNameTok.getIdentifierInfo(), + MacroNameTok.getLocation(), + *MD.getMacroInfo()); + } +}; + +class IndexASTConsumer final : public ASTConsumer { std::shared_ptr PP; std::shared_ptr IndexCtx; @@ -37,6 +67,7 @@ class IndexASTConsumer : public ASTConsumer { IndexCtx->setASTContext(Context); IndexCtx->getDataConsumer().initialize(Context); IndexCtx->getDataConsumer().setPreprocessor(PP); + PP->addPPCallbacks(std::make_unique(IndexCtx)); } bool HandleTopLevelDecl(DeclGroupRef DG) override { @@ -55,36 +86,6 @@ class IndexASTConsumer : public ASTConsumer { } }; -class IndexPPCallbacks : public PPCallbacks { - std::shared_ptr IndexCtx; - -public: - IndexPPCallbacks(std::shared_ptr IndexCtx) - : IndexCtx(std::move(IndexCtx)) {} - - void MacroExpands(const Token &MacroNameTok, const MacroDefinition &MD, - SourceRange Range, const MacroArgs *Args) override { - IndexCtx->handleMacroReference(*MacroNameTok.getIdentifierInfo(), - Range.getBegin(), *MD.getMacroInfo()); - } - - void MacroDefined(const Token &MacroNameTok, - const MacroDirective *MD) override { - IndexCtx->handleMacroDefined(*MacroNameTok.getIdentifierInfo(), - MacroNameTok.getLocation(), - *MD->getMacroInfo()); - } - - void MacroUndefined(const Token &MacroNameTok, const MacroDefinition &MD, - const MacroDirective *Undef) override { - if (!MD.getMacroInfo()) // Ignore noop #undef. - return; - IndexCtx->handleMacroUndefined(*MacroNameTok.getIdentifierInfo(), - MacroNameTok.getLocation(), - *MD.getMacroInfo()); - } -}; - class IndexActionBase { protected: std::shared_ptr DataConsumer; @@ -101,16 +102,12 @@ class IndexActionBase { IndexCtx); } - std::unique_ptr createIndexPPCallbacks() { - return std::make_unique(IndexCtx); - } - void finish() { DataConsumer->finish(); } }; -class IndexAction : public ASTFrontendAction, IndexActionBase { +class IndexAction final : public ASTFrontendAction, IndexActionBase { public: IndexAction(std::shared_ptr DataConsumer, IndexingOptions Opts) @@ -122,18 +119,13 @@ class IndexAction : public ASTFrontendAction, IndexActionBase { return createIndexASTConsumer(CI); } - bool BeginSourceFileAction(clang::CompilerInstance &CI) override { - CI.getPreprocessor().addPPCallbacks(createIndexPPCallbacks()); - return true; - } - void EndSourceFileAction() override { FrontendAction::EndSourceFileAction(); finish(); } }; -class WrappingIndexAction : public WrapperFrontendAction, IndexActionBase { +class WrappingIndexAction final : public WrapperFrontendAction, IndexActionBase { bool IndexActionFailed = false; public: @@ -158,12 +150,6 @@ class WrappingIndexAction : public WrapperFrontendAction, IndexActionBase { return std::make_unique(std::move(Consumers)); } - bool BeginSourceFileAction(clang::CompilerInstance &CI) override { - WrapperFrontendAction::BeginSourceFileAction(CI); - CI.getPreprocessor().addPPCallbacks(createIndexPPCallbacks()); - return true; - } - void EndSourceFileAction() override { // Invoke wrapped action's method. WrapperFrontendAction::EndSourceFileAction(); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 1b6390adc48d9..f6a705a70acd5 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -629,7 +629,9 @@ static bool SemaOpenCLBuiltinEnqueueKernel(Sema &S, CallExpr *TheCall) { unsigned NumArgs = TheCall->getNumArgs(); if (NumArgs < 4) { - S.Diag(TheCall->getBeginLoc(), diag::err_typecheck_call_too_few_args); + S.Diag(TheCall->getBeginLoc(), + diag::err_typecheck_call_too_few_args_at_least) + << 0 << 4 << NumArgs; return true; } diff --git a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl index 8946911c093cc..7e8ec3d498ccb 100644 --- a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl +++ b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl @@ -158,6 +158,8 @@ kernel void enqueue_kernel_tests() { enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, &evt); // expected-error{{illegal call to enqueue_kernel, incorrect argument types}} enqueue_kernel(default_queue, flags, ndrange, 1, 1); // expected-error{{illegal call to enqueue_kernel, incorrect argument types}} + + enqueue_kernel(default_queue, ndrange, ^{}); // expected-error{{too few arguments to function call, expected at least 4, have 3}} } // Diagnostic tests for get_kernel_work_group_size and allowed block parameter types in dynamic parallelism. diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/completion/TestCompletion.py b/lldb/packages/Python/lldbsuite/test/functionalities/completion/TestCompletion.py index 4357eef02e6d1..ea4a9c1a773fb 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/completion/TestCompletion.py +++ b/lldb/packages/Python/lldbsuite/test/functionalities/completion/TestCompletion.py @@ -144,13 +144,12 @@ def test_log_file(self): self.complete_from_to('log enable lldb expr -f ' + src_dir, ['main.cpp']) - @skipIfWindows @skipIfFreeBSD # timing out on the FreeBSD buildbot def test_log_dir(self): # Complete our source directory. src_dir = os.path.dirname(os.path.realpath(__file__)) self.complete_from_to('log enable lldb expr -f ' + src_dir, - [src_dir + "/"]) + [src_dir + os.sep], turn_off_re_match=True) # @skipIfFreeBSD # timing out on the FreeBSD buildbot diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ed8c6b81b24a6..c40c1a551c241 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16430,12 +16430,51 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { } /// Convert a disguised subvector insertion into a shuffle: -/// insert_vector_elt V, (bitcast X from vector type), IdxC --> -/// bitcast(shuffle (bitcast V), (extended X), Mask) -/// Note: We do not use an insert_subvector node because that requires a legal -/// subvector type. SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) { SDValue InsertVal = N->getOperand(1); + SDValue Vec = N->getOperand(0); + + // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N), InsIndex) + // --> (vector_shuffle X, Y) + if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() && + InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa(InsertVal.getOperand(1))) { + ShuffleVectorSDNode *SVN = cast(Vec.getNode()); + ArrayRef Mask = SVN->getMask(); + + SDValue X = Vec.getOperand(0); + SDValue Y = Vec.getOperand(1); + + // Vec's operand 0 is using indices from 0 to N-1 and + // operand 1 from N to 2N - 1, where N is the number of + // elements in the vectors. + int XOffset = -1; + if (InsertVal.getOperand(0) == X) { + XOffset = 0; + } else if (InsertVal.getOperand(0) == Y) { + XOffset = X.getValueType().getVectorNumElements(); + } + + if (XOffset != -1) { + SmallVector NewMask(Mask.begin(), Mask.end()); + + auto *ExtrIndex = cast(InsertVal.getOperand(1)); + NewMask[InsIndex] = XOffset + ExtrIndex->getZExtValue(); + assert(NewMask[InsIndex] < 2 * Vec.getValueType().getVectorNumElements() && + NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bound"); + + SDValue LegalShuffle = + TLI.buildLegalVectorShuffle(Vec.getValueType(), SDLoc(N), X, + Y, NewMask, DAG); + if (LegalShuffle) + return LegalShuffle; + } + } + + // insert_vector_elt V, (bitcast X from vector type), IdxC --> + // bitcast(shuffle (bitcast V), (extended X), Mask) + // Note: We do not use an insert_subvector node because that requires a + // legal subvector type. if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() || !InsertVal.getOperand(0).getValueType().isVector()) return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 7ceab40c1a5da..ce64359d37d64 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1627,7 +1627,6 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS, MVT OpVT = LHS.getSimpleValueType(); ISD::CondCode CCCode = cast(CC)->get(); NeedInvert = false; - bool NeedSwap = false; switch (TLI.getCondCodeAction(CCCode, OpVT)) { default: llvm_unreachable("Unknown condition code action!"); case TargetLowering::Legal: @@ -1641,6 +1640,7 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS, return true; } // Swapping operands didn't work. Try inverting the condition. + bool NeedSwap = false; InvCC = getSetCCInverse(CCCode, OpVT.isInteger()); if (!TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) { // If inverting the condition is not enough, try swapping operands diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index c882f8ccf01d6..dff7011061a47 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -428,16 +428,16 @@ void IRPosition::verify() { assert(KindOrArgNo >= 0 && "Expected argument or call site argument!"); assert((isa(AnchorVal) || isa(AnchorVal)) && "Expected call base or argument for positive attribute index!"); - if (auto *Arg = dyn_cast(AnchorVal)) { - assert(Arg->getArgNo() == unsigned(getArgNo()) && + if (isa(AnchorVal)) { + assert(cast(AnchorVal)->getArgNo() == unsigned(getArgNo()) && "Argument number mismatch!"); - assert(Arg == &getAssociatedValue() && "Associated value mismatch!"); + assert(cast(AnchorVal) == &getAssociatedValue() && + "Associated value mismatch!"); } else { - auto &CB = cast(*AnchorVal); - (void)CB; - assert(CB.arg_size() > unsigned(getArgNo()) && + assert(cast(*AnchorVal).arg_size() > unsigned(getArgNo()) && "Call site argument number mismatch!"); - assert(CB.getArgOperand(getArgNo()) == &getAssociatedValue() && + assert(cast(*AnchorVal).getArgOperand(getArgNo()) == + &getAssociatedValue() && "Associated value mismatch!"); } break; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 3a01d9cb4f4fa..1117586549e50 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3379,7 +3379,7 @@ foldICmpWithTruncSignExtendedVal(ICmpInst &I, // we should move shifts to the same hand of 'and', i.e. rewrite as // icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x) // We are only interested in opposite logical shifts here. -// One of the shifts can be truncated. For now, it can only be 'shl'. +// One of the shifts can be truncated. // If we can, we want to end up creating 'lshr' shift. static Value * foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, @@ -3413,14 +3413,6 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, "We did not look past any shifts while matching XShift though."); bool HadTrunc = WidestTy != I.getOperand(0)->getType(); - if (HadTrunc) { - // We did indeed have a truncation. For now, let's only proceed if the 'shl' - // was truncated, since that does not require any extra legality checks. - // FIXME: trunc-of-lshr. - if (!match(YShift, m_Shl(m_Value(), m_Value()))) - return nullptr; - } - // If YShift is a 'lshr', swap the shifts around. if (match(YShift, m_LShr(m_Value(), m_Value()))) std::swap(XShift, YShift); @@ -3462,16 +3454,68 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, /*isNUW=*/false, SQ.getWithInstruction(&I))); if (!NewShAmt) return nullptr; + NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, WidestTy); + unsigned WidestBitWidth = WidestTy->getScalarSizeInBits(); + // Is the new shift amount smaller than the bit width? // FIXME: could also rely on ConstantRange. - if (!match(NewShAmt, m_SpecificInt_ICMP( - ICmpInst::Predicate::ICMP_ULT, - APInt(NewShAmt->getType()->getScalarSizeInBits(), - WidestTy->getScalarSizeInBits())))) + if (!match(NewShAmt, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt(WidestBitWidth, WidestBitWidth)))) return nullptr; + + // An extra legality check is needed if we had trunc-of-lshr. + if (HadTrunc && match(WidestShift, m_LShr(m_Value(), m_Value()))) { + auto CanFold = [NewShAmt, WidestBitWidth, NarrowestShift, SQ, + WidestShift]() { + // It isn't obvious whether it's worth it to analyze non-constants here. + // Also, let's basically give up on non-splat cases, pessimizing vectors. + // If *any* of these preconditions matches we can perform the fold. + Constant *NewShAmtSplat = NewShAmt->getType()->isVectorTy() + ? NewShAmt->getSplatValue() + : NewShAmt; + // If it's edge-case shift (by 0 or by WidestBitWidth-1) we can fold. + if (NewShAmtSplat && + (NewShAmtSplat->isNullValue() || + NewShAmtSplat->getUniqueInteger() == WidestBitWidth - 1)) + return true; + // We consider *min* leading zeros so a single outlier + // blocks the transform as opposed to allowing it. + if (auto *C = dyn_cast(NarrowestShift->getOperand(0))) { + KnownBits Known = computeKnownBits(C, SQ.DL); + unsigned MinLeadZero = Known.countMinLeadingZeros(); + // If the value being shifted has at most lowest bit set we can fold. + unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero; + if (MaxActiveBits <= 1) + return true; + // Precondition: NewShAmt u<= countLeadingZeros(C) + if (NewShAmtSplat && NewShAmtSplat->getUniqueInteger().ule(MinLeadZero)) + return true; + } + if (auto *C = dyn_cast(WidestShift->getOperand(0))) { + KnownBits Known = computeKnownBits(C, SQ.DL); + unsigned MinLeadZero = Known.countMinLeadingZeros(); + // If the value being shifted has at most lowest bit set we can fold. + unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero; + if (MaxActiveBits <= 1) + return true; + // Precondition: ((WidestBitWidth-1)-NewShAmt) u<= countLeadingZeros(C) + if (NewShAmtSplat) { + APInt AdjNewShAmt = + (WidestBitWidth - 1) - NewShAmtSplat->getUniqueInteger(); + if (AdjNewShAmt.ule(MinLeadZero)) + return true; + } + } + return false; // Can't tell if it's ok. + }; + if (!CanFold()) + return nullptr; + } + // All good, we can do this fold. - NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, WidestTy); X = Builder.CreateZExt(X, WidestTy); + Y = Builder.CreateZExt(Y, WidestTy); // The shift is the same that was for X. Value *T0 = XShiftOpcode == Instruction::BinaryOps::LShr ? Builder.CreateLShr(X, NewShAmt) @@ -4981,9 +5025,9 @@ llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, // For scalars, SimplifyICmpInst should have already handled // the edge cases for us, so we just assert on them. // For vectors, we must handle the edge cases. - if (auto *CI = dyn_cast(C)) { + if (isa(C)) { // A <= MAX -> TRUE ; A >= MIN -> TRUE - assert(ConstantIsOk(CI)); + assert(ConstantIsOk(cast(C))); } else if (Type->isVectorTy()) { // TODO? If the edge cases for vectors were guaranteed to be handled as they // are for scalar, we could remove the min/max checks. However, to do that, diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll new file mode 100644 index 0000000000000..59518c85fde78 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -0,0 +1,1143 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE +; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE + +define void @foo_v4i32_v4i32(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i32> *%src) { +; CHECK-LABEL: foo_v4i32_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrne r3, [r2] +; CHECK-NEXT: vmovne.32 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrmi r3, [r2, #4] +; CHECK-NEXT: vmovmi.32 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrmi r3, [r2, #8] +; CHECK-NEXT: vmovmi.32 q0[2], r3 +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrmi r1, [r2, #12] +; CHECK-NEXT: vmovmi.32 q0[3], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne r2, s0 +; CHECK-NEXT: strne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s1 +; CHECK-NEXT: strmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s2 +; CHECK-NEXT: strmi r2, [r0, #8] +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r1, s3 +; CHECK-NEXT: strmi r1, [r0, #12] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %mask, align 4 + %1 = icmp sgt <4 x i32> %0, zeroinitializer + %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef) + call void @llvm.masked.store.v4i32(<4 x i32> %2, <4 x i32>* %dest, i32 4, <4 x i1> %1) + ret void +} + +define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) { +; CHECK-LABEL: foo_sext_v4i32_v4i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrbne r3, [r2] +; CHECK-NEXT: vmovne.32 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #1] +; CHECK-NEXT: vmovmi.32 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #2] +; CHECK-NEXT: vmovmi.32 q0[2], r3 +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r1, [r2, #3] +; CHECK-NEXT: vmovmi.32 q0[3], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne r2, s0 +; CHECK-NEXT: strne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s1 +; CHECK-NEXT: strmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s2 +; CHECK-NEXT: strmi r2, [r0, #8] +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r1, s3 +; CHECK-NEXT: strmi r1, [r0, #12] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %mask, align 4 + %1 = icmp sgt <4 x i32> %0, zeroinitializer + %2 = call <4 x i8> @llvm.masked.load.v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef) + %3 = sext <4 x i8> %2 to <4 x i32> + call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) + ret void +} + +define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) { +; CHECK-LABEL: foo_sext_v4i32_v4i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrhne r3, [r2] +; CHECK-NEXT: vmovne.32 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #2] +; CHECK-NEXT: vmovmi.32 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #4] +; CHECK-NEXT: vmovmi.32 q0[2], r3 +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r1, [r2, #6] +; CHECK-NEXT: vmovmi.32 q0[3], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne r2, s0 +; CHECK-NEXT: strne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s1 +; CHECK-NEXT: strmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s2 +; CHECK-NEXT: strmi r2, [r0, #8] +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r1, s3 +; CHECK-NEXT: strmi r1, [r0, #12] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %mask, align 4 + %1 = icmp sgt <4 x i32> %0, zeroinitializer + %2 = call <4 x i16> @llvm.masked.load.v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef) + %3 = sext <4 x i16> %2 to <4 x i32> + call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) + ret void +} + +define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) { +; CHECK-LABEL: foo_zext_v4i32_v4i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: vmov.i32 q1, #0xff +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrbne r3, [r2] +; CHECK-NEXT: vmovne.32 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #1] +; CHECK-NEXT: vmovmi.32 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #2] +; CHECK-NEXT: vmovmi.32 q0[2], r3 +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r1, [r2, #3] +; CHECK-NEXT: vmovmi.32 q0[3], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne r2, s0 +; CHECK-NEXT: strne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s1 +; CHECK-NEXT: strmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s2 +; CHECK-NEXT: strmi r2, [r0, #8] +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r1, s3 +; CHECK-NEXT: strmi r1, [r0, #12] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %mask, align 4 + %1 = icmp sgt <4 x i32> %0, zeroinitializer + %2 = call <4 x i8> @llvm.masked.load.v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef) + %3 = zext <4 x i8> %2 to <4 x i32> + call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) + ret void +} + +define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) { +; CHECK-LABEL: foo_zext_v4i32_v4i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrhne r3, [r2] +; CHECK-NEXT: vmovne.32 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #2] +; CHECK-NEXT: vmovmi.32 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #4] +; CHECK-NEXT: vmovmi.32 q0[2], r3 +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r1, [r2, #6] +; CHECK-NEXT: vmovmi.32 q0[3], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmovlb.u16 q0, q0 +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne r2, s0 +; CHECK-NEXT: strne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s1 +; CHECK-NEXT: strmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s2 +; CHECK-NEXT: strmi r2, [r0, #8] +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r1, s3 +; CHECK-NEXT: strmi r1, [r0, #12] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %mask, align 4 + %1 = icmp sgt <4 x i32> %0, zeroinitializer + %2 = call <4 x i16> @llvm.masked.load.v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef) + %3 = zext <4 x i16> %2 to <4 x i32> + call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) + ret void +} + +define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src) { +; CHECK-LABEL: foo_v8i16_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: add r3, sp, #8 +; CHECK-NEXT: vcmp.s16 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #8] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrhne r3, [r2] +; CHECK-NEXT: vmovne.16 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #2] +; CHECK-NEXT: vmovmi.16 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #4] +; CHECK-NEXT: vmovmi.16 q0[2], r3 +; CHECK-NEXT: lsls r3, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #6] +; CHECK-NEXT: vmovmi.16 q0[3], r3 +; CHECK-NEXT: lsls r3, r1, #27 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #8] +; CHECK-NEXT: vmovmi.16 q0[4], r3 +; CHECK-NEXT: lsls r3, r1, #26 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #10] +; CHECK-NEXT: vmovmi.16 q0[5], r3 +; CHECK-NEXT: lsls r3, r1, #25 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #12] +; CHECK-NEXT: vmovmi.16 q0[6], r3 +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r1, [r2, #14] +; CHECK-NEXT: vmovmi.16 q0[7], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne.u16 r2, q0[0] +; CHECK-NEXT: strhne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[1] +; CHECK-NEXT: strhmi r2, [r0, #2] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[2] +; CHECK-NEXT: strhmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[3] +; CHECK-NEXT: strhmi r2, [r0, #6] +; CHECK-NEXT: lsls r2, r1, #27 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[4] +; CHECK-NEXT: strhmi r2, [r0, #8] +; CHECK-NEXT: lsls r2, r1, #26 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[5] +; CHECK-NEXT: strhmi r2, [r0, #10] +; CHECK-NEXT: lsls r2, r1, #25 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[6] +; CHECK-NEXT: strhmi r2, [r0, #12] +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r1, q0[7] +; CHECK-NEXT: strhmi r1, [r0, #14] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: bx lr +entry: + %0 = load <8 x i16>, <8 x i16>* %mask, align 2 + %1 = icmp sgt <8 x i16> %0, zeroinitializer + %2 = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef) + call void @llvm.masked.store.v8i16(<8 x i16> %2, <8 x i16>* %dest, i32 2, <8 x i1> %1) + ret void +} + +define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) { +; CHECK-LABEL: foo_sext_v8i16_v8i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: add r3, sp, #8 +; CHECK-NEXT: vcmp.s16 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #8] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrbne r3, [r2] +; CHECK-NEXT: vmovne.16 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #1] +; CHECK-NEXT: vmovmi.16 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #2] +; CHECK-NEXT: vmovmi.16 q0[2], r3 +; CHECK-NEXT: lsls r3, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #3] +; CHECK-NEXT: vmovmi.16 q0[3], r3 +; CHECK-NEXT: lsls r3, r1, #27 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #4] +; CHECK-NEXT: vmovmi.16 q0[4], r3 +; CHECK-NEXT: lsls r3, r1, #26 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #5] +; CHECK-NEXT: vmovmi.16 q0[5], r3 +; CHECK-NEXT: lsls r3, r1, #25 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #6] +; CHECK-NEXT: vmovmi.16 q0[6], r3 +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r1, [r2, #7] +; CHECK-NEXT: vmovmi.16 q0[7], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne.u16 r2, q0[0] +; CHECK-NEXT: strhne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[1] +; CHECK-NEXT: strhmi r2, [r0, #2] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[2] +; CHECK-NEXT: strhmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[3] +; CHECK-NEXT: strhmi r2, [r0, #6] +; CHECK-NEXT: lsls r2, r1, #27 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[4] +; CHECK-NEXT: strhmi r2, [r0, #8] +; CHECK-NEXT: lsls r2, r1, #26 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[5] +; CHECK-NEXT: strhmi r2, [r0, #10] +; CHECK-NEXT: lsls r2, r1, #25 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[6] +; CHECK-NEXT: strhmi r2, [r0, #12] +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r1, q0[7] +; CHECK-NEXT: strhmi r1, [r0, #14] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: bx lr +entry: + %0 = load <8 x i16>, <8 x i16>* %mask, align 2 + %1 = icmp sgt <8 x i16> %0, zeroinitializer + %2 = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef) + %3 = sext <8 x i8> %2 to <8 x i16> + call void @llvm.masked.store.v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1) + ret void +} + +define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) { +; CHECK-LABEL: foo_zext_v8i16_v8i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: add r3, sp, #8 +; CHECK-NEXT: vcmp.s16 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #8] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrbne r3, [r2] +; CHECK-NEXT: vmovne.16 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #1] +; CHECK-NEXT: vmovmi.16 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #2] +; CHECK-NEXT: vmovmi.16 q0[2], r3 +; CHECK-NEXT: lsls r3, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #3] +; CHECK-NEXT: vmovmi.16 q0[3], r3 +; CHECK-NEXT: lsls r3, r1, #27 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #4] +; CHECK-NEXT: vmovmi.16 q0[4], r3 +; CHECK-NEXT: lsls r3, r1, #26 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #5] +; CHECK-NEXT: vmovmi.16 q0[5], r3 +; CHECK-NEXT: lsls r3, r1, #25 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #6] +; CHECK-NEXT: vmovmi.16 q0[6], r3 +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r1, [r2, #7] +; CHECK-NEXT: vmovmi.16 q0[7], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne.u16 r2, q0[0] +; CHECK-NEXT: strhne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[1] +; CHECK-NEXT: strhmi r2, [r0, #2] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[2] +; CHECK-NEXT: strhmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[3] +; CHECK-NEXT: strhmi r2, [r0, #6] +; CHECK-NEXT: lsls r2, r1, #27 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[4] +; CHECK-NEXT: strhmi r2, [r0, #8] +; CHECK-NEXT: lsls r2, r1, #26 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[5] +; CHECK-NEXT: strhmi r2, [r0, #10] +; CHECK-NEXT: lsls r2, r1, #25 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[6] +; CHECK-NEXT: strhmi r2, [r0, #12] +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r1, q0[7] +; CHECK-NEXT: strhmi r1, [r0, #14] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: bx lr +entry: + %0 = load <8 x i16>, <8 x i16>* %mask, align 2 + %1 = icmp sgt <8 x i16> %0, zeroinitializer + %2 = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef) + %3 = zext <8 x i8> %2 to <8 x i16> + call void @llvm.masked.store.v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1) + ret void +} + +define void @foo_v16i8_v16i8(<16 x i8> *%dest, <16 x i8> *%mask, <16 x i8> *%src) { +; CHECK-LABEL: foo_v16i8_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r6, r7, lr} +; CHECK-NEXT: push {r4, r6, r7, lr} +; CHECK-NEXT: .setfp r7, sp, #8 +; CHECK-NEXT: add r7, sp, #8 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: mov r4, sp +; CHECK-NEXT: bfc r4, #0, #4 +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: add r3, sp, #16 +; CHECK-NEXT: sub.w r4, r7, #8 +; CHECK-NEXT: vcmp.s8 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrh.w r1, [sp, #16] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrbne r3, [r2] +; CHECK-NEXT: vmovne.8 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #1] +; CHECK-NEXT: vmovmi.8 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #2] +; CHECK-NEXT: vmovmi.8 q0[2], r3 +; CHECK-NEXT: lsls r3, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #3] +; CHECK-NEXT: vmovmi.8 q0[3], r3 +; CHECK-NEXT: lsls r3, r1, #27 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #4] +; CHECK-NEXT: vmovmi.8 q0[4], r3 +; CHECK-NEXT: lsls r3, r1, #26 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #5] +; CHECK-NEXT: vmovmi.8 q0[5], r3 +; CHECK-NEXT: lsls r3, r1, #25 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #6] +; CHECK-NEXT: vmovmi.8 q0[6], r3 +; CHECK-NEXT: lsls r3, r1, #24 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #7] +; CHECK-NEXT: vmovmi.8 q0[7], r3 +; CHECK-NEXT: lsls r3, r1, #23 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #8] +; CHECK-NEXT: vmovmi.8 q0[8], r3 +; CHECK-NEXT: lsls r3, r1, #22 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #9] +; CHECK-NEXT: vmovmi.8 q0[9], r3 +; CHECK-NEXT: lsls r3, r1, #21 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #10] +; CHECK-NEXT: vmovmi.8 q0[10], r3 +; CHECK-NEXT: lsls r3, r1, #20 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #11] +; CHECK-NEXT: vmovmi.8 q0[11], r3 +; CHECK-NEXT: lsls r3, r1, #19 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #12] +; CHECK-NEXT: vmovmi.8 q0[12], r3 +; CHECK-NEXT: lsls r3, r1, #18 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #13] +; CHECK-NEXT: vmovmi.8 q0[13], r3 +; CHECK-NEXT: lsls r3, r1, #17 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r3, [r2, #14] +; CHECK-NEXT: vmovmi.8 q0[14], r3 +; CHECK-NEXT: lsls r1, r1, #16 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrbmi r1, [r2, #15] +; CHECK-NEXT: vmovmi.8 q0[15], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrh.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne.u8 r2, q0[0] +; CHECK-NEXT: strbne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[1] +; CHECK-NEXT: strbmi r2, [r0, #1] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[2] +; CHECK-NEXT: strbmi r2, [r0, #2] +; CHECK-NEXT: lsls r2, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[3] +; CHECK-NEXT: strbmi r2, [r0, #3] +; CHECK-NEXT: lsls r2, r1, #27 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[4] +; CHECK-NEXT: strbmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #26 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[5] +; CHECK-NEXT: strbmi r2, [r0, #5] +; CHECK-NEXT: lsls r2, r1, #25 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[6] +; CHECK-NEXT: strbmi r2, [r0, #6] +; CHECK-NEXT: lsls r2, r1, #24 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[7] +; CHECK-NEXT: strbmi r2, [r0, #7] +; CHECK-NEXT: lsls r2, r1, #23 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[8] +; CHECK-NEXT: strbmi r2, [r0, #8] +; CHECK-NEXT: lsls r2, r1, #22 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[9] +; CHECK-NEXT: strbmi r2, [r0, #9] +; CHECK-NEXT: lsls r2, r1, #21 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[10] +; CHECK-NEXT: strbmi r2, [r0, #10] +; CHECK-NEXT: lsls r2, r1, #20 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[11] +; CHECK-NEXT: strbmi r2, [r0, #11] +; CHECK-NEXT: lsls r2, r1, #19 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[12] +; CHECK-NEXT: strbmi r2, [r0, #12] +; CHECK-NEXT: lsls r2, r1, #18 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[13] +; CHECK-NEXT: strbmi r2, [r0, #13] +; CHECK-NEXT: lsls r2, r1, #17 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r2, q0[14] +; CHECK-NEXT: strbmi r2, [r0, #14] +; CHECK-NEXT: lsls r1, r1, #16 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u8 r1, q0[15] +; CHECK-NEXT: strbmi r1, [r0, #15] +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: pop {r4, r6, r7, pc} +entry: + %0 = load <16 x i8>, <16 x i8>* %mask, align 1 + %1 = icmp sgt <16 x i8> %0, zeroinitializer + %2 = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %src, i32 1, <16 x i1> %1, <16 x i8> undef) + call void @llvm.masked.store.v16i8(<16 x i8> %2, <16 x i8>* %dest, i32 1, <16 x i1> %1) + ret void +} + +define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *%src) { +; CHECK-LABEL: foo_trunc_v8i8_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: add r3, sp, #8 +; CHECK-NEXT: vcmp.s16 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #8] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrhne r3, [r2] +; CHECK-NEXT: vmovne.16 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #2] +; CHECK-NEXT: vmovmi.16 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #4] +; CHECK-NEXT: vmovmi.16 q0[2], r3 +; CHECK-NEXT: lsls r3, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #6] +; CHECK-NEXT: vmovmi.16 q0[3], r3 +; CHECK-NEXT: lsls r3, r1, #27 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #8] +; CHECK-NEXT: vmovmi.16 q0[4], r3 +; CHECK-NEXT: lsls r3, r1, #26 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #10] +; CHECK-NEXT: vmovmi.16 q0[5], r3 +; CHECK-NEXT: lsls r3, r1, #25 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r3, [r2, #12] +; CHECK-NEXT: vmovmi.16 q0[6], r3 +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrhmi r1, [r2, #14] +; CHECK-NEXT: vmovmi.16 q0[7], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne.u16 r2, q0[0] +; CHECK-NEXT: strbne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[1] +; CHECK-NEXT: strbmi r2, [r0, #1] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[2] +; CHECK-NEXT: strbmi r2, [r0, #2] +; CHECK-NEXT: lsls r2, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[3] +; CHECK-NEXT: strbmi r2, [r0, #3] +; CHECK-NEXT: lsls r2, r1, #27 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[4] +; CHECK-NEXT: strbmi r2, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #26 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[5] +; CHECK-NEXT: strbmi r2, [r0, #5] +; CHECK-NEXT: lsls r2, r1, #25 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r2, q0[6] +; CHECK-NEXT: strbmi r2, [r0, #6] +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi.u16 r1, q0[7] +; CHECK-NEXT: strbmi r1, [r0, #7] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: bx lr +entry: + %0 = load <8 x i16>, <8 x i16>* %mask, align 2 + %1 = icmp sgt <8 x i16> %0, zeroinitializer + %2 = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef) + %3 = trunc <8 x i16> %2 to <8 x i8> + call void @llvm.masked.store.v8i8(<8 x i8> %3, <8 x i8>* %dest, i32 1, <8 x i1> %1) + ret void +} + +define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *%src) { +; CHECK-LABEL: foo_trunc_v4i8_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrne r3, [r2] +; CHECK-NEXT: vmovne.32 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrmi r3, [r2, #4] +; CHECK-NEXT: vmovmi.32 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrmi r3, [r2, #8] +; CHECK-NEXT: vmovmi.32 q0[2], r3 +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrmi r1, [r2, #12] +; CHECK-NEXT: vmovmi.32 q0[3], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne r2, s0 +; CHECK-NEXT: strbne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s1 +; CHECK-NEXT: strbmi r2, [r0, #1] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s2 +; CHECK-NEXT: strbmi r2, [r0, #2] +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r1, s3 +; CHECK-NEXT: strbmi r1, [r0, #3] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %mask, align 4 + %1 = icmp sgt <4 x i32> %0, zeroinitializer + %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef) + %3 = trunc <4 x i32> %2 to <4 x i8> + call void @llvm.masked.store.v4i8(<4 x i8> %3, <4 x i8>* %dest, i32 1, <4 x i1> %1) + ret void +} + +define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> *%src) { +; CHECK-LABEL: foo_trunc_v4i16_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: ldrne r3, [r2] +; CHECK-NEXT: vmovne.32 q0[0], r3 +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrmi r3, [r2, #4] +; CHECK-NEXT: vmovmi.32 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrmi r3, [r2, #8] +; CHECK-NEXT: vmovmi.32 q0[2], r3 +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: ldrmi r1, [r2, #12] +; CHECK-NEXT: vmovmi.32 q0[3], r1 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: itt ne +; CHECK-NEXT: vmovne r2, s0 +; CHECK-NEXT: strhne r2, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s1 +; CHECK-NEXT: strhmi r2, [r0, #2] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r2, s2 +; CHECK-NEXT: strhmi r2, [r0, #4] +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: itt mi +; CHECK-NEXT: vmovmi r1, s3 +; CHECK-NEXT: strhmi r1, [r0, #6] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %mask, align 4 + %1 = icmp sgt <4 x i32> %0, zeroinitializer + %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef) + %3 = trunc <4 x i32> %2 to <4 x i16> + call void @llvm.masked.store.v4i16(<4 x i16> %3, <4 x i16>* %dest, i32 2, <4 x i1> %1) + ret void +} + +define void @foo_v4f32_v4f32(<4 x float> *%dest, <4 x i32> *%mask, <4 x float> *%src) { +; CHECK-LABEL: foo_v4f32_v4f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: it ne +; CHECK-NEXT: vldrne s0, [r2] +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: it mi +; CHECK-NEXT: vldrmi s1, [r2, #4] +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: it mi +; CHECK-NEXT: vldrmi s2, [r2, #8] +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: it mi +; CHECK-NEXT: vldrmi s3, [r2, #12] +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: it ne +; CHECK-NEXT: vstrne s0, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: it mi +; CHECK-NEXT: vstrmi s1, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: it mi +; CHECK-NEXT: vstrmi s2, [r0, #8] +; CHECK-NEXT: lsls r1, r1, #28 +; CHECK-NEXT: it mi +; CHECK-NEXT: vstrmi s3, [r0, #12] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %mask, align 4 + %1 = icmp sgt <4 x i32> %0, zeroinitializer + %2 = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %src, i32 4, <4 x i1> %1, <4 x float> undef) + call void @llvm.masked.store.v4f32(<4 x float> %2, <4 x float>* %dest, i32 4, <4 x i1> %1) + ret void +} + +define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%src) { +; CHECK-LABEL: foo_v8f16_v8f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: add r3, sp, #8 +; CHECK-NEXT: vcmp.s16 gt, q0, zr +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: vstr p0, [r3] +; CHECK-NEXT: ldrb.w r1, [sp, #8] +; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: bne .LBB13_18 +; CHECK-NEXT: @ %bb.1: @ %else +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: bmi .LBB13_19 +; CHECK-NEXT: .LBB13_2: @ %else2 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: bmi .LBB13_20 +; CHECK-NEXT: .LBB13_3: @ %else5 +; CHECK-NEXT: lsls r3, r1, #28 +; CHECK-NEXT: bmi .LBB13_21 +; CHECK-NEXT: .LBB13_4: @ %else8 +; CHECK-NEXT: lsls r3, r1, #27 +; CHECK-NEXT: bmi .LBB13_22 +; CHECK-NEXT: .LBB13_5: @ %else11 +; CHECK-NEXT: lsls r3, r1, #26 +; CHECK-NEXT: bmi .LBB13_23 +; CHECK-NEXT: .LBB13_6: @ %else14 +; CHECK-NEXT: lsls r3, r1, #25 +; CHECK-NEXT: bmi .LBB13_24 +; CHECK-NEXT: .LBB13_7: @ %else17 +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: bpl .LBB13_9 +; CHECK-NEXT: .LBB13_8: @ %cond.load19 +; CHECK-NEXT: vldr.16 s4, [r2, #14] +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: .LBB13_9: @ %else20 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vstr p0, [r1] +; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: bne .LBB13_25 +; CHECK-NEXT: @ %bb.10: @ %else23 +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: bmi .LBB13_26 +; CHECK-NEXT: .LBB13_11: @ %else25 +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: bmi .LBB13_27 +; CHECK-NEXT: .LBB13_12: @ %else27 +; CHECK-NEXT: lsls r2, r1, #28 +; CHECK-NEXT: bmi .LBB13_28 +; CHECK-NEXT: .LBB13_13: @ %else29 +; CHECK-NEXT: lsls r2, r1, #27 +; CHECK-NEXT: bmi .LBB13_29 +; CHECK-NEXT: .LBB13_14: @ %else31 +; CHECK-NEXT: lsls r2, r1, #26 +; CHECK-NEXT: bmi .LBB13_30 +; CHECK-NEXT: .LBB13_15: @ %else33 +; CHECK-NEXT: lsls r2, r1, #25 +; CHECK-NEXT: bmi .LBB13_31 +; CHECK-NEXT: .LBB13_16: @ %else35 +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: bmi .LBB13_32 +; CHECK-NEXT: .LBB13_17: @ %else37 +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .LBB13_18: @ %cond.load +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: lsls r3, r1, #30 +; CHECK-NEXT: bpl .LBB13_2 +; CHECK-NEXT: .LBB13_19: @ %cond.load1 +; CHECK-NEXT: vldr.16 s4, [r2, #2] +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: lsls r3, r1, #29 +; CHECK-NEXT: bpl .LBB13_3 +; CHECK-NEXT: .LBB13_20: @ %cond.load4 +; CHECK-NEXT: vldr.16 s4, [r2, #4] +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q0[2], r3 +; CHECK-NEXT: lsls r3, r1, #28 +; CHECK-NEXT: bpl .LBB13_4 +; CHECK-NEXT: .LBB13_21: @ %cond.load7 +; CHECK-NEXT: vldr.16 s4, [r2, #6] +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q0[3], r3 +; CHECK-NEXT: lsls r3, r1, #27 +; CHECK-NEXT: bpl .LBB13_5 +; CHECK-NEXT: .LBB13_22: @ %cond.load10 +; CHECK-NEXT: vldr.16 s4, [r2, #8] +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q0[4], r3 +; CHECK-NEXT: lsls r3, r1, #26 +; CHECK-NEXT: bpl .LBB13_6 +; CHECK-NEXT: .LBB13_23: @ %cond.load13 +; CHECK-NEXT: vldr.16 s4, [r2, #10] +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: lsls r3, r1, #25 +; CHECK-NEXT: bpl .LBB13_7 +; CHECK-NEXT: .LBB13_24: @ %cond.load16 +; CHECK-NEXT: vldr.16 s4, [r2, #12] +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: bmi .LBB13_8 +; CHECK-NEXT: b .LBB13_9 +; CHECK-NEXT: .LBB13_25: @ %cond.store +; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: lsls r2, r1, #30 +; CHECK-NEXT: bpl .LBB13_11 +; CHECK-NEXT: .LBB13_26: @ %cond.store24 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vstr.16 s4, [r0, #2] +; CHECK-NEXT: lsls r2, r1, #29 +; CHECK-NEXT: bpl .LBB13_12 +; CHECK-NEXT: .LBB13_27: @ %cond.store26 +; CHECK-NEXT: vstr.16 s1, [r0, #4] +; CHECK-NEXT: lsls r2, r1, #28 +; CHECK-NEXT: bpl .LBB13_13 +; CHECK-NEXT: .LBB13_28: @ %cond.store28 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vstr.16 s4, [r0, #6] +; CHECK-NEXT: lsls r2, r1, #27 +; CHECK-NEXT: bpl .LBB13_14 +; CHECK-NEXT: .LBB13_29: @ %cond.store30 +; CHECK-NEXT: vstr.16 s2, [r0, #8] +; CHECK-NEXT: lsls r2, r1, #26 +; CHECK-NEXT: bpl .LBB13_15 +; CHECK-NEXT: .LBB13_30: @ %cond.store32 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vstr.16 s4, [r0, #10] +; CHECK-NEXT: lsls r2, r1, #25 +; CHECK-NEXT: bpl .LBB13_16 +; CHECK-NEXT: .LBB13_31: @ %cond.store34 +; CHECK-NEXT: vstr.16 s3, [r0, #12] +; CHECK-NEXT: lsls r1, r1, #24 +; CHECK-NEXT: bpl .LBB13_17 +; CHECK-NEXT: .LBB13_32: @ %cond.store36 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vstr.16 s0, [r0, #14] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: bx lr +entry: + %0 = load <8 x i16>, <8 x i16>* %mask, align 2 + %1 = icmp sgt <8 x i16> %0, zeroinitializer + %2 = call <8 x half> @llvm.masked.load.v8f16(<8 x half>* %src, i32 2, <8 x i1> %1, <8 x half> undef) + call void @llvm.masked.store.v8f16(<8 x half> %2, <8 x half>* %dest, i32 2, <8 x i1> %1) + ret void +} + +declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) +declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) +declare void @llvm.masked.store.v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) +declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) +declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) +declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) +declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <8 x half> @llvm.masked.load.v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) + +declare void @llvm.masked.store.v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>) +declare void @llvm.masked.store.v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>) +declare <4 x i16> @llvm.masked.load.v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>) +declare <4 x i8> @llvm.masked.load.v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll new file mode 100644 index 0000000000000..cf04e235fdb42 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -0,0 +1,3597 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE +; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE + +define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_zero(<4 x i32> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4i32_align4_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: beq .LBB0_2 +; CHECK-LE-NEXT: @ %bb.1: @ %cond.load +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: ldr r3, [r0] +; CHECK-LE-NEXT: vdup.32 q0, r2 +; CHECK-LE-NEXT: vmov.32 q0[0], r3 +; CHECK-LE-NEXT: b .LBB0_3 +; CHECK-LE-NEXT: .LBB0_2: +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: .LBB0_3: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.32 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r0, [r0, #12] +; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4i32_align4_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: beq .LBB0_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: ldr r3, [r0] +; CHECK-BE-NEXT: vdup.32 q1, r2 +; CHECK-BE-NEXT: vmov.32 q1[0], r3 +; CHECK-BE-NEXT: b .LBB0_3 +; CHECK-BE-NEXT: .LBB0_2: +; CHECK-BE-NEXT: vmov.i32 q1, #0x0 +; CHECK-BE-NEXT: .LBB0_3: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.32 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.32 q1[2], r2 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r0, [r0, #12] +; CHECK-BE-NEXT: vmovmi.32 q1[3], r0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %dest, i32 4, <4 x i1> %c, <4 x i32> zeroinitializer) + ret <4 x i32> %l +} + +define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_undef(<4 x i32> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4i32_align4_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r2, [r0] +; CHECK-LE-NEXT: vmovne.32 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.32 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r0, [r0, #12] +; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4i32_align4_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrne r2, [r0] +; CHECK-BE-NEXT: vmovne.32 q1[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.32 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.32 q1[2], r2 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r0, [r0, #12] +; CHECK-BE-NEXT: vmovmi.32 q1[3], r0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %dest, i32 4, <4 x i1> %c, <4 x i32> undef) + ret <4 x i32> %l +} + +define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(<4 x i32> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4i32_align1_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r2, [r0] +; CHECK-LE-NEXT: vmovne.32 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.32 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r0, [r0, #12] +; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4i32_align1_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrne r2, [r0] +; CHECK-BE-NEXT: vmovne.32 q1[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.32 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.32 q1[2], r2 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r0, [r0, #12] +; CHECK-BE-NEXT: vmovmi.32 q1[3], r0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %dest, i32 1, <4 x i1> %c, <4 x i32> undef) + ret <4 x i32> %l +} + +define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_other(<4 x i32> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4i32_align4_other: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r2, [r0] +; CHECK-LE-NEXT: vmovne.32 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.32 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r0, [r0, #12] +; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4i32_align4_other: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrne r2, [r0] +; CHECK-BE-NEXT: vmovne.32 q1[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.32 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.32 q1[2], r2 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r0, [r0, #12] +; CHECK-BE-NEXT: vmovmi.32 q1[3], r0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %dest, i32 4, <4 x i1> %c, <4 x i32> %a) + ret <4 x i32> %l +} + +define arm_aapcs_vfpcc i8* @masked_v4i32_preinc(i8* %x, i8* %y, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4i32_preinc: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: lsls r3, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r3, [r0] +; CHECK-LE-NEXT: vmovne.32 q0[0], r3 +; CHECK-LE-NEXT: lsls r3, r2, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r3, [r0, #4] +; CHECK-LE-NEXT: vmovmi.32 q0[1], r3 +; CHECK-LE-NEXT: lsls r3, r2, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r3, [r0, #8] +; CHECK-LE-NEXT: vmovmi.32 q0[2], r3 +; CHECK-LE-NEXT: lsls r2, r2, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #12] +; CHECK-LE-NEXT: vmovmi.32 q0[3], r2 +; CHECK-LE-NEXT: vstrw.32 q0, [r1] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4i32_preinc: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: lsls r3, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrne r3, [r0] +; CHECK-BE-NEXT: vmovne.32 q0[0], r3 +; CHECK-BE-NEXT: lsls r3, r2, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r3, [r0, #4] +; CHECK-BE-NEXT: vmovmi.32 q0[1], r3 +; CHECK-BE-NEXT: lsls r3, r2, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r3, [r0, #8] +; CHECK-BE-NEXT: vmovmi.32 q0[2], r3 +; CHECK-BE-NEXT: lsls r2, r2, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #12] +; CHECK-BE-NEXT: vmovmi.32 q0[3], r2 +; CHECK-BE-NEXT: vstrw.32 q0, [r1] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i32>* + %c = icmp sgt <4 x i32> %a, zeroinitializer + %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef) + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret i8* %z +} + +define arm_aapcs_vfpcc i8* @masked_v4i32_postinc(i8* %x, i8* %y, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4i32_postinc: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: add.w r12, r0, #4 +; CHECK-LE-NEXT: ldrb.w r3, [sp] +; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r2, [r0] +; CHECK-LE-NEXT: vmovne.32 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r3, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.32 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r3, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r3, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r0, [r0, #12] +; CHECK-LE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-LE-NEXT: vstrw.32 q0, [r1] +; CHECK-LE-NEXT: mov r0, r12 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4i32_postinc: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: add.w r12, r0, #4 +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r3, [sp] +; CHECK-BE-NEXT: lsls r2, r3, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrne r2, [r0] +; CHECK-BE-NEXT: vmovne.32 q0[0], r2 +; CHECK-BE-NEXT: lsls r2, r3, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.32 q0[1], r2 +; CHECK-BE-NEXT: lsls r2, r3, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.32 q0[2], r2 +; CHECK-BE-NEXT: lsls r2, r3, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r0, [r0, #12] +; CHECK-BE-NEXT: vmovmi.32 q0[3], r0 +; CHECK-BE-NEXT: vstrw.32 q0, [r1] +; CHECK-BE-NEXT: mov r0, r12 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %c = icmp sgt <4 x i32> %a, zeroinitializer + %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef) + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret i8* %z +} + + + +define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_zero(<8 x i16> *%dest, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8i16_align4_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: beq .LBB6_2 +; CHECK-LE-NEXT: @ %bb.1: @ %cond.load +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: ldrh r3, [r0] +; CHECK-LE-NEXT: vdup.16 q0, r2 +; CHECK-LE-NEXT: vmov.16 q0[0], r3 +; CHECK-LE-NEXT: b .LBB6_3 +; CHECK-LE-NEXT: .LBB6_2: +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: .LBB6_3: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.16 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.16 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-LE-NEXT: vmovmi.16 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.16 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-LE-NEXT: vmovmi.16 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-LE-NEXT: vmovmi.16 q0[6], r2 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-LE-NEXT: vmovmi.16 q0[7], r0 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8i16_align4_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: beq .LBB6_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: ldrh r3, [r0] +; CHECK-BE-NEXT: vdup.16 q1, r2 +; CHECK-BE-NEXT: vmov.16 q1[0], r3 +; CHECK-BE-NEXT: b .LBB6_3 +; CHECK-BE-NEXT: .LBB6_2: +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vrev32.16 q1, q0 +; CHECK-BE-NEXT: .LBB6_3: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.16 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.16 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-BE-NEXT: vmovmi.16 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.16 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-BE-NEXT: vmovmi.16 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-BE-NEXT: vmovmi.16 q1[6], r2 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-BE-NEXT: vmovmi.16 q1[7], r0 +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i16> %a, zeroinitializer + %l = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %dest, i32 2, <8 x i1> %c, <8 x i16> zeroinitializer) + ret <8 x i16> %l +} + +define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_undef(<8 x i16> *%dest, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8i16_align4_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrhne r2, [r0] +; CHECK-LE-NEXT: vmovne.16 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.16 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.16 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-LE-NEXT: vmovmi.16 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.16 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-LE-NEXT: vmovmi.16 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-LE-NEXT: vmovmi.16 q0[6], r2 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-LE-NEXT: vmovmi.16 q0[7], r0 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8i16_align4_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrhne r2, [r0] +; CHECK-BE-NEXT: vmovne.16 q1[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.16 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.16 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-BE-NEXT: vmovmi.16 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.16 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-BE-NEXT: vmovmi.16 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-BE-NEXT: vmovmi.16 q1[6], r2 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-BE-NEXT: vmovmi.16 q1[7], r0 +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i16> %a, zeroinitializer + %l = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %dest, i32 2, <8 x i1> %c, <8 x i16> undef) + ret <8 x i16> %l +} + +define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(<8 x i16> *%dest, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8i16_align1_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrhne r2, [r0] +; CHECK-LE-NEXT: vmovne.16 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.16 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.16 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-LE-NEXT: vmovmi.16 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.16 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-LE-NEXT: vmovmi.16 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-LE-NEXT: vmovmi.16 q0[6], r2 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-LE-NEXT: vmovmi.16 q0[7], r0 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8i16_align1_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrhne r2, [r0] +; CHECK-BE-NEXT: vmovne.16 q1[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.16 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.16 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-BE-NEXT: vmovmi.16 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.16 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-BE-NEXT: vmovmi.16 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-BE-NEXT: vmovmi.16 q1[6], r2 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-BE-NEXT: vmovmi.16 q1[7], r0 +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i16> %a, zeroinitializer + %l = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %dest, i32 1, <8 x i1> %c, <8 x i16> undef) + ret <8 x i16> %l +} + +define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_other(<8 x i16> *%dest, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8i16_align4_other: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrhne r2, [r0] +; CHECK-LE-NEXT: vmovne.16 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.16 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.16 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-LE-NEXT: vmovmi.16 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.16 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-LE-NEXT: vmovmi.16 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-LE-NEXT: vmovmi.16 q0[6], r2 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-LE-NEXT: vmovmi.16 q0[7], r0 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8i16_align4_other: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrhne r2, [r0] +; CHECK-BE-NEXT: vmovne.16 q1[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.16 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.16 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-BE-NEXT: vmovmi.16 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.16 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-BE-NEXT: vmovmi.16 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-BE-NEXT: vmovmi.16 q1[6], r2 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-BE-NEXT: vmovmi.16 q1[7], r0 +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i16> %a, zeroinitializer + %l = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %dest, i32 2, <8 x i1> %c, <8 x i16> %a) + ret <8 x i16> %l +} + +define i8* @masked_v8i16_preinc(i8* %x, i8* %y, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8i16_preinc: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vldr d1, [sp, #8] +; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: lsls r3, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrhne r3, [r0] +; CHECK-LE-NEXT: vmovne.16 q0[0], r3 +; CHECK-LE-NEXT: lsls r3, r2, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r3, [r0, #2] +; CHECK-LE-NEXT: vmovmi.16 q0[1], r3 +; CHECK-LE-NEXT: lsls r3, r2, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r3, [r0, #4] +; CHECK-LE-NEXT: vmovmi.16 q0[2], r3 +; CHECK-LE-NEXT: lsls r3, r2, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r3, [r0, #6] +; CHECK-LE-NEXT: vmovmi.16 q0[3], r3 +; CHECK-LE-NEXT: lsls r3, r2, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r3, [r0, #8] +; CHECK-LE-NEXT: vmovmi.16 q0[4], r3 +; CHECK-LE-NEXT: lsls r3, r2, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r3, [r0, #10] +; CHECK-LE-NEXT: vmovmi.16 q0[5], r3 +; CHECK-LE-NEXT: lsls r3, r2, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r3, [r0, #12] +; CHECK-LE-NEXT: vmovmi.16 q0[6], r3 +; CHECK-LE-NEXT: lsls r2, r2, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #14] +; CHECK-LE-NEXT: vmovmi.16 q0[7], r2 +; CHECK-LE-NEXT: vstrw.32 q0, [r1] +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8i16_preinc: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vldr d1, [sp, #8] +; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: lsls r3, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrhne r3, [r0] +; CHECK-BE-NEXT: vmovne.16 q0[0], r3 +; CHECK-BE-NEXT: lsls r3, r2, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r3, [r0, #2] +; CHECK-BE-NEXT: vmovmi.16 q0[1], r3 +; CHECK-BE-NEXT: lsls r3, r2, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r3, [r0, #4] +; CHECK-BE-NEXT: vmovmi.16 q0[2], r3 +; CHECK-BE-NEXT: lsls r3, r2, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r3, [r0, #6] +; CHECK-BE-NEXT: vmovmi.16 q0[3], r3 +; CHECK-BE-NEXT: lsls r3, r2, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r3, [r0, #8] +; CHECK-BE-NEXT: vmovmi.16 q0[4], r3 +; CHECK-BE-NEXT: lsls r3, r2, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r3, [r0, #10] +; CHECK-BE-NEXT: vmovmi.16 q0[5], r3 +; CHECK-BE-NEXT: lsls r3, r2, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r3, [r0, #12] +; CHECK-BE-NEXT: vmovmi.16 q0[6], r3 +; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #14] +; CHECK-BE-NEXT: vmovmi.16 q0[7], r2 +; CHECK-BE-NEXT: vstrh.16 q0, [r1] +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i16>* + %c = icmp sgt <8 x i16> %a, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 4, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 4 + ret i8* %z +} + +define arm_aapcs_vfpcc i8* @masked_v8i16_postinc(i8* %x, i8* %y, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8i16_postinc: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: add.w r12, r0, #4 +; CHECK-LE-NEXT: ldrb.w r3, [sp] +; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrhne r2, [r0] +; CHECK-LE-NEXT: vmovne.16 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r3, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.16 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r3, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.16 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r3, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-LE-NEXT: vmovmi.16 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r3, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.16 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r3, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-LE-NEXT: vmovmi.16 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r3, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-LE-NEXT: vmovmi.16 q0[6], r2 +; CHECK-LE-NEXT: lsls r2, r3, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-LE-NEXT: vmovmi.16 q0[7], r0 +; CHECK-LE-NEXT: vstrw.32 q0, [r1] +; CHECK-LE-NEXT: mov r0, r12 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8i16_postinc: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: add.w r12, r0, #4 +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r3, [sp] +; CHECK-BE-NEXT: lsls r2, r3, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrhne r2, [r0] +; CHECK-BE-NEXT: vmovne.16 q0[0], r2 +; CHECK-BE-NEXT: lsls r2, r3, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.16 q0[1], r2 +; CHECK-BE-NEXT: lsls r2, r3, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.16 q0[2], r2 +; CHECK-BE-NEXT: lsls r2, r3, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #6] +; CHECK-BE-NEXT: vmovmi.16 q0[3], r2 +; CHECK-BE-NEXT: lsls r2, r3, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.16 q0[4], r2 +; CHECK-BE-NEXT: lsls r2, r3, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #10] +; CHECK-BE-NEXT: vmovmi.16 q0[5], r2 +; CHECK-BE-NEXT: lsls r2, r3, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r2, [r0, #12] +; CHECK-BE-NEXT: vmovmi.16 q0[6], r2 +; CHECK-BE-NEXT: lsls r2, r3, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrhmi r0, [r0, #14] +; CHECK-BE-NEXT: vmovmi.16 q0[7], r0 +; CHECK-BE-NEXT: vstrh.16 q0, [r1] +; CHECK-BE-NEXT: mov r0, r12 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %c = icmp sgt <8 x i16> %a, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 4, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 4 + ret i8* %z +} + + +define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_zero(<16 x i8> *%dest, <16 x i8> %a) { +; CHECK-LE-LABEL: masked_v16i8_align4_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r6, r7, lr} +; CHECK-LE-NEXT: push {r4, r6, r7, lr} +; CHECK-LE-NEXT: .setfp r7, sp, #8 +; CHECK-LE-NEXT: add r7, sp, #8 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 +; CHECK-LE-NEXT: mov r4, sp +; CHECK-LE-NEXT: bfc r4, #0, #4 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrh.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: beq .LBB12_2 +; CHECK-LE-NEXT: @ %bb.1: @ %cond.load +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: ldrb r3, [r0] +; CHECK-LE-NEXT: vdup.8 q0, r2 +; CHECK-LE-NEXT: vmov.8 q0[0], r3 +; CHECK-LE-NEXT: b .LBB12_3 +; CHECK-LE-NEXT: .LBB12_2: +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: .LBB12_3: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #1] +; CHECK-LE-NEXT: vmovmi.8 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.8 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #3] +; CHECK-LE-NEXT: vmovmi.8 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.8 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #5] +; CHECK-LE-NEXT: vmovmi.8 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #6] +; CHECK-LE-NEXT: vmovmi.8 q0[6], r2 +; CHECK-LE-NEXT: lsls r2, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #7] +; CHECK-LE-NEXT: vmovmi.8 q0[7], r2 +; CHECK-LE-NEXT: lsls r2, r1, #23 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.8 q0[8], r2 +; CHECK-LE-NEXT: lsls r2, r1, #22 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #9] +; CHECK-LE-NEXT: vmovmi.8 q0[9], r2 +; CHECK-LE-NEXT: lsls r2, r1, #21 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #10] +; CHECK-LE-NEXT: vmovmi.8 q0[10], r2 +; CHECK-LE-NEXT: lsls r2, r1, #20 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #11] +; CHECK-LE-NEXT: vmovmi.8 q0[11], r2 +; CHECK-LE-NEXT: lsls r2, r1, #19 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #12] +; CHECK-LE-NEXT: vmovmi.8 q0[12], r2 +; CHECK-LE-NEXT: lsls r2, r1, #18 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #13] +; CHECK-LE-NEXT: vmovmi.8 q0[13], r2 +; CHECK-LE-NEXT: sub.w r4, r7, #8 +; CHECK-LE-NEXT: lsls r2, r1, #17 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #14] +; CHECK-LE-NEXT: vmovmi.8 q0[14], r2 +; CHECK-LE-NEXT: lsls r1, r1, #16 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r0, [r0, #15] +; CHECK-LE-NEXT: vmovmi.8 q0[15], r0 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; +; CHECK-BE-LABEL: masked_v16i8_align4_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r6, r7, lr} +; CHECK-BE-NEXT: push {r4, r6, r7, lr} +; CHECK-BE-NEXT: .setfp r7, sp, #8 +; CHECK-BE-NEXT: add r7, sp, #8 +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 +; CHECK-BE-NEXT: mov r4, sp +; CHECK-BE-NEXT: bfc r4, #0, #4 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrh.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: beq .LBB12_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: ldrb r3, [r0] +; CHECK-BE-NEXT: vdup.8 q1, r2 +; CHECK-BE-NEXT: vmov.8 q1[0], r3 +; CHECK-BE-NEXT: b .LBB12_3 +; CHECK-BE-NEXT: .LBB12_2: +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vrev32.8 q1, q0 +; CHECK-BE-NEXT: .LBB12_3: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #1] +; CHECK-BE-NEXT: vmovmi.8 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.8 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #3] +; CHECK-BE-NEXT: vmovmi.8 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.8 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #5] +; CHECK-BE-NEXT: vmovmi.8 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #6] +; CHECK-BE-NEXT: vmovmi.8 q1[6], r2 +; CHECK-BE-NEXT: lsls r2, r1, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #7] +; CHECK-BE-NEXT: vmovmi.8 q1[7], r2 +; CHECK-BE-NEXT: lsls r2, r1, #23 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.8 q1[8], r2 +; CHECK-BE-NEXT: lsls r2, r1, #22 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #9] +; CHECK-BE-NEXT: vmovmi.8 q1[9], r2 +; CHECK-BE-NEXT: lsls r2, r1, #21 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #10] +; CHECK-BE-NEXT: vmovmi.8 q1[10], r2 +; CHECK-BE-NEXT: lsls r2, r1, #20 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #11] +; CHECK-BE-NEXT: vmovmi.8 q1[11], r2 +; CHECK-BE-NEXT: lsls r2, r1, #19 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #12] +; CHECK-BE-NEXT: vmovmi.8 q1[12], r2 +; CHECK-BE-NEXT: lsls r2, r1, #18 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #13] +; CHECK-BE-NEXT: vmovmi.8 q1[13], r2 +; CHECK-BE-NEXT: sub.w r4, r7, #8 +; CHECK-BE-NEXT: lsls r2, r1, #17 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #14] +; CHECK-BE-NEXT: vmovmi.8 q1[14], r2 +; CHECK-BE-NEXT: lsls r1, r1, #16 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r0, [r0, #15] +; CHECK-BE-NEXT: vmovmi.8 q1[15], r0 +; CHECK-BE-NEXT: vrev64.8 q0, q1 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +entry: + %c = icmp sgt <16 x i8> %a, zeroinitializer + %l = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %dest, i32 1, <16 x i1> %c, <16 x i8> zeroinitializer) + ret <16 x i8> %l +} + +define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_undef(<16 x i8> *%dest, <16 x i8> %a) { +; CHECK-LE-LABEL: masked_v16i8_align4_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r6, r7, lr} +; CHECK-LE-NEXT: push {r4, r6, r7, lr} +; CHECK-LE-NEXT: .setfp r7, sp, #8 +; CHECK-LE-NEXT: add r7, sp, #8 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 +; CHECK-LE-NEXT: mov r4, sp +; CHECK-LE-NEXT: bfc r4, #0, #4 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: sub.w r4, r7, #8 +; CHECK-LE-NEXT: ldrh.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrbne r2, [r0] +; CHECK-LE-NEXT: vmovne.8 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #1] +; CHECK-LE-NEXT: vmovmi.8 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.8 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #3] +; CHECK-LE-NEXT: vmovmi.8 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.8 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #5] +; CHECK-LE-NEXT: vmovmi.8 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #6] +; CHECK-LE-NEXT: vmovmi.8 q0[6], r2 +; CHECK-LE-NEXT: lsls r2, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #7] +; CHECK-LE-NEXT: vmovmi.8 q0[7], r2 +; CHECK-LE-NEXT: lsls r2, r1, #23 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.8 q0[8], r2 +; CHECK-LE-NEXT: lsls r2, r1, #22 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #9] +; CHECK-LE-NEXT: vmovmi.8 q0[9], r2 +; CHECK-LE-NEXT: lsls r2, r1, #21 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #10] +; CHECK-LE-NEXT: vmovmi.8 q0[10], r2 +; CHECK-LE-NEXT: lsls r2, r1, #20 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #11] +; CHECK-LE-NEXT: vmovmi.8 q0[11], r2 +; CHECK-LE-NEXT: lsls r2, r1, #19 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #12] +; CHECK-LE-NEXT: vmovmi.8 q0[12], r2 +; CHECK-LE-NEXT: lsls r2, r1, #18 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #13] +; CHECK-LE-NEXT: vmovmi.8 q0[13], r2 +; CHECK-LE-NEXT: lsls r2, r1, #17 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #14] +; CHECK-LE-NEXT: vmovmi.8 q0[14], r2 +; CHECK-LE-NEXT: lsls r1, r1, #16 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r0, [r0, #15] +; CHECK-LE-NEXT: vmovmi.8 q0[15], r0 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; +; CHECK-BE-LABEL: masked_v16i8_align4_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r6, r7, lr} +; CHECK-BE-NEXT: push {r4, r6, r7, lr} +; CHECK-BE-NEXT: .setfp r7, sp, #8 +; CHECK-BE-NEXT: add r7, sp, #8 +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 +; CHECK-BE-NEXT: mov r4, sp +; CHECK-BE-NEXT: bfc r4, #0, #4 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: sub.w r4, r7, #8 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrh.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrbne r2, [r0] +; CHECK-BE-NEXT: vmovne.8 q1[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #1] +; CHECK-BE-NEXT: vmovmi.8 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.8 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #3] +; CHECK-BE-NEXT: vmovmi.8 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.8 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #5] +; CHECK-BE-NEXT: vmovmi.8 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #6] +; CHECK-BE-NEXT: vmovmi.8 q1[6], r2 +; CHECK-BE-NEXT: lsls r2, r1, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #7] +; CHECK-BE-NEXT: vmovmi.8 q1[7], r2 +; CHECK-BE-NEXT: lsls r2, r1, #23 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.8 q1[8], r2 +; CHECK-BE-NEXT: lsls r2, r1, #22 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #9] +; CHECK-BE-NEXT: vmovmi.8 q1[9], r2 +; CHECK-BE-NEXT: lsls r2, r1, #21 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #10] +; CHECK-BE-NEXT: vmovmi.8 q1[10], r2 +; CHECK-BE-NEXT: lsls r2, r1, #20 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #11] +; CHECK-BE-NEXT: vmovmi.8 q1[11], r2 +; CHECK-BE-NEXT: lsls r2, r1, #19 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #12] +; CHECK-BE-NEXT: vmovmi.8 q1[12], r2 +; CHECK-BE-NEXT: lsls r2, r1, #18 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #13] +; CHECK-BE-NEXT: vmovmi.8 q1[13], r2 +; CHECK-BE-NEXT: lsls r2, r1, #17 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #14] +; CHECK-BE-NEXT: vmovmi.8 q1[14], r2 +; CHECK-BE-NEXT: lsls r1, r1, #16 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r0, [r0, #15] +; CHECK-BE-NEXT: vmovmi.8 q1[15], r0 +; CHECK-BE-NEXT: vrev64.8 q0, q1 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +entry: + %c = icmp sgt <16 x i8> %a, zeroinitializer + %l = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %dest, i32 1, <16 x i1> %c, <16 x i8> undef) + ret <16 x i8> %l +} + +define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_other(<16 x i8> *%dest, <16 x i8> %a) { +; CHECK-LE-LABEL: masked_v16i8_align4_other: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r6, r7, lr} +; CHECK-LE-NEXT: push {r4, r6, r7, lr} +; CHECK-LE-NEXT: .setfp r7, sp, #8 +; CHECK-LE-NEXT: add r7, sp, #8 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 +; CHECK-LE-NEXT: mov r4, sp +; CHECK-LE-NEXT: bfc r4, #0, #4 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: sub.w r4, r7, #8 +; CHECK-LE-NEXT: ldrh.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrbne r2, [r0] +; CHECK-LE-NEXT: vmovne.8 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #1] +; CHECK-LE-NEXT: vmovmi.8 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.8 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #3] +; CHECK-LE-NEXT: vmovmi.8 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.8 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #5] +; CHECK-LE-NEXT: vmovmi.8 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #6] +; CHECK-LE-NEXT: vmovmi.8 q0[6], r2 +; CHECK-LE-NEXT: lsls r2, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #7] +; CHECK-LE-NEXT: vmovmi.8 q0[7], r2 +; CHECK-LE-NEXT: lsls r2, r1, #23 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.8 q0[8], r2 +; CHECK-LE-NEXT: lsls r2, r1, #22 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #9] +; CHECK-LE-NEXT: vmovmi.8 q0[9], r2 +; CHECK-LE-NEXT: lsls r2, r1, #21 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #10] +; CHECK-LE-NEXT: vmovmi.8 q0[10], r2 +; CHECK-LE-NEXT: lsls r2, r1, #20 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #11] +; CHECK-LE-NEXT: vmovmi.8 q0[11], r2 +; CHECK-LE-NEXT: lsls r2, r1, #19 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #12] +; CHECK-LE-NEXT: vmovmi.8 q0[12], r2 +; CHECK-LE-NEXT: lsls r2, r1, #18 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #13] +; CHECK-LE-NEXT: vmovmi.8 q0[13], r2 +; CHECK-LE-NEXT: lsls r2, r1, #17 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #14] +; CHECK-LE-NEXT: vmovmi.8 q0[14], r2 +; CHECK-LE-NEXT: lsls r1, r1, #16 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r0, [r0, #15] +; CHECK-LE-NEXT: vmovmi.8 q0[15], r0 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; +; CHECK-BE-LABEL: masked_v16i8_align4_other: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r6, r7, lr} +; CHECK-BE-NEXT: push {r4, r6, r7, lr} +; CHECK-BE-NEXT: .setfp r7, sp, #8 +; CHECK-BE-NEXT: add r7, sp, #8 +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 +; CHECK-BE-NEXT: mov r4, sp +; CHECK-BE-NEXT: bfc r4, #0, #4 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: sub.w r4, r7, #8 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrh.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrbne r2, [r0] +; CHECK-BE-NEXT: vmovne.8 q1[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #1] +; CHECK-BE-NEXT: vmovmi.8 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.8 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #3] +; CHECK-BE-NEXT: vmovmi.8 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.8 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #5] +; CHECK-BE-NEXT: vmovmi.8 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #6] +; CHECK-BE-NEXT: vmovmi.8 q1[6], r2 +; CHECK-BE-NEXT: lsls r2, r1, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #7] +; CHECK-BE-NEXT: vmovmi.8 q1[7], r2 +; CHECK-BE-NEXT: lsls r2, r1, #23 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.8 q1[8], r2 +; CHECK-BE-NEXT: lsls r2, r1, #22 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #9] +; CHECK-BE-NEXT: vmovmi.8 q1[9], r2 +; CHECK-BE-NEXT: lsls r2, r1, #21 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #10] +; CHECK-BE-NEXT: vmovmi.8 q1[10], r2 +; CHECK-BE-NEXT: lsls r2, r1, #20 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #11] +; CHECK-BE-NEXT: vmovmi.8 q1[11], r2 +; CHECK-BE-NEXT: lsls r2, r1, #19 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #12] +; CHECK-BE-NEXT: vmovmi.8 q1[12], r2 +; CHECK-BE-NEXT: lsls r2, r1, #18 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #13] +; CHECK-BE-NEXT: vmovmi.8 q1[13], r2 +; CHECK-BE-NEXT: lsls r2, r1, #17 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #14] +; CHECK-BE-NEXT: vmovmi.8 q1[14], r2 +; CHECK-BE-NEXT: lsls r1, r1, #16 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r0, [r0, #15] +; CHECK-BE-NEXT: vmovmi.8 q1[15], r0 +; CHECK-BE-NEXT: vrev64.8 q0, q1 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +entry: + %c = icmp sgt <16 x i8> %a, zeroinitializer + %l = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %dest, i32 1, <16 x i1> %c, <16 x i8> %a) + ret <16 x i8> %l +} + +define arm_aapcs_vfpcc i8* @masked_v16i8_preinc(i8* %x, i8* %y, <16 x i8> %a) { +; CHECK-LE-LABEL: masked_v16i8_preinc: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r6, r7, lr} +; CHECK-LE-NEXT: push {r4, r6, r7, lr} +; CHECK-LE-NEXT: .setfp r7, sp, #8 +; CHECK-LE-NEXT: add r7, sp, #8 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 +; CHECK-LE-NEXT: mov r4, sp +; CHECK-LE-NEXT: bfc r4, #0, #4 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: ldrh.w r2, [sp] +; CHECK-LE-NEXT: sub.w r4, r7, #8 +; CHECK-LE-NEXT: lsls r3, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrbne r3, [r0] +; CHECK-LE-NEXT: vmovne.8 q0[0], r3 +; CHECK-LE-NEXT: lsls r3, r2, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #1] +; CHECK-LE-NEXT: vmovmi.8 q0[1], r3 +; CHECK-LE-NEXT: lsls r3, r2, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #2] +; CHECK-LE-NEXT: vmovmi.8 q0[2], r3 +; CHECK-LE-NEXT: lsls r3, r2, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #3] +; CHECK-LE-NEXT: vmovmi.8 q0[3], r3 +; CHECK-LE-NEXT: lsls r3, r2, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #4] +; CHECK-LE-NEXT: vmovmi.8 q0[4], r3 +; CHECK-LE-NEXT: lsls r3, r2, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #5] +; CHECK-LE-NEXT: vmovmi.8 q0[5], r3 +; CHECK-LE-NEXT: lsls r3, r2, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #6] +; CHECK-LE-NEXT: vmovmi.8 q0[6], r3 +; CHECK-LE-NEXT: lsls r3, r2, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #7] +; CHECK-LE-NEXT: vmovmi.8 q0[7], r3 +; CHECK-LE-NEXT: lsls r3, r2, #23 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #8] +; CHECK-LE-NEXT: vmovmi.8 q0[8], r3 +; CHECK-LE-NEXT: lsls r3, r2, #22 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #9] +; CHECK-LE-NEXT: vmovmi.8 q0[9], r3 +; CHECK-LE-NEXT: lsls r3, r2, #21 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #10] +; CHECK-LE-NEXT: vmovmi.8 q0[10], r3 +; CHECK-LE-NEXT: lsls r3, r2, #20 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #11] +; CHECK-LE-NEXT: vmovmi.8 q0[11], r3 +; CHECK-LE-NEXT: lsls r3, r2, #19 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #12] +; CHECK-LE-NEXT: vmovmi.8 q0[12], r3 +; CHECK-LE-NEXT: lsls r3, r2, #18 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #13] +; CHECK-LE-NEXT: vmovmi.8 q0[13], r3 +; CHECK-LE-NEXT: lsls r3, r2, #17 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r3, [r0, #14] +; CHECK-LE-NEXT: vmovmi.8 q0[14], r3 +; CHECK-LE-NEXT: lsls r2, r2, #16 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #15] +; CHECK-LE-NEXT: vmovmi.8 q0[15], r2 +; CHECK-LE-NEXT: vstrw.32 q0, [r1] +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; +; CHECK-BE-LABEL: masked_v16i8_preinc: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r6, r7, lr} +; CHECK-BE-NEXT: push {r4, r6, r7, lr} +; CHECK-BE-NEXT: .setfp r7, sp, #8 +; CHECK-BE-NEXT: add r7, sp, #8 +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 +; CHECK-BE-NEXT: mov r4, sp +; CHECK-BE-NEXT: bfc r4, #0, #4 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: sub.w r4, r7, #8 +; CHECK-BE-NEXT: ldrh.w r2, [sp] +; CHECK-BE-NEXT: lsls r3, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrbne r3, [r0] +; CHECK-BE-NEXT: vmovne.8 q0[0], r3 +; CHECK-BE-NEXT: lsls r3, r2, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #1] +; CHECK-BE-NEXT: vmovmi.8 q0[1], r3 +; CHECK-BE-NEXT: lsls r3, r2, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #2] +; CHECK-BE-NEXT: vmovmi.8 q0[2], r3 +; CHECK-BE-NEXT: lsls r3, r2, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #3] +; CHECK-BE-NEXT: vmovmi.8 q0[3], r3 +; CHECK-BE-NEXT: lsls r3, r2, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #4] +; CHECK-BE-NEXT: vmovmi.8 q0[4], r3 +; CHECK-BE-NEXT: lsls r3, r2, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #5] +; CHECK-BE-NEXT: vmovmi.8 q0[5], r3 +; CHECK-BE-NEXT: lsls r3, r2, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #6] +; CHECK-BE-NEXT: vmovmi.8 q0[6], r3 +; CHECK-BE-NEXT: lsls r3, r2, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #7] +; CHECK-BE-NEXT: vmovmi.8 q0[7], r3 +; CHECK-BE-NEXT: lsls r3, r2, #23 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #8] +; CHECK-BE-NEXT: vmovmi.8 q0[8], r3 +; CHECK-BE-NEXT: lsls r3, r2, #22 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #9] +; CHECK-BE-NEXT: vmovmi.8 q0[9], r3 +; CHECK-BE-NEXT: lsls r3, r2, #21 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #10] +; CHECK-BE-NEXT: vmovmi.8 q0[10], r3 +; CHECK-BE-NEXT: lsls r3, r2, #20 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #11] +; CHECK-BE-NEXT: vmovmi.8 q0[11], r3 +; CHECK-BE-NEXT: lsls r3, r2, #19 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #12] +; CHECK-BE-NEXT: vmovmi.8 q0[12], r3 +; CHECK-BE-NEXT: lsls r3, r2, #18 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #13] +; CHECK-BE-NEXT: vmovmi.8 q0[13], r3 +; CHECK-BE-NEXT: lsls r3, r2, #17 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r3, [r0, #14] +; CHECK-BE-NEXT: vmovmi.8 q0[14], r3 +; CHECK-BE-NEXT: lsls r2, r2, #16 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #15] +; CHECK-BE-NEXT: vmovmi.8 q0[15], r2 +; CHECK-BE-NEXT: vstrb.8 q0, [r1] +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <16 x i8>* + %c = icmp sgt <16 x i8> %a, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 4, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 4 + ret i8* %z +} + +define arm_aapcs_vfpcc i8* @masked_v16i8_postinc(i8* %x, i8* %y, <16 x i8> %a) { +; CHECK-LE-LABEL: masked_v16i8_postinc: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r6, r7, lr} +; CHECK-LE-NEXT: push {r4, r6, r7, lr} +; CHECK-LE-NEXT: .setfp r7, sp, #8 +; CHECK-LE-NEXT: add r7, sp, #8 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 +; CHECK-LE-NEXT: mov r4, sp +; CHECK-LE-NEXT: bfc r4, #0, #4 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: sub.w r4, r7, #8 +; CHECK-LE-NEXT: ldrh.w r3, [sp] +; CHECK-LE-NEXT: add.w r12, r0, #4 +; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrbne r2, [r0] +; CHECK-LE-NEXT: vmovne.8 q0[0], r2 +; CHECK-LE-NEXT: lsls r2, r3, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #1] +; CHECK-LE-NEXT: vmovmi.8 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r3, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #2] +; CHECK-LE-NEXT: vmovmi.8 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r3, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #3] +; CHECK-LE-NEXT: vmovmi.8 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r3, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi.8 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r3, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #5] +; CHECK-LE-NEXT: vmovmi.8 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r3, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #6] +; CHECK-LE-NEXT: vmovmi.8 q0[6], r2 +; CHECK-LE-NEXT: lsls r2, r3, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #7] +; CHECK-LE-NEXT: vmovmi.8 q0[7], r2 +; CHECK-LE-NEXT: lsls r2, r3, #23 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi.8 q0[8], r2 +; CHECK-LE-NEXT: lsls r2, r3, #22 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #9] +; CHECK-LE-NEXT: vmovmi.8 q0[9], r2 +; CHECK-LE-NEXT: lsls r2, r3, #21 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #10] +; CHECK-LE-NEXT: vmovmi.8 q0[10], r2 +; CHECK-LE-NEXT: lsls r2, r3, #20 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #11] +; CHECK-LE-NEXT: vmovmi.8 q0[11], r2 +; CHECK-LE-NEXT: lsls r2, r3, #19 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #12] +; CHECK-LE-NEXT: vmovmi.8 q0[12], r2 +; CHECK-LE-NEXT: lsls r2, r3, #18 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #13] +; CHECK-LE-NEXT: vmovmi.8 q0[13], r2 +; CHECK-LE-NEXT: lsls r2, r3, #17 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r2, [r0, #14] +; CHECK-LE-NEXT: vmovmi.8 q0[14], r2 +; CHECK-LE-NEXT: lsls r2, r3, #16 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrbmi r0, [r0, #15] +; CHECK-LE-NEXT: vmovmi.8 q0[15], r0 +; CHECK-LE-NEXT: vstrw.32 q0, [r1] +; CHECK-LE-NEXT: mov r0, r12 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; +; CHECK-BE-LABEL: masked_v16i8_postinc: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r6, r7, lr} +; CHECK-BE-NEXT: push {r4, r6, r7, lr} +; CHECK-BE-NEXT: .setfp r7, sp, #8 +; CHECK-BE-NEXT: add r7, sp, #8 +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 +; CHECK-BE-NEXT: mov r4, sp +; CHECK-BE-NEXT: bfc r4, #0, #4 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: sub.w r4, r7, #8 +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: add.w r12, r0, #4 +; CHECK-BE-NEXT: ldrh.w r3, [sp] +; CHECK-BE-NEXT: lsls r2, r3, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrbne r2, [r0] +; CHECK-BE-NEXT: vmovne.8 q0[0], r2 +; CHECK-BE-NEXT: lsls r2, r3, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #1] +; CHECK-BE-NEXT: vmovmi.8 q0[1], r2 +; CHECK-BE-NEXT: lsls r2, r3, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #2] +; CHECK-BE-NEXT: vmovmi.8 q0[2], r2 +; CHECK-BE-NEXT: lsls r2, r3, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #3] +; CHECK-BE-NEXT: vmovmi.8 q0[3], r2 +; CHECK-BE-NEXT: lsls r2, r3, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi.8 q0[4], r2 +; CHECK-BE-NEXT: lsls r2, r3, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #5] +; CHECK-BE-NEXT: vmovmi.8 q0[5], r2 +; CHECK-BE-NEXT: lsls r2, r3, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #6] +; CHECK-BE-NEXT: vmovmi.8 q0[6], r2 +; CHECK-BE-NEXT: lsls r2, r3, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #7] +; CHECK-BE-NEXT: vmovmi.8 q0[7], r2 +; CHECK-BE-NEXT: lsls r2, r3, #23 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi.8 q0[8], r2 +; CHECK-BE-NEXT: lsls r2, r3, #22 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #9] +; CHECK-BE-NEXT: vmovmi.8 q0[9], r2 +; CHECK-BE-NEXT: lsls r2, r3, #21 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #10] +; CHECK-BE-NEXT: vmovmi.8 q0[10], r2 +; CHECK-BE-NEXT: lsls r2, r3, #20 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #11] +; CHECK-BE-NEXT: vmovmi.8 q0[11], r2 +; CHECK-BE-NEXT: lsls r2, r3, #19 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #12] +; CHECK-BE-NEXT: vmovmi.8 q0[12], r2 +; CHECK-BE-NEXT: lsls r2, r3, #18 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #13] +; CHECK-BE-NEXT: vmovmi.8 q0[13], r2 +; CHECK-BE-NEXT: lsls r2, r3, #17 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r2, [r0, #14] +; CHECK-BE-NEXT: vmovmi.8 q0[14], r2 +; CHECK-BE-NEXT: lsls r2, r3, #16 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrbmi r0, [r0, #15] +; CHECK-BE-NEXT: vmovmi.8 q0[15], r0 +; CHECK-BE-NEXT: vstrb.8 q0, [r1] +; CHECK-BE-NEXT: mov r0, r12 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %c = icmp sgt <16 x i8> %a, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 4, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 4 + ret i8* %z +} + + +define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_zero(<4 x float> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4f32_align4_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: beq .LBB17_2 +; CHECK-LE-NEXT: @ %bb.1: @ %cond.load +; CHECK-LE-NEXT: vldr s0, .LCPI17_0 +; CHECK-LE-NEXT: vldr s4, [r0] +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vdup.32 q0, r2 +; CHECK-LE-NEXT: vmov.f32 s0, s4 +; CHECK-LE-NEXT: b .LBB17_3 +; CHECK-LE-NEXT: .LBB17_2: +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: .LBB17_3: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s1, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s2, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s3, [r0, #12] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .p2align 2 +; CHECK-LE-NEXT: @ %bb.4: +; CHECK-LE-NEXT: .LCPI17_0: +; CHECK-LE-NEXT: .long 0 @ float 0 +; +; CHECK-BE-LABEL: masked_v4f32_align4_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: beq .LBB17_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: vldr s0, .LCPI17_0 +; CHECK-BE-NEXT: vldr s2, [r0] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vdup.32 q1, r2 +; CHECK-BE-NEXT: vmov.f32 s4, s2 +; CHECK-BE-NEXT: b .LBB17_3 +; CHECK-BE-NEXT: .LBB17_2: +; CHECK-BE-NEXT: vmov.i32 q1, #0x0 +; CHECK-BE-NEXT: .LBB17_3: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s5, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s6, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s7, [r0, #12] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .p2align 2 +; CHECK-BE-NEXT: @ %bb.4: +; CHECK-BE-NEXT: .LCPI17_0: +; CHECK-BE-NEXT: .long 0 @ float 0 +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %dest, i32 4, <4 x i1> %c, <4 x float> zeroinitializer) + ret <4 x float> %l +} + +define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_undef(<4 x float> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4f32_align4_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vldrne s0, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s1, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s2, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s3, [r0, #12] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f32_align4_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: vldrne s4, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s5, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s6, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s7, [r0, #12] +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %dest, i32 4, <4 x i1> %c, <4 x float> undef) + ret <4 x float> %l +} + +define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(<4 x float> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4f32_align1_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: ldrne r2, [r0] +; CHECK-LE-NEXT: vmovne s0, r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #4] +; CHECK-LE-NEXT: vmovmi s1, r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r2, [r0, #8] +; CHECK-LE-NEXT: vmovmi s2, r2 +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: ldrmi r0, [r0, #12] +; CHECK-LE-NEXT: vmovmi s3, r0 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f32_align1_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: ldrne r2, [r0] +; CHECK-BE-NEXT: vmovne s4, r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #4] +; CHECK-BE-NEXT: vmovmi s5, r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r2, [r0, #8] +; CHECK-BE-NEXT: vmovmi s6, r2 +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: ldrmi r0, [r0, #12] +; CHECK-BE-NEXT: vmovmi s7, r0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %dest, i32 1, <4 x i1> %c, <4 x float> undef) + ret <4 x float> %l +} + +define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_other(<4 x float> *%dest, <4 x i32> %a, <4 x float> %b) { +; CHECK-LE-LABEL: masked_v4f32_align4_other: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vldrne s4, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s5, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s6, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s7, [r0, #12] +; CHECK-LE-NEXT: vmov q0, q1 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f32_align4_other: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q2, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q2, zr +; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: vldrne s8, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s9, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s10, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s11, [r0, #12] +; CHECK-BE-NEXT: vrev64.32 q0, q2 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + %l = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %dest, i32 4, <4 x i1> %c, <4 x float> %b) + ret <4 x float> %l +} + +define arm_aapcs_vfpcc i8* @masked_v4f32_preinc(i8* %x, i8* %y, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4f32_preinc: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: lsls r3, r2, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vldrne s0, [r0] +; CHECK-LE-NEXT: lsls r3, r2, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s1, [r0, #4] +; CHECK-LE-NEXT: lsls r3, r2, #29 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r2, #28 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s3, [r0, #12] +; CHECK-LE-NEXT: vstrw.32 q0, [r1] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f32_preinc: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: lsls r3, r2, #31 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: vldrne s0, [r0] +; CHECK-BE-NEXT: lsls r3, r2, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s1, [r0, #4] +; CHECK-BE-NEXT: lsls r3, r2, #29 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r2, #28 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s3, [r0, #12] +; CHECK-BE-NEXT: vstrw.32 q0, [r1] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x float>* + %c = icmp sgt <4 x i32> %a, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define arm_aapcs_vfpcc i8* @masked_v4f32_postinc(i8* %x, i8* %y, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4f32_postinc: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: add.w r12, r0, #4 +; CHECK-LE-NEXT: ldrb.w r3, [sp] +; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vldrne s0, [r0] +; CHECK-LE-NEXT: lsls r2, r3, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s1, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r3, #29 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r3, #28 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi s3, [r0, #12] +; CHECK-LE-NEXT: vstrw.32 q0, [r1] +; CHECK-LE-NEXT: mov r0, r12 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f32_postinc: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: add.w r12, r0, #4 +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r3, [sp] +; CHECK-BE-NEXT: lsls r2, r3, #31 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: vldrne s0, [r0] +; CHECK-BE-NEXT: lsls r2, r3, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s1, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r3, #29 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r3, #28 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi s3, [r0, #12] +; CHECK-BE-NEXT: vstrw.32 q0, [r1] +; CHECK-BE-NEXT: mov r0, r12 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <4 x float>* + %c = icmp sgt <4 x i32> %a, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + + +define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_zero(<8 x half> *%dest, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8f16_align4_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: beq .LBB23_2 +; CHECK-LE-NEXT: @ %bb.1: @ %cond.load +; CHECK-LE-NEXT: vldr.16 s0, .LCPI23_0 +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vldr.16 s0, [r0] +; CHECK-LE-NEXT: vmov r3, s0 +; CHECK-LE-NEXT: vdup.16 q0, r2 +; CHECK-LE-NEXT: vmov.16 q0[0], r3 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB23_3 +; CHECK-LE-NEXT: b .LBB23_4 +; CHECK-LE-NEXT: .LBB23_2: +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB23_4 +; CHECK-LE-NEXT: .LBB23_3: @ %cond.load1 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #2] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[1], r2 +; CHECK-LE-NEXT: .LBB23_4: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB23_11 +; CHECK-LE-NEXT: @ %bb.5: @ %else5 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bmi .LBB23_12 +; CHECK-LE-NEXT: .LBB23_6: @ %else8 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bmi .LBB23_13 +; CHECK-LE-NEXT: .LBB23_7: @ %else11 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bmi .LBB23_14 +; CHECK-LE-NEXT: .LBB23_8: @ %else14 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bmi .LBB23_15 +; CHECK-LE-NEXT: .LBB23_9: @ %else17 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bmi .LBB23_16 +; CHECK-LE-NEXT: .LBB23_10: @ %else20 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB23_11: @ %cond.load4 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #4] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bpl .LBB23_6 +; CHECK-LE-NEXT: .LBB23_12: @ %cond.load7 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #6] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bpl .LBB23_7 +; CHECK-LE-NEXT: .LBB23_13: @ %cond.load10 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #8] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bpl .LBB23_8 +; CHECK-LE-NEXT: .LBB23_14: @ %cond.load13 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #10] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bpl .LBB23_9 +; CHECK-LE-NEXT: .LBB23_15: @ %cond.load16 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #12] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[6], r2 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bpl .LBB23_10 +; CHECK-LE-NEXT: .LBB23_16: @ %cond.load19 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #14] +; CHECK-LE-NEXT: vmov r0, s4 +; CHECK-LE-NEXT: vmov.16 q0[7], r0 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .p2align 1 +; CHECK-LE-NEXT: @ %bb.17: +; CHECK-LE-NEXT: .LCPI23_0: +; CHECK-LE-NEXT: .short 0 @ half 0 +; +; CHECK-BE-LABEL: masked_v8f16_align4_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: beq .LBB23_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: vldr.16 s0, .LCPI23_0 +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vldr.16 s0, [r0] +; CHECK-BE-NEXT: vdup.16 q1, r2 +; CHECK-BE-NEXT: vmov r3, s0 +; CHECK-BE-NEXT: vmov.16 q1[0], r3 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB23_3 +; CHECK-BE-NEXT: b .LBB23_4 +; CHECK-BE-NEXT: .LBB23_2: +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vrev32.16 q1, q0 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB23_4 +; CHECK-BE-NEXT: .LBB23_3: @ %cond.load1 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #2] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[1], r2 +; CHECK-BE-NEXT: .LBB23_4: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB23_12 +; CHECK-BE-NEXT: @ %bb.5: @ %else5 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bmi .LBB23_13 +; CHECK-BE-NEXT: .LBB23_6: @ %else8 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bmi .LBB23_14 +; CHECK-BE-NEXT: .LBB23_7: @ %else11 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bmi .LBB23_15 +; CHECK-BE-NEXT: .LBB23_8: @ %else14 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bmi .LBB23_16 +; CHECK-BE-NEXT: .LBB23_9: @ %else17 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bpl .LBB23_11 +; CHECK-BE-NEXT: .LBB23_10: @ %cond.load19 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #14] +; CHECK-BE-NEXT: vmov r0, s0 +; CHECK-BE-NEXT: vmov.16 q1[7], r0 +; CHECK-BE-NEXT: .LBB23_11: @ %else20 +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB23_12: @ %cond.load4 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #4] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bpl .LBB23_6 +; CHECK-BE-NEXT: .LBB23_13: @ %cond.load7 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #6] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bpl .LBB23_7 +; CHECK-BE-NEXT: .LBB23_14: @ %cond.load10 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #8] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bpl .LBB23_8 +; CHECK-BE-NEXT: .LBB23_15: @ %cond.load13 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #10] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bpl .LBB23_9 +; CHECK-BE-NEXT: .LBB23_16: @ %cond.load16 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #12] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[6], r2 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bmi .LBB23_10 +; CHECK-BE-NEXT: b .LBB23_11 +; CHECK-BE-NEXT: .p2align 1 +; CHECK-BE-NEXT: @ %bb.17: +; CHECK-BE-NEXT: .LCPI23_0: +; CHECK-BE-NEXT: .short 0 @ half 0 +entry: + %c = icmp sgt <8 x i16> %a, zeroinitializer + %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 2, <8 x i1> %c, <8 x half> zeroinitializer) + ret <8 x half> %l +} + +define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_undef(<8 x half> *%dest, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8f16_align4_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: bne .LBB24_9 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB24_10 +; CHECK-LE-NEXT: .LBB24_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB24_11 +; CHECK-LE-NEXT: .LBB24_3: @ %else5 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bmi .LBB24_12 +; CHECK-LE-NEXT: .LBB24_4: @ %else8 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bmi .LBB24_13 +; CHECK-LE-NEXT: .LBB24_5: @ %else11 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bmi .LBB24_14 +; CHECK-LE-NEXT: .LBB24_6: @ %else14 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bmi .LBB24_15 +; CHECK-LE-NEXT: .LBB24_7: @ %else17 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bmi .LBB24_16 +; CHECK-LE-NEXT: .LBB24_8: @ %else20 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB24_9: @ %cond.load +; CHECK-LE-NEXT: vldr.16 s0, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB24_2 +; CHECK-LE-NEXT: .LBB24_10: @ %cond.load1 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #2] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bpl .LBB24_3 +; CHECK-LE-NEXT: .LBB24_11: @ %cond.load4 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #4] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bpl .LBB24_4 +; CHECK-LE-NEXT: .LBB24_12: @ %cond.load7 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #6] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bpl .LBB24_5 +; CHECK-LE-NEXT: .LBB24_13: @ %cond.load10 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #8] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bpl .LBB24_6 +; CHECK-LE-NEXT: .LBB24_14: @ %cond.load13 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #10] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bpl .LBB24_7 +; CHECK-LE-NEXT: .LBB24_15: @ %cond.load16 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #12] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[6], r2 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bpl .LBB24_8 +; CHECK-LE-NEXT: .LBB24_16: @ %cond.load19 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #14] +; CHECK-LE-NEXT: vmov r0, s4 +; CHECK-LE-NEXT: vmov.16 q0[7], r0 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8f16_align4_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: bne .LBB24_10 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB24_11 +; CHECK-BE-NEXT: .LBB24_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB24_12 +; CHECK-BE-NEXT: .LBB24_3: @ %else5 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bmi .LBB24_13 +; CHECK-BE-NEXT: .LBB24_4: @ %else8 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bmi .LBB24_14 +; CHECK-BE-NEXT: .LBB24_5: @ %else11 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bmi .LBB24_15 +; CHECK-BE-NEXT: .LBB24_6: @ %else14 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bmi .LBB24_16 +; CHECK-BE-NEXT: .LBB24_7: @ %else17 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bpl .LBB24_9 +; CHECK-BE-NEXT: .LBB24_8: @ %cond.load19 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #14] +; CHECK-BE-NEXT: vmov r0, s0 +; CHECK-BE-NEXT: vmov.16 q1[7], r0 +; CHECK-BE-NEXT: .LBB24_9: @ %else20 +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB24_10: @ %cond.load +; CHECK-BE-NEXT: vldr.16 s4, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB24_2 +; CHECK-BE-NEXT: .LBB24_11: @ %cond.load1 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #2] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bpl .LBB24_3 +; CHECK-BE-NEXT: .LBB24_12: @ %cond.load4 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #4] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bpl .LBB24_4 +; CHECK-BE-NEXT: .LBB24_13: @ %cond.load7 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #6] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bpl .LBB24_5 +; CHECK-BE-NEXT: .LBB24_14: @ %cond.load10 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #8] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bpl .LBB24_6 +; CHECK-BE-NEXT: .LBB24_15: @ %cond.load13 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #10] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bpl .LBB24_7 +; CHECK-BE-NEXT: .LBB24_16: @ %cond.load16 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #12] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[6], r2 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bmi .LBB24_8 +; CHECK-BE-NEXT: b .LBB24_9 +entry: + %c = icmp sgt <8 x i16> %a, zeroinitializer + %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 2, <8 x i1> %c, <8 x half> undef) + ret <8 x half> %l +} + +define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8f16_align1_undef: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #40 +; CHECK-LE-NEXT: sub sp, #40 +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: add r1, sp, #32 +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: ldrb.w r1, [sp, #32] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: bne .LBB25_9 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB25_10 +; CHECK-LE-NEXT: .LBB25_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB25_11 +; CHECK-LE-NEXT: .LBB25_3: @ %else5 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bmi .LBB25_12 +; CHECK-LE-NEXT: .LBB25_4: @ %else8 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bmi .LBB25_13 +; CHECK-LE-NEXT: .LBB25_5: @ %else11 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bmi .LBB25_14 +; CHECK-LE-NEXT: .LBB25_6: @ %else14 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bmi .LBB25_15 +; CHECK-LE-NEXT: .LBB25_7: @ %else17 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bmi .LBB25_16 +; CHECK-LE-NEXT: .LBB25_8: @ %else20 +; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB25_9: @ %cond.load +; CHECK-LE-NEXT: ldrh r2, [r0] +; CHECK-LE-NEXT: strh.w r2, [sp, #28] +; CHECK-LE-NEXT: vldr.16 s0, [sp, #28] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB25_2 +; CHECK-LE-NEXT: .LBB25_10: @ %cond.load1 +; CHECK-LE-NEXT: ldrh r2, [r0, #2] +; CHECK-LE-NEXT: strh.w r2, [sp, #24] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #24] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bpl .LBB25_3 +; CHECK-LE-NEXT: .LBB25_11: @ %cond.load4 +; CHECK-LE-NEXT: ldrh r2, [r0, #4] +; CHECK-LE-NEXT: strh.w r2, [sp, #20] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #20] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bpl .LBB25_4 +; CHECK-LE-NEXT: .LBB25_12: @ %cond.load7 +; CHECK-LE-NEXT: ldrh r2, [r0, #6] +; CHECK-LE-NEXT: strh.w r2, [sp, #16] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #16] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bpl .LBB25_5 +; CHECK-LE-NEXT: .LBB25_13: @ %cond.load10 +; CHECK-LE-NEXT: ldrh r2, [r0, #8] +; CHECK-LE-NEXT: strh.w r2, [sp, #12] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #12] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bpl .LBB25_6 +; CHECK-LE-NEXT: .LBB25_14: @ %cond.load13 +; CHECK-LE-NEXT: ldrh r2, [r0, #10] +; CHECK-LE-NEXT: strh.w r2, [sp, #8] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #8] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bpl .LBB25_7 +; CHECK-LE-NEXT: .LBB25_15: @ %cond.load16 +; CHECK-LE-NEXT: ldrh r2, [r0, #12] +; CHECK-LE-NEXT: strh.w r2, [sp, #4] +; CHECK-LE-NEXT: vldr.16 s4, [sp, #4] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[6], r2 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bpl .LBB25_8 +; CHECK-LE-NEXT: .LBB25_16: @ %cond.load19 +; CHECK-LE-NEXT: ldrh r0, [r0, #14] +; CHECK-LE-NEXT: strh.w r0, [sp] +; CHECK-LE-NEXT: vldr.16 s4, [sp] +; CHECK-LE-NEXT: vmov r0, s4 +; CHECK-LE-NEXT: vmov.16 q0[7], r0 +; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8f16_align1_undef: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #40 +; CHECK-BE-NEXT: sub sp, #40 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: add r1, sp, #32 +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp, #32] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: bne .LBB25_10 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB25_11 +; CHECK-BE-NEXT: .LBB25_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB25_12 +; CHECK-BE-NEXT: .LBB25_3: @ %else5 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bmi .LBB25_13 +; CHECK-BE-NEXT: .LBB25_4: @ %else8 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bmi .LBB25_14 +; CHECK-BE-NEXT: .LBB25_5: @ %else11 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bmi .LBB25_15 +; CHECK-BE-NEXT: .LBB25_6: @ %else14 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bmi .LBB25_16 +; CHECK-BE-NEXT: .LBB25_7: @ %else17 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bpl .LBB25_9 +; CHECK-BE-NEXT: .LBB25_8: @ %cond.load19 +; CHECK-BE-NEXT: ldrh r0, [r0, #14] +; CHECK-BE-NEXT: strh.w r0, [sp] +; CHECK-BE-NEXT: vldr.16 s0, [sp] +; CHECK-BE-NEXT: vmov r0, s0 +; CHECK-BE-NEXT: vmov.16 q1[7], r0 +; CHECK-BE-NEXT: .LBB25_9: @ %else20 +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: add sp, #40 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB25_10: @ %cond.load +; CHECK-BE-NEXT: ldrh r2, [r0] +; CHECK-BE-NEXT: strh.w r2, [sp, #28] +; CHECK-BE-NEXT: vldr.16 s4, [sp, #28] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB25_2 +; CHECK-BE-NEXT: .LBB25_11: @ %cond.load1 +; CHECK-BE-NEXT: ldrh r2, [r0, #2] +; CHECK-BE-NEXT: strh.w r2, [sp, #24] +; CHECK-BE-NEXT: vldr.16 s0, [sp, #24] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bpl .LBB25_3 +; CHECK-BE-NEXT: .LBB25_12: @ %cond.load4 +; CHECK-BE-NEXT: ldrh r2, [r0, #4] +; CHECK-BE-NEXT: strh.w r2, [sp, #20] +; CHECK-BE-NEXT: vldr.16 s0, [sp, #20] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bpl .LBB25_4 +; CHECK-BE-NEXT: .LBB25_13: @ %cond.load7 +; CHECK-BE-NEXT: ldrh r2, [r0, #6] +; CHECK-BE-NEXT: strh.w r2, [sp, #16] +; CHECK-BE-NEXT: vldr.16 s0, [sp, #16] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bpl .LBB25_5 +; CHECK-BE-NEXT: .LBB25_14: @ %cond.load10 +; CHECK-BE-NEXT: ldrh r2, [r0, #8] +; CHECK-BE-NEXT: strh.w r2, [sp, #12] +; CHECK-BE-NEXT: vldr.16 s0, [sp, #12] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bpl .LBB25_6 +; CHECK-BE-NEXT: .LBB25_15: @ %cond.load13 +; CHECK-BE-NEXT: ldrh r2, [r0, #10] +; CHECK-BE-NEXT: strh.w r2, [sp, #8] +; CHECK-BE-NEXT: vldr.16 s0, [sp, #8] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bpl .LBB25_7 +; CHECK-BE-NEXT: .LBB25_16: @ %cond.load16 +; CHECK-BE-NEXT: ldrh r2, [r0, #12] +; CHECK-BE-NEXT: strh.w r2, [sp, #4] +; CHECK-BE-NEXT: vldr.16 s0, [sp, #4] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q1[6], r2 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bmi .LBB25_8 +; CHECK-BE-NEXT: b .LBB25_9 +entry: + %c = icmp sgt <8 x i16> %a, zeroinitializer + %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 1, <8 x i1> %c, <8 x half> undef) + ret <8 x half> %l +} + +define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_other(<8 x half> *%dest, <8 x i16> %a, <8 x half> %b) { +; CHECK-LE-LABEL: masked_v8f16_align4_other: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: bne .LBB26_10 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB26_11 +; CHECK-LE-NEXT: .LBB26_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB26_12 +; CHECK-LE-NEXT: .LBB26_3: @ %else5 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bmi .LBB26_13 +; CHECK-LE-NEXT: .LBB26_4: @ %else8 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bmi .LBB26_14 +; CHECK-LE-NEXT: .LBB26_5: @ %else11 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bmi .LBB26_15 +; CHECK-LE-NEXT: .LBB26_6: @ %else14 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bmi .LBB26_16 +; CHECK-LE-NEXT: .LBB26_7: @ %else17 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bpl .LBB26_9 +; CHECK-LE-NEXT: .LBB26_8: @ %cond.load19 +; CHECK-LE-NEXT: vldr.16 s0, [r0, #14] +; CHECK-LE-NEXT: vmov r0, s0 +; CHECK-LE-NEXT: vmov.16 q1[7], r0 +; CHECK-LE-NEXT: .LBB26_9: @ %else20 +; CHECK-LE-NEXT: vmov q0, q1 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB26_10: @ %cond.load +; CHECK-LE-NEXT: vldr.16 s0, [r0] +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vmov.16 q1[0], r2 +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB26_2 +; CHECK-LE-NEXT: .LBB26_11: @ %cond.load1 +; CHECK-LE-NEXT: vldr.16 s0, [r0, #2] +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vmov.16 q1[1], r2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bpl .LBB26_3 +; CHECK-LE-NEXT: .LBB26_12: @ %cond.load4 +; CHECK-LE-NEXT: vldr.16 s0, [r0, #4] +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vmov.16 q1[2], r2 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bpl .LBB26_4 +; CHECK-LE-NEXT: .LBB26_13: @ %cond.load7 +; CHECK-LE-NEXT: vldr.16 s0, [r0, #6] +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vmov.16 q1[3], r2 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bpl .LBB26_5 +; CHECK-LE-NEXT: .LBB26_14: @ %cond.load10 +; CHECK-LE-NEXT: vldr.16 s0, [r0, #8] +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vmov.16 q1[4], r2 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bpl .LBB26_6 +; CHECK-LE-NEXT: .LBB26_15: @ %cond.load13 +; CHECK-LE-NEXT: vldr.16 s0, [r0, #10] +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vmov.16 q1[5], r2 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bpl .LBB26_7 +; CHECK-LE-NEXT: .LBB26_16: @ %cond.load16 +; CHECK-LE-NEXT: vldr.16 s0, [r0, #12] +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vmov.16 q1[6], r2 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bmi .LBB26_8 +; CHECK-LE-NEXT: b .LBB26_9 +; +; CHECK-BE-LABEL: masked_v8f16_align4_other: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q2, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q2, zr +; CHECK-BE-NEXT: vrev64.16 q2, q1 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: bne .LBB26_10 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB26_11 +; CHECK-BE-NEXT: .LBB26_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB26_12 +; CHECK-BE-NEXT: .LBB26_3: @ %else5 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bmi .LBB26_13 +; CHECK-BE-NEXT: .LBB26_4: @ %else8 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bmi .LBB26_14 +; CHECK-BE-NEXT: .LBB26_5: @ %else11 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bmi .LBB26_15 +; CHECK-BE-NEXT: .LBB26_6: @ %else14 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bmi .LBB26_16 +; CHECK-BE-NEXT: .LBB26_7: @ %else17 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bpl .LBB26_9 +; CHECK-BE-NEXT: .LBB26_8: @ %cond.load19 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #14] +; CHECK-BE-NEXT: vmov r0, s0 +; CHECK-BE-NEXT: vmov.16 q2[7], r0 +; CHECK-BE-NEXT: .LBB26_9: @ %else20 +; CHECK-BE-NEXT: vrev64.16 q0, q2 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB26_10: @ %cond.load +; CHECK-BE-NEXT: vldr.16 s0, [r0] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q2[0], r2 +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB26_2 +; CHECK-BE-NEXT: .LBB26_11: @ %cond.load1 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #2] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q2[1], r2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bpl .LBB26_3 +; CHECK-BE-NEXT: .LBB26_12: @ %cond.load4 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #4] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q2[2], r2 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bpl .LBB26_4 +; CHECK-BE-NEXT: .LBB26_13: @ %cond.load7 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #6] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q2[3], r2 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bpl .LBB26_5 +; CHECK-BE-NEXT: .LBB26_14: @ %cond.load10 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #8] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q2[4], r2 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bpl .LBB26_6 +; CHECK-BE-NEXT: .LBB26_15: @ %cond.load13 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #10] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q2[5], r2 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bpl .LBB26_7 +; CHECK-BE-NEXT: .LBB26_16: @ %cond.load16 +; CHECK-BE-NEXT: vldr.16 s0, [r0, #12] +; CHECK-BE-NEXT: vmov r2, s0 +; CHECK-BE-NEXT: vmov.16 q2[6], r2 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bmi .LBB26_8 +; CHECK-BE-NEXT: b .LBB26_9 +entry: + %c = icmp sgt <8 x i16> %a, zeroinitializer + %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 2, <8 x i1> %c, <8 x half> %b) + ret <8 x half> %l +} + +define arm_aapcs_vfpcc i8* @masked_v8f16_preinc(i8* %x, i8* %y, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8f16_preinc: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: lsls r3, r2, #31 +; CHECK-LE-NEXT: bne .LBB27_10 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r3, r2, #30 +; CHECK-LE-NEXT: bmi .LBB27_11 +; CHECK-LE-NEXT: .LBB27_2: @ %else2 +; CHECK-LE-NEXT: lsls r3, r2, #29 +; CHECK-LE-NEXT: bmi .LBB27_12 +; CHECK-LE-NEXT: .LBB27_3: @ %else5 +; CHECK-LE-NEXT: lsls r3, r2, #28 +; CHECK-LE-NEXT: bmi .LBB27_13 +; CHECK-LE-NEXT: .LBB27_4: @ %else8 +; CHECK-LE-NEXT: lsls r3, r2, #27 +; CHECK-LE-NEXT: bmi .LBB27_14 +; CHECK-LE-NEXT: .LBB27_5: @ %else11 +; CHECK-LE-NEXT: lsls r3, r2, #26 +; CHECK-LE-NEXT: bmi .LBB27_15 +; CHECK-LE-NEXT: .LBB27_6: @ %else14 +; CHECK-LE-NEXT: lsls r3, r2, #25 +; CHECK-LE-NEXT: bmi .LBB27_16 +; CHECK-LE-NEXT: .LBB27_7: @ %else17 +; CHECK-LE-NEXT: lsls r2, r2, #24 +; CHECK-LE-NEXT: bpl .LBB27_9 +; CHECK-LE-NEXT: .LBB27_8: @ %cond.load19 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #14] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[7], r2 +; CHECK-LE-NEXT: .LBB27_9: @ %else20 +; CHECK-LE-NEXT: vstrw.32 q0, [r1] +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB27_10: @ %cond.load +; CHECK-LE-NEXT: vldr.16 s0, [r0] +; CHECK-LE-NEXT: lsls r3, r2, #30 +; CHECK-LE-NEXT: bpl .LBB27_2 +; CHECK-LE-NEXT: .LBB27_11: @ %cond.load1 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #2] +; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov.16 q0[1], r3 +; CHECK-LE-NEXT: lsls r3, r2, #29 +; CHECK-LE-NEXT: bpl .LBB27_3 +; CHECK-LE-NEXT: .LBB27_12: @ %cond.load4 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #4] +; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov.16 q0[2], r3 +; CHECK-LE-NEXT: lsls r3, r2, #28 +; CHECK-LE-NEXT: bpl .LBB27_4 +; CHECK-LE-NEXT: .LBB27_13: @ %cond.load7 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #6] +; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov.16 q0[3], r3 +; CHECK-LE-NEXT: lsls r3, r2, #27 +; CHECK-LE-NEXT: bpl .LBB27_5 +; CHECK-LE-NEXT: .LBB27_14: @ %cond.load10 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #8] +; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov.16 q0[4], r3 +; CHECK-LE-NEXT: lsls r3, r2, #26 +; CHECK-LE-NEXT: bpl .LBB27_6 +; CHECK-LE-NEXT: .LBB27_15: @ %cond.load13 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #10] +; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov.16 q0[5], r3 +; CHECK-LE-NEXT: lsls r3, r2, #25 +; CHECK-LE-NEXT: bpl .LBB27_7 +; CHECK-LE-NEXT: .LBB27_16: @ %cond.load16 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #12] +; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov.16 q0[6], r3 +; CHECK-LE-NEXT: lsls r2, r2, #24 +; CHECK-LE-NEXT: bmi .LBB27_8 +; CHECK-LE-NEXT: b .LBB27_9 +; +; CHECK-BE-LABEL: masked_v8f16_preinc: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: lsls r3, r2, #31 +; CHECK-BE-NEXT: bne .LBB27_10 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r3, r2, #30 +; CHECK-BE-NEXT: bmi .LBB27_11 +; CHECK-BE-NEXT: .LBB27_2: @ %else2 +; CHECK-BE-NEXT: lsls r3, r2, #29 +; CHECK-BE-NEXT: bmi .LBB27_12 +; CHECK-BE-NEXT: .LBB27_3: @ %else5 +; CHECK-BE-NEXT: lsls r3, r2, #28 +; CHECK-BE-NEXT: bmi .LBB27_13 +; CHECK-BE-NEXT: .LBB27_4: @ %else8 +; CHECK-BE-NEXT: lsls r3, r2, #27 +; CHECK-BE-NEXT: bmi .LBB27_14 +; CHECK-BE-NEXT: .LBB27_5: @ %else11 +; CHECK-BE-NEXT: lsls r3, r2, #26 +; CHECK-BE-NEXT: bmi .LBB27_15 +; CHECK-BE-NEXT: .LBB27_6: @ %else14 +; CHECK-BE-NEXT: lsls r3, r2, #25 +; CHECK-BE-NEXT: bmi .LBB27_16 +; CHECK-BE-NEXT: .LBB27_7: @ %else17 +; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: bpl .LBB27_9 +; CHECK-BE-NEXT: .LBB27_8: @ %cond.load19 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #14] +; CHECK-BE-NEXT: vmov r2, s4 +; CHECK-BE-NEXT: vmov.16 q0[7], r2 +; CHECK-BE-NEXT: .LBB27_9: @ %else20 +; CHECK-BE-NEXT: vstrh.16 q0, [r1] +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB27_10: @ %cond.load +; CHECK-BE-NEXT: vldr.16 s0, [r0] +; CHECK-BE-NEXT: lsls r3, r2, #30 +; CHECK-BE-NEXT: bpl .LBB27_2 +; CHECK-BE-NEXT: .LBB27_11: @ %cond.load1 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #2] +; CHECK-BE-NEXT: vmov r3, s4 +; CHECK-BE-NEXT: vmov.16 q0[1], r3 +; CHECK-BE-NEXT: lsls r3, r2, #29 +; CHECK-BE-NEXT: bpl .LBB27_3 +; CHECK-BE-NEXT: .LBB27_12: @ %cond.load4 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #4] +; CHECK-BE-NEXT: vmov r3, s4 +; CHECK-BE-NEXT: vmov.16 q0[2], r3 +; CHECK-BE-NEXT: lsls r3, r2, #28 +; CHECK-BE-NEXT: bpl .LBB27_4 +; CHECK-BE-NEXT: .LBB27_13: @ %cond.load7 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #6] +; CHECK-BE-NEXT: vmov r3, s4 +; CHECK-BE-NEXT: vmov.16 q0[3], r3 +; CHECK-BE-NEXT: lsls r3, r2, #27 +; CHECK-BE-NEXT: bpl .LBB27_5 +; CHECK-BE-NEXT: .LBB27_14: @ %cond.load10 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #8] +; CHECK-BE-NEXT: vmov r3, s4 +; CHECK-BE-NEXT: vmov.16 q0[4], r3 +; CHECK-BE-NEXT: lsls r3, r2, #26 +; CHECK-BE-NEXT: bpl .LBB27_6 +; CHECK-BE-NEXT: .LBB27_15: @ %cond.load13 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #10] +; CHECK-BE-NEXT: vmov r3, s4 +; CHECK-BE-NEXT: vmov.16 q0[5], r3 +; CHECK-BE-NEXT: lsls r3, r2, #25 +; CHECK-BE-NEXT: bpl .LBB27_7 +; CHECK-BE-NEXT: .LBB27_16: @ %cond.load16 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #12] +; CHECK-BE-NEXT: vmov r3, s4 +; CHECK-BE-NEXT: vmov.16 q0[6], r3 +; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: bmi .LBB27_8 +; CHECK-BE-NEXT: b .LBB27_9 +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x half>* + %c = icmp sgt <8 x i16> %a, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 4, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 4 + ret i8* %z +} + +define arm_aapcs_vfpcc i8* @masked_v8f16_postinc(i8* %x, i8* %y, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8f16_postinc: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: ldrb.w r3, [sp] +; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: bne .LBB28_12 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r3, #30 +; CHECK-LE-NEXT: bmi .LBB28_13 +; CHECK-LE-NEXT: .LBB28_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r3, #29 +; CHECK-LE-NEXT: bmi .LBB28_14 +; CHECK-LE-NEXT: .LBB28_3: @ %else5 +; CHECK-LE-NEXT: lsls r2, r3, #28 +; CHECK-LE-NEXT: bmi .LBB28_15 +; CHECK-LE-NEXT: .LBB28_4: @ %else8 +; CHECK-LE-NEXT: lsls r2, r3, #27 +; CHECK-LE-NEXT: bmi .LBB28_16 +; CHECK-LE-NEXT: .LBB28_5: @ %else11 +; CHECK-LE-NEXT: lsls r2, r3, #26 +; CHECK-LE-NEXT: bpl .LBB28_7 +; CHECK-LE-NEXT: .LBB28_6: @ %cond.load13 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #10] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[5], r2 +; CHECK-LE-NEXT: .LBB28_7: @ %else14 +; CHECK-LE-NEXT: add.w r12, r0, #4 +; CHECK-LE-NEXT: lsls r2, r3, #25 +; CHECK-LE-NEXT: bpl .LBB28_9 +; CHECK-LE-NEXT: @ %bb.8: @ %cond.load16 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #12] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[6], r2 +; CHECK-LE-NEXT: .LBB28_9: @ %else17 +; CHECK-LE-NEXT: lsls r2, r3, #24 +; CHECK-LE-NEXT: bpl .LBB28_11 +; CHECK-LE-NEXT: @ %bb.10: @ %cond.load19 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #14] +; CHECK-LE-NEXT: vmov r0, s4 +; CHECK-LE-NEXT: vmov.16 q0[7], r0 +; CHECK-LE-NEXT: .LBB28_11: @ %else20 +; CHECK-LE-NEXT: vstrw.32 q0, [r1] +; CHECK-LE-NEXT: mov r0, r12 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB28_12: @ %cond.load +; CHECK-LE-NEXT: vldr.16 s0, [r0] +; CHECK-LE-NEXT: lsls r2, r3, #30 +; CHECK-LE-NEXT: bpl .LBB28_2 +; CHECK-LE-NEXT: .LBB28_13: @ %cond.load1 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #2] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[1], r2 +; CHECK-LE-NEXT: lsls r2, r3, #29 +; CHECK-LE-NEXT: bpl .LBB28_3 +; CHECK-LE-NEXT: .LBB28_14: @ %cond.load4 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #4] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[2], r2 +; CHECK-LE-NEXT: lsls r2, r3, #28 +; CHECK-LE-NEXT: bpl .LBB28_4 +; CHECK-LE-NEXT: .LBB28_15: @ %cond.load7 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #6] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[3], r2 +; CHECK-LE-NEXT: lsls r2, r3, #27 +; CHECK-LE-NEXT: bpl .LBB28_5 +; CHECK-LE-NEXT: .LBB28_16: @ %cond.load10 +; CHECK-LE-NEXT: vldr.16 s4, [r0, #8] +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov.16 q0[4], r2 +; CHECK-LE-NEXT: lsls r2, r3, #26 +; CHECK-LE-NEXT: bmi .LBB28_6 +; CHECK-LE-NEXT: b .LBB28_7 +; +; CHECK-BE-LABEL: masked_v8f16_postinc: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r3, [sp] +; CHECK-BE-NEXT: lsls r2, r3, #31 +; CHECK-BE-NEXT: bne .LBB28_12 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r3, #30 +; CHECK-BE-NEXT: bmi .LBB28_13 +; CHECK-BE-NEXT: .LBB28_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r3, #29 +; CHECK-BE-NEXT: bmi .LBB28_14 +; CHECK-BE-NEXT: .LBB28_3: @ %else5 +; CHECK-BE-NEXT: lsls r2, r3, #28 +; CHECK-BE-NEXT: bmi .LBB28_15 +; CHECK-BE-NEXT: .LBB28_4: @ %else8 +; CHECK-BE-NEXT: lsls r2, r3, #27 +; CHECK-BE-NEXT: bmi .LBB28_16 +; CHECK-BE-NEXT: .LBB28_5: @ %else11 +; CHECK-BE-NEXT: lsls r2, r3, #26 +; CHECK-BE-NEXT: bpl .LBB28_7 +; CHECK-BE-NEXT: .LBB28_6: @ %cond.load13 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #10] +; CHECK-BE-NEXT: vmov r2, s4 +; CHECK-BE-NEXT: vmov.16 q0[5], r2 +; CHECK-BE-NEXT: .LBB28_7: @ %else14 +; CHECK-BE-NEXT: add.w r12, r0, #4 +; CHECK-BE-NEXT: lsls r2, r3, #25 +; CHECK-BE-NEXT: bpl .LBB28_9 +; CHECK-BE-NEXT: @ %bb.8: @ %cond.load16 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #12] +; CHECK-BE-NEXT: vmov r2, s4 +; CHECK-BE-NEXT: vmov.16 q0[6], r2 +; CHECK-BE-NEXT: .LBB28_9: @ %else17 +; CHECK-BE-NEXT: lsls r2, r3, #24 +; CHECK-BE-NEXT: bpl .LBB28_11 +; CHECK-BE-NEXT: @ %bb.10: @ %cond.load19 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #14] +; CHECK-BE-NEXT: vmov r0, s4 +; CHECK-BE-NEXT: vmov.16 q0[7], r0 +; CHECK-BE-NEXT: .LBB28_11: @ %else20 +; CHECK-BE-NEXT: vstrh.16 q0, [r1] +; CHECK-BE-NEXT: mov r0, r12 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB28_12: @ %cond.load +; CHECK-BE-NEXT: vldr.16 s0, [r0] +; CHECK-BE-NEXT: lsls r2, r3, #30 +; CHECK-BE-NEXT: bpl .LBB28_2 +; CHECK-BE-NEXT: .LBB28_13: @ %cond.load1 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #2] +; CHECK-BE-NEXT: vmov r2, s4 +; CHECK-BE-NEXT: vmov.16 q0[1], r2 +; CHECK-BE-NEXT: lsls r2, r3, #29 +; CHECK-BE-NEXT: bpl .LBB28_3 +; CHECK-BE-NEXT: .LBB28_14: @ %cond.load4 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #4] +; CHECK-BE-NEXT: vmov r2, s4 +; CHECK-BE-NEXT: vmov.16 q0[2], r2 +; CHECK-BE-NEXT: lsls r2, r3, #28 +; CHECK-BE-NEXT: bpl .LBB28_4 +; CHECK-BE-NEXT: .LBB28_15: @ %cond.load7 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #6] +; CHECK-BE-NEXT: vmov r2, s4 +; CHECK-BE-NEXT: vmov.16 q0[3], r2 +; CHECK-BE-NEXT: lsls r2, r3, #27 +; CHECK-BE-NEXT: bpl .LBB28_5 +; CHECK-BE-NEXT: .LBB28_16: @ %cond.load10 +; CHECK-BE-NEXT: vldr.16 s4, [r0, #8] +; CHECK-BE-NEXT: vmov r2, s4 +; CHECK-BE-NEXT: vmov.16 q0[4], r2 +; CHECK-BE-NEXT: lsls r2, r3, #26 +; CHECK-BE-NEXT: bmi .LBB28_6 +; CHECK-BE-NEXT: b .LBB28_7 +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <8 x half>* + %c = icmp sgt <8 x i16> %a, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 4, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 4 + ret i8* %z +} + + +define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(<2 x i64> *%dest, <2 x i64> %a) { +; CHECK-LE-LABEL: masked_v2i64_align4_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vmov r3, s0 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmov r1, s1 +; CHECK-LE-NEXT: vmov r12, s3 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: vmov r3, s2 +; CHECK-LE-NEXT: sbcs.w r1, r2, r1 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: sbcs.w r3, r2, r12 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r2, #1 +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r2, #1 +; CHECK-LE-NEXT: bfi r2, r1, #0, #1 +; CHECK-LE-NEXT: and r1, r2, #3 +; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: beq .LBB29_2 +; CHECK-LE-NEXT: @ %bb.1: @ %cond.load +; CHECK-LE-NEXT: vldr d1, .LCPI29_0 +; CHECK-LE-NEXT: vldr d0, [r0] +; CHECK-LE-NEXT: b .LBB29_3 +; CHECK-LE-NEXT: .LBB29_2: +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: .LBB29_3: @ %else +; CHECK-LE-NEXT: lsls r1, r1, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi d1, [r0, #8] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .p2align 3 +; CHECK-LE-NEXT: @ %bb.4: +; CHECK-LE-NEXT: .LCPI29_0: +; CHECK-LE-NEXT: .long 0 @ double 0 +; CHECK-LE-NEXT: .long 0 +; +; CHECK-BE-LABEL: masked_v2i64_align4_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: vmov r3, s7 +; CHECK-BE-NEXT: vmov r1, s6 +; CHECK-BE-NEXT: vmov r12, s4 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: vmov r3, s5 +; CHECK-BE-NEXT: sbcs.w r1, r2, r1 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: sbcs.w r3, r2, r12 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r2, #1 +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r2, #1 +; CHECK-BE-NEXT: bfi r2, r1, #0, #1 +; CHECK-BE-NEXT: and r1, r2, #3 +; CHECK-BE-NEXT: lsls r2, r2, #31 +; CHECK-BE-NEXT: beq .LBB29_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: vldr d1, .LCPI29_0 +; CHECK-BE-NEXT: vldr d0, [r0] +; CHECK-BE-NEXT: b .LBB29_3 +; CHECK-BE-NEXT: .LBB29_2: +; CHECK-BE-NEXT: vmov.i32 q1, #0x0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: .LBB29_3: @ %else +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi d1, [r0, #8] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .p2align 3 +; CHECK-BE-NEXT: @ %bb.4: +; CHECK-BE-NEXT: .LCPI29_0: +; CHECK-BE-NEXT: .long 0 @ double 0 +; CHECK-BE-NEXT: .long 0 +entry: + %c = icmp sgt <2 x i64> %a, zeroinitializer + %l = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %dest, i32 8, <2 x i1> %c, <2 x i64> zeroinitializer) + ret <2 x i64> %l +} + +define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(<2 x double> *%dest, <2 x double> %a, <2 x i64> %b) { +; CHECK-LE-LABEL: masked_v2f64_align4_zero: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmov r1, s5 +; CHECK-LE-NEXT: vmov r12, s7 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: vmov r3, s6 +; CHECK-LE-NEXT: sbcs.w r1, r2, r1 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: sbcs.w r3, r2, r12 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r2, #1 +; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r2, #1 +; CHECK-LE-NEXT: bfi r2, r1, #0, #1 +; CHECK-LE-NEXT: and r1, r2, #3 +; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: beq .LBB30_2 +; CHECK-LE-NEXT: @ %bb.1: @ %cond.load +; CHECK-LE-NEXT: vldr d1, .LCPI30_0 +; CHECK-LE-NEXT: vldr d0, [r0] +; CHECK-LE-NEXT: b .LBB30_3 +; CHECK-LE-NEXT: .LBB30_2: +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: .LBB30_3: @ %else +; CHECK-LE-NEXT: lsls r1, r1, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vldrmi d1, [r0, #8] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .p2align 3 +; CHECK-LE-NEXT: @ %bb.4: +; CHECK-LE-NEXT: .LCPI30_0: +; CHECK-LE-NEXT: .long 0 @ double 0 +; CHECK-LE-NEXT: .long 0 +; +; CHECK-BE-LABEL: masked_v2f64_align4_zero: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: vmov r3, s3 +; CHECK-BE-NEXT: vmov r1, s2 +; CHECK-BE-NEXT: vmov r12, s0 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: vmov r3, s1 +; CHECK-BE-NEXT: sbcs.w r1, r2, r1 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: sbcs.w r3, r2, r12 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r2, #1 +; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r2, #1 +; CHECK-BE-NEXT: bfi r2, r1, #0, #1 +; CHECK-BE-NEXT: and r1, r2, #3 +; CHECK-BE-NEXT: lsls r2, r2, #31 +; CHECK-BE-NEXT: beq .LBB30_2 +; CHECK-BE-NEXT: @ %bb.1: @ %cond.load +; CHECK-BE-NEXT: vldr d1, .LCPI30_0 +; CHECK-BE-NEXT: vldr d0, [r0] +; CHECK-BE-NEXT: b .LBB30_3 +; CHECK-BE-NEXT: .LBB30_2: +; CHECK-BE-NEXT: vmov.i32 q1, #0x0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: .LBB30_3: @ %else +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vldrmi d1, [r0, #8] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .p2align 3 +; CHECK-BE-NEXT: @ %bb.4: +; CHECK-BE-NEXT: .LCPI30_0: +; CHECK-BE-NEXT: .long 0 @ double 0 +; CHECK-BE-NEXT: .long 0 +entry: + %c = icmp sgt <2 x i64> %b, zeroinitializer + %l = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %dest, i32 8, <2 x i1> %c, <2 x double> zeroinitializer) + ret <2 x double> %l +} + +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) +declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>) +declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll new file mode 100644 index 0000000000000..3ff1dec821dff --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -0,0 +1,2292 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE +; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE + +define arm_aapcs_vfpcc void @masked_v4i32(<4 x i32> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4i32: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne r2, s0 +; CHECK-LE-NEXT: strne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r2, s1 +; CHECK-LE-NEXT: strmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r2, s2 +; CHECK-LE-NEXT: strmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r1, s3 +; CHECK-LE-NEXT: strmi r1, [r0, #12] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4i32: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne r2, s4 +; CHECK-BE-NEXT: strne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r2, s5 +; CHECK-BE-NEXT: strmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r2, s6 +; CHECK-BE-NEXT: strmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r1, s7 +; CHECK-BE-NEXT: strmi r1, [r0, #12] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %a, <4 x i32>* %dest, i32 4, <4 x i1> %c) + ret void +} + +define arm_aapcs_vfpcc void @masked_v4i32_align1(<4 x i32> *%dest, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4i32_align1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne r2, s0 +; CHECK-LE-NEXT: strne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r2, s1 +; CHECK-LE-NEXT: strmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r2, s2 +; CHECK-LE-NEXT: strmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r1, s3 +; CHECK-LE-NEXT: strmi r1, [r0, #12] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4i32_align1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne r2, s4 +; CHECK-BE-NEXT: strne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r2, s5 +; CHECK-BE-NEXT: strmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r2, s6 +; CHECK-BE-NEXT: strmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r1, s7 +; CHECK-BE-NEXT: strmi r1, [r0, #12] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %a, zeroinitializer + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %a, <4 x i32>* %dest, i32 1, <4 x i1> %c) + ret void +} + +define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4i32_pre: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vldr d1, [sp, #8] +; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: add r2, sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne r1, s0 +; CHECK-LE-NEXT: strne r1, [r0] +; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r1, s1 +; CHECK-LE-NEXT: strmi r1, [r0, #4] +; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r1, s2 +; CHECK-LE-NEXT: strmi r1, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r2, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r1, s3 +; CHECK-LE-NEXT: strmi r1, [r0, #12] +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4i32_pre: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vldr d1, [sp, #8] +; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: add r2, sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne r1, s0 +; CHECK-BE-NEXT: strne r1, [r0] +; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r1, s1 +; CHECK-BE-NEXT: strmi r1, [r0, #4] +; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r1, s2 +; CHECK-BE-NEXT: strmi r1, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r1, s3 +; CHECK-BE-NEXT: strmi r1, [r0, #12] +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + %c = icmp sgt <4 x i32> %a, zeroinitializer + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4i32_post: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vldr d1, [sp, #8] +; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: add r2, sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne r1, s0 +; CHECK-LE-NEXT: strne r1, [r0] +; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r1, s1 +; CHECK-LE-NEXT: strmi r1, [r0, #4] +; CHECK-LE-NEXT: adds r1, r0, #4 +; CHECK-LE-NEXT: lsls r3, r2, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r3, s2 +; CHECK-LE-NEXT: strmi r3, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r2, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi r2, s3 +; CHECK-LE-NEXT: strmi r2, [r0, #12] +; CHECK-LE-NEXT: mov r0, r1 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4i32_post: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vldr d1, [sp, #8] +; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: add r2, sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne r1, s0 +; CHECK-BE-NEXT: strne r1, [r0] +; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r1, s1 +; CHECK-BE-NEXT: strmi r1, [r0, #4] +; CHECK-BE-NEXT: adds r1, r0, #4 +; CHECK-BE-NEXT: lsls r3, r2, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r3, s2 +; CHECK-BE-NEXT: strmi r3, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r2, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi r2, s3 +; CHECK-BE-NEXT: strmi r2, [r0, #12] +; CHECK-BE-NEXT: mov r0, r1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %y to <4 x i32>* + %c = icmp sgt <4 x i32> %a, zeroinitializer + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + + +define arm_aapcs_vfpcc void @masked_v8i16(<8 x i16> *%dest, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8i16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne.u16 r2, q0[0] +; CHECK-LE-NEXT: strhne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[1] +; CHECK-LE-NEXT: strhmi r2, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[2] +; CHECK-LE-NEXT: strhmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[3] +; CHECK-LE-NEXT: strhmi r2, [r0, #6] +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[4] +; CHECK-LE-NEXT: strhmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[5] +; CHECK-LE-NEXT: strhmi r2, [r0, #10] +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[6] +; CHECK-LE-NEXT: strhmi r2, [r0, #12] +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[7] +; CHECK-LE-NEXT: strhmi r1, [r0, #14] +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8i16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne.u16 r2, q1[0] +; CHECK-BE-NEXT: strhne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[1] +; CHECK-BE-NEXT: strhmi r2, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[2] +; CHECK-BE-NEXT: strhmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[3] +; CHECK-BE-NEXT: strhmi r2, [r0, #6] +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[4] +; CHECK-BE-NEXT: strhmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[5] +; CHECK-BE-NEXT: strhmi r2, [r0, #10] +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[6] +; CHECK-BE-NEXT: strhmi r2, [r0, #12] +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q1[7] +; CHECK-BE-NEXT: strhmi r1, [r0, #14] +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i16> %a, zeroinitializer + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %a, <8 x i16>* %dest, i32 2, <8 x i1> %c) + ret void +} + +define arm_aapcs_vfpcc void @masked_v8i16_align1(<8 x i16> *%dest, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8i16_align1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne.u16 r2, q0[0] +; CHECK-LE-NEXT: strhne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[1] +; CHECK-LE-NEXT: strhmi r2, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[2] +; CHECK-LE-NEXT: strhmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[3] +; CHECK-LE-NEXT: strhmi r2, [r0, #6] +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[4] +; CHECK-LE-NEXT: strhmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[5] +; CHECK-LE-NEXT: strhmi r2, [r0, #10] +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[6] +; CHECK-LE-NEXT: strhmi r2, [r0, #12] +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[7] +; CHECK-LE-NEXT: strhmi r1, [r0, #14] +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8i16_align1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne.u16 r2, q1[0] +; CHECK-BE-NEXT: strhne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[1] +; CHECK-BE-NEXT: strhmi r2, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[2] +; CHECK-BE-NEXT: strhmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[3] +; CHECK-BE-NEXT: strhmi r2, [r0, #6] +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[4] +; CHECK-BE-NEXT: strhmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[5] +; CHECK-BE-NEXT: strhmi r2, [r0, #10] +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q1[6] +; CHECK-BE-NEXT: strhmi r2, [r0, #12] +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q1[7] +; CHECK-BE-NEXT: strhmi r1, [r0, #14] +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <8 x i16> %a, zeroinitializer + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %a, <8 x i16>* %dest, i32 1, <8 x i1> %c) + ret void +} + +define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8i16_pre: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vldr d1, [sp, #8] +; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne.u16 r1, q0[0] +; CHECK-LE-NEXT: strhne r1, [r0] +; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[1] +; CHECK-LE-NEXT: strhmi r1, [r0, #2] +; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[2] +; CHECK-LE-NEXT: strhmi r1, [r0, #4] +; CHECK-LE-NEXT: lsls r1, r2, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[3] +; CHECK-LE-NEXT: strhmi r1, [r0, #6] +; CHECK-LE-NEXT: lsls r1, r2, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[4] +; CHECK-LE-NEXT: strhmi r1, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r2, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[5] +; CHECK-LE-NEXT: strhmi r1, [r0, #10] +; CHECK-LE-NEXT: lsls r1, r2, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[6] +; CHECK-LE-NEXT: strhmi r1, [r0, #12] +; CHECK-LE-NEXT: lsls r1, r2, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[7] +; CHECK-LE-NEXT: strhmi r1, [r0, #14] +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8i16_pre: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vldr d1, [sp, #8] +; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: vldrh.u16 q0, [r1] +; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne.u16 r1, q0[0] +; CHECK-BE-NEXT: strhne r1, [r0] +; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[1] +; CHECK-BE-NEXT: strhmi r1, [r0, #2] +; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[2] +; CHECK-BE-NEXT: strhmi r1, [r0, #4] +; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[3] +; CHECK-BE-NEXT: strhmi r1, [r0, #6] +; CHECK-BE-NEXT: lsls r1, r2, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[4] +; CHECK-BE-NEXT: strhmi r1, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r2, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[5] +; CHECK-BE-NEXT: strhmi r1, [r0, #10] +; CHECK-BE-NEXT: lsls r1, r2, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[6] +; CHECK-BE-NEXT: strhmi r1, [r0, #12] +; CHECK-BE-NEXT: lsls r1, r2, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[7] +; CHECK-BE-NEXT: strhmi r1, [r0, #14] +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 4 + %2 = bitcast i8* %z to <8 x i16>* + %c = icmp sgt <8 x i16> %a, zeroinitializer + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8i16_post: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vldr d1, [sp, #8] +; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne.u16 r1, q0[0] +; CHECK-LE-NEXT: strhne r1, [r0] +; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[1] +; CHECK-LE-NEXT: strhmi r1, [r0, #2] +; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[2] +; CHECK-LE-NEXT: strhmi r1, [r0, #4] +; CHECK-LE-NEXT: lsls r1, r2, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[3] +; CHECK-LE-NEXT: strhmi r1, [r0, #6] +; CHECK-LE-NEXT: lsls r1, r2, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[4] +; CHECK-LE-NEXT: strhmi r1, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r2, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r1, q0[5] +; CHECK-LE-NEXT: strhmi r1, [r0, #10] +; CHECK-LE-NEXT: adds r1, r0, #4 +; CHECK-LE-NEXT: lsls r3, r2, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r3, q0[6] +; CHECK-LE-NEXT: strhmi r3, [r0, #12] +; CHECK-LE-NEXT: lsls r2, r2, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[7] +; CHECK-LE-NEXT: strhmi r2, [r0, #14] +; CHECK-LE-NEXT: mov r0, r1 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8i16_post: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vldr d1, [sp, #8] +; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: vldrh.u16 q0, [r1] +; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne.u16 r1, q0[0] +; CHECK-BE-NEXT: strhne r1, [r0] +; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[1] +; CHECK-BE-NEXT: strhmi r1, [r0, #2] +; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[2] +; CHECK-BE-NEXT: strhmi r1, [r0, #4] +; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[3] +; CHECK-BE-NEXT: strhmi r1, [r0, #6] +; CHECK-BE-NEXT: lsls r1, r2, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[4] +; CHECK-BE-NEXT: strhmi r1, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r2, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r1, q0[5] +; CHECK-BE-NEXT: strhmi r1, [r0, #10] +; CHECK-BE-NEXT: adds r1, r0, #4 +; CHECK-BE-NEXT: lsls r3, r2, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r3, q0[6] +; CHECK-BE-NEXT: strhmi r3, [r0, #12] +; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u16 r2, q0[7] +; CHECK-BE-NEXT: strhmi r2, [r0, #14] +; CHECK-BE-NEXT: mov r0, r1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 4 + %2 = bitcast i8* %y to <8 x i16>* + %c = icmp sgt <8 x i16> %a, zeroinitializer + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + + +define arm_aapcs_vfpcc void @masked_v16i8(<16 x i8> *%dest, <16 x i8> %a) { +; CHECK-LE-LABEL: masked_v16i8: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r6, r7, lr} +; CHECK-LE-NEXT: push {r4, r6, r7, lr} +; CHECK-LE-NEXT: .setfp r7, sp, #8 +; CHECK-LE-NEXT: add r7, sp, #8 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 +; CHECK-LE-NEXT: mov r4, sp +; CHECK-LE-NEXT: bfc r4, #0, #4 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: sub.w r4, r7, #8 +; CHECK-LE-NEXT: ldrh.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne.u8 r2, q0[0] +; CHECK-LE-NEXT: strbne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[1] +; CHECK-LE-NEXT: strbmi r2, [r0, #1] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[2] +; CHECK-LE-NEXT: strbmi r2, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[3] +; CHECK-LE-NEXT: strbmi r2, [r0, #3] +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[4] +; CHECK-LE-NEXT: strbmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[5] +; CHECK-LE-NEXT: strbmi r2, [r0, #5] +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[6] +; CHECK-LE-NEXT: strbmi r2, [r0, #6] +; CHECK-LE-NEXT: lsls r2, r1, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[7] +; CHECK-LE-NEXT: strbmi r2, [r0, #7] +; CHECK-LE-NEXT: lsls r2, r1, #23 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[8] +; CHECK-LE-NEXT: strbmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r1, #22 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[9] +; CHECK-LE-NEXT: strbmi r2, [r0, #9] +; CHECK-LE-NEXT: lsls r2, r1, #21 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[10] +; CHECK-LE-NEXT: strbmi r2, [r0, #10] +; CHECK-LE-NEXT: lsls r2, r1, #20 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[11] +; CHECK-LE-NEXT: strbmi r2, [r0, #11] +; CHECK-LE-NEXT: lsls r2, r1, #19 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[12] +; CHECK-LE-NEXT: strbmi r2, [r0, #12] +; CHECK-LE-NEXT: lsls r2, r1, #18 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[13] +; CHECK-LE-NEXT: strbmi r2, [r0, #13] +; CHECK-LE-NEXT: lsls r2, r1, #17 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[14] +; CHECK-LE-NEXT: strbmi r2, [r0, #14] +; CHECK-LE-NEXT: lsls r1, r1, #16 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[15] +; CHECK-LE-NEXT: strbmi r1, [r0, #15] +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; +; CHECK-BE-LABEL: masked_v16i8: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r6, r7, lr} +; CHECK-BE-NEXT: push {r4, r6, r7, lr} +; CHECK-BE-NEXT: .setfp r7, sp, #8 +; CHECK-BE-NEXT: add r7, sp, #8 +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 +; CHECK-BE-NEXT: mov r4, sp +; CHECK-BE-NEXT: bfc r4, #0, #4 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: sub.w r4, r7, #8 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrh.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne.u8 r2, q1[0] +; CHECK-BE-NEXT: strbne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[1] +; CHECK-BE-NEXT: strbmi r2, [r0, #1] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[2] +; CHECK-BE-NEXT: strbmi r2, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[3] +; CHECK-BE-NEXT: strbmi r2, [r0, #3] +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[4] +; CHECK-BE-NEXT: strbmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[5] +; CHECK-BE-NEXT: strbmi r2, [r0, #5] +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[6] +; CHECK-BE-NEXT: strbmi r2, [r0, #6] +; CHECK-BE-NEXT: lsls r2, r1, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[7] +; CHECK-BE-NEXT: strbmi r2, [r0, #7] +; CHECK-BE-NEXT: lsls r2, r1, #23 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[8] +; CHECK-BE-NEXT: strbmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r1, #22 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[9] +; CHECK-BE-NEXT: strbmi r2, [r0, #9] +; CHECK-BE-NEXT: lsls r2, r1, #21 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[10] +; CHECK-BE-NEXT: strbmi r2, [r0, #10] +; CHECK-BE-NEXT: lsls r2, r1, #20 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[11] +; CHECK-BE-NEXT: strbmi r2, [r0, #11] +; CHECK-BE-NEXT: lsls r2, r1, #19 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[12] +; CHECK-BE-NEXT: strbmi r2, [r0, #12] +; CHECK-BE-NEXT: lsls r2, r1, #18 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[13] +; CHECK-BE-NEXT: strbmi r2, [r0, #13] +; CHECK-BE-NEXT: lsls r2, r1, #17 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q1[14] +; CHECK-BE-NEXT: strbmi r2, [r0, #14] +; CHECK-BE-NEXT: lsls r1, r1, #16 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q1[15] +; CHECK-BE-NEXT: strbmi r1, [r0, #15] +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +entry: + %c = icmp sgt <16 x i8> %a, zeroinitializer + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %a, <16 x i8>* %dest, i32 1, <16 x i1> %c) + ret void +} + +define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) { +; CHECK-LE-LABEL: masked_v16i8_pre: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r6, r7, lr} +; CHECK-LE-NEXT: push {r4, r6, r7, lr} +; CHECK-LE-NEXT: .setfp r7, sp, #8 +; CHECK-LE-NEXT: add r7, sp, #8 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 +; CHECK-LE-NEXT: mov r4, sp +; CHECK-LE-NEXT: bfc r4, #0, #4 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: vldr d1, [r7, #8] +; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr +; CHECK-LE-NEXT: sub.w r4, r7, #8 +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: ldrh.w r2, [sp] +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne.u8 r1, q0[0] +; CHECK-LE-NEXT: strbne r1, [r0] +; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[1] +; CHECK-LE-NEXT: strbmi r1, [r0, #1] +; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[2] +; CHECK-LE-NEXT: strbmi r1, [r0, #2] +; CHECK-LE-NEXT: lsls r1, r2, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[3] +; CHECK-LE-NEXT: strbmi r1, [r0, #3] +; CHECK-LE-NEXT: lsls r1, r2, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[4] +; CHECK-LE-NEXT: strbmi r1, [r0, #4] +; CHECK-LE-NEXT: lsls r1, r2, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[5] +; CHECK-LE-NEXT: strbmi r1, [r0, #5] +; CHECK-LE-NEXT: lsls r1, r2, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[6] +; CHECK-LE-NEXT: strbmi r1, [r0, #6] +; CHECK-LE-NEXT: lsls r1, r2, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[7] +; CHECK-LE-NEXT: strbmi r1, [r0, #7] +; CHECK-LE-NEXT: lsls r1, r2, #23 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[8] +; CHECK-LE-NEXT: strbmi r1, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r2, #22 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[9] +; CHECK-LE-NEXT: strbmi r1, [r0, #9] +; CHECK-LE-NEXT: lsls r1, r2, #21 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[10] +; CHECK-LE-NEXT: strbmi r1, [r0, #10] +; CHECK-LE-NEXT: lsls r1, r2, #20 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[11] +; CHECK-LE-NEXT: strbmi r1, [r0, #11] +; CHECK-LE-NEXT: lsls r1, r2, #19 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[12] +; CHECK-LE-NEXT: strbmi r1, [r0, #12] +; CHECK-LE-NEXT: lsls r1, r2, #18 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[13] +; CHECK-LE-NEXT: strbmi r1, [r0, #13] +; CHECK-LE-NEXT: lsls r1, r2, #17 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[14] +; CHECK-LE-NEXT: strbmi r1, [r0, #14] +; CHECK-LE-NEXT: lsls r1, r2, #16 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[15] +; CHECK-LE-NEXT: strbmi r1, [r0, #15] +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; +; CHECK-BE-LABEL: masked_v16i8_pre: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r6, r7, lr} +; CHECK-BE-NEXT: push {r4, r6, r7, lr} +; CHECK-BE-NEXT: .setfp r7, sp, #8 +; CHECK-BE-NEXT: add r7, sp, #8 +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 +; CHECK-BE-NEXT: mov r4, sp +; CHECK-BE-NEXT: bfc r4, #0, #4 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: vldr d1, [r7, #8] +; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: sub.w r4, r7, #8 +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrh.w r2, [sp] +; CHECK-BE-NEXT: vldrb.u8 q0, [r1] +; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne.u8 r1, q0[0] +; CHECK-BE-NEXT: strbne r1, [r0] +; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[1] +; CHECK-BE-NEXT: strbmi r1, [r0, #1] +; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[2] +; CHECK-BE-NEXT: strbmi r1, [r0, #2] +; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[3] +; CHECK-BE-NEXT: strbmi r1, [r0, #3] +; CHECK-BE-NEXT: lsls r1, r2, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[4] +; CHECK-BE-NEXT: strbmi r1, [r0, #4] +; CHECK-BE-NEXT: lsls r1, r2, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[5] +; CHECK-BE-NEXT: strbmi r1, [r0, #5] +; CHECK-BE-NEXT: lsls r1, r2, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[6] +; CHECK-BE-NEXT: strbmi r1, [r0, #6] +; CHECK-BE-NEXT: lsls r1, r2, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[7] +; CHECK-BE-NEXT: strbmi r1, [r0, #7] +; CHECK-BE-NEXT: lsls r1, r2, #23 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[8] +; CHECK-BE-NEXT: strbmi r1, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r2, #22 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[9] +; CHECK-BE-NEXT: strbmi r1, [r0, #9] +; CHECK-BE-NEXT: lsls r1, r2, #21 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[10] +; CHECK-BE-NEXT: strbmi r1, [r0, #10] +; CHECK-BE-NEXT: lsls r1, r2, #20 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[11] +; CHECK-BE-NEXT: strbmi r1, [r0, #11] +; CHECK-BE-NEXT: lsls r1, r2, #19 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[12] +; CHECK-BE-NEXT: strbmi r1, [r0, #12] +; CHECK-BE-NEXT: lsls r1, r2, #18 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[13] +; CHECK-BE-NEXT: strbmi r1, [r0, #13] +; CHECK-BE-NEXT: lsls r1, r2, #17 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[14] +; CHECK-BE-NEXT: strbmi r1, [r0, #14] +; CHECK-BE-NEXT: lsls r1, r2, #16 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[15] +; CHECK-BE-NEXT: strbmi r1, [r0, #15] +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 4 + %2 = bitcast i8* %z to <16 x i8>* + %c = icmp sgt <16 x i8> %a, zeroinitializer + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) { +; CHECK-LE-LABEL: masked_v16i8_post: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r6, r7, lr} +; CHECK-LE-NEXT: push {r4, r6, r7, lr} +; CHECK-LE-NEXT: .setfp r7, sp, #8 +; CHECK-LE-NEXT: add r7, sp, #8 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 +; CHECK-LE-NEXT: mov r4, sp +; CHECK-LE-NEXT: bfc r4, #0, #4 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: vldr d1, [r7, #8] +; CHECK-LE-NEXT: sub.w r4, r7, #8 +; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: ldrh.w r2, [sp] +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: itt ne +; CHECK-LE-NEXT: vmovne.u8 r1, q0[0] +; CHECK-LE-NEXT: strbne r1, [r0] +; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[1] +; CHECK-LE-NEXT: strbmi r1, [r0, #1] +; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[2] +; CHECK-LE-NEXT: strbmi r1, [r0, #2] +; CHECK-LE-NEXT: lsls r1, r2, #28 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[3] +; CHECK-LE-NEXT: strbmi r1, [r0, #3] +; CHECK-LE-NEXT: lsls r1, r2, #27 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[4] +; CHECK-LE-NEXT: strbmi r1, [r0, #4] +; CHECK-LE-NEXT: lsls r1, r2, #26 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[5] +; CHECK-LE-NEXT: strbmi r1, [r0, #5] +; CHECK-LE-NEXT: lsls r1, r2, #25 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[6] +; CHECK-LE-NEXT: strbmi r1, [r0, #6] +; CHECK-LE-NEXT: lsls r1, r2, #24 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[7] +; CHECK-LE-NEXT: strbmi r1, [r0, #7] +; CHECK-LE-NEXT: lsls r1, r2, #23 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[8] +; CHECK-LE-NEXT: strbmi r1, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r2, #22 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[9] +; CHECK-LE-NEXT: strbmi r1, [r0, #9] +; CHECK-LE-NEXT: lsls r1, r2, #21 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[10] +; CHECK-LE-NEXT: strbmi r1, [r0, #10] +; CHECK-LE-NEXT: lsls r1, r2, #20 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[11] +; CHECK-LE-NEXT: strbmi r1, [r0, #11] +; CHECK-LE-NEXT: lsls r1, r2, #19 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[12] +; CHECK-LE-NEXT: strbmi r1, [r0, #12] +; CHECK-LE-NEXT: lsls r1, r2, #18 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r1, q0[13] +; CHECK-LE-NEXT: strbmi r1, [r0, #13] +; CHECK-LE-NEXT: adds r1, r0, #4 +; CHECK-LE-NEXT: lsls r3, r2, #17 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r3, q0[14] +; CHECK-LE-NEXT: strbmi r3, [r0, #14] +; CHECK-LE-NEXT: lsls r2, r2, #16 +; CHECK-LE-NEXT: itt mi +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[15] +; CHECK-LE-NEXT: strbmi r2, [r0, #15] +; CHECK-LE-NEXT: mov r0, r1 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; +; CHECK-BE-LABEL: masked_v16i8_post: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r6, r7, lr} +; CHECK-BE-NEXT: push {r4, r6, r7, lr} +; CHECK-BE-NEXT: .setfp r7, sp, #8 +; CHECK-BE-NEXT: add r7, sp, #8 +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 +; CHECK-BE-NEXT: mov r4, sp +; CHECK-BE-NEXT: bfc r4, #0, #4 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: vldr d1, [r7, #8] +; CHECK-BE-NEXT: sub.w r4, r7, #8 +; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrh.w r2, [sp] +; CHECK-BE-NEXT: vldrb.u8 q0, [r1] +; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: itt ne +; CHECK-BE-NEXT: vmovne.u8 r1, q0[0] +; CHECK-BE-NEXT: strbne r1, [r0] +; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[1] +; CHECK-BE-NEXT: strbmi r1, [r0, #1] +; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[2] +; CHECK-BE-NEXT: strbmi r1, [r0, #2] +; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[3] +; CHECK-BE-NEXT: strbmi r1, [r0, #3] +; CHECK-BE-NEXT: lsls r1, r2, #27 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[4] +; CHECK-BE-NEXT: strbmi r1, [r0, #4] +; CHECK-BE-NEXT: lsls r1, r2, #26 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[5] +; CHECK-BE-NEXT: strbmi r1, [r0, #5] +; CHECK-BE-NEXT: lsls r1, r2, #25 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[6] +; CHECK-BE-NEXT: strbmi r1, [r0, #6] +; CHECK-BE-NEXT: lsls r1, r2, #24 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[7] +; CHECK-BE-NEXT: strbmi r1, [r0, #7] +; CHECK-BE-NEXT: lsls r1, r2, #23 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[8] +; CHECK-BE-NEXT: strbmi r1, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r2, #22 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[9] +; CHECK-BE-NEXT: strbmi r1, [r0, #9] +; CHECK-BE-NEXT: lsls r1, r2, #21 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[10] +; CHECK-BE-NEXT: strbmi r1, [r0, #10] +; CHECK-BE-NEXT: lsls r1, r2, #20 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[11] +; CHECK-BE-NEXT: strbmi r1, [r0, #11] +; CHECK-BE-NEXT: lsls r1, r2, #19 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[12] +; CHECK-BE-NEXT: strbmi r1, [r0, #12] +; CHECK-BE-NEXT: lsls r1, r2, #18 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r1, q0[13] +; CHECK-BE-NEXT: strbmi r1, [r0, #13] +; CHECK-BE-NEXT: adds r1, r0, #4 +; CHECK-BE-NEXT: lsls r3, r2, #17 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r3, q0[14] +; CHECK-BE-NEXT: strbmi r3, [r0, #14] +; CHECK-BE-NEXT: lsls r2, r2, #16 +; CHECK-BE-NEXT: itt mi +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[15] +; CHECK-BE-NEXT: strbmi r2, [r0, #15] +; CHECK-BE-NEXT: mov r0, r1 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 4 + %2 = bitcast i8* %y to <16 x i8>* + %c = icmp sgt <16 x i8> %a, zeroinitializer + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + + +define arm_aapcs_vfpcc void @masked_v4f32(<4 x float> *%dest, <4 x float> %a, <4 x i32> %b) { +; CHECK-LE-LABEL: masked_v4f32: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vstrne s0, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi s1, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi s2, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi s3, [r0, #12] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f32: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: vstrne s4, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi s5, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi s6, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi s7, [r0, #12] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp ugt <4 x i32> %b, zeroinitializer + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %a, <4 x float>* %dest, i32 4, <4 x i1> %c) + ret void +} + +define arm_aapcs_vfpcc void @masked_v4f32_align1(<4 x float> *%dest, <4 x float> %a, <4 x i32> %b) { +; CHECK-LE-LABEL: masked_v4f32_align1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #20 +; CHECK-LE-NEXT: sub sp, #20 +; CHECK-LE-NEXT: add r1, sp, #16 +; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp, #16] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: ittt ne +; CHECK-LE-NEXT: vstrne s0, [sp, #12] +; CHECK-LE-NEXT: ldrne r2, [sp, #12] +; CHECK-LE-NEXT: strne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: ittt mi +; CHECK-LE-NEXT: vstrmi s1, [sp, #8] +; CHECK-LE-NEXT: ldrmi r2, [sp, #8] +; CHECK-LE-NEXT: strmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: ittt mi +; CHECK-LE-NEXT: vstrmi s2, [sp, #4] +; CHECK-LE-NEXT: ldrmi r2, [sp, #4] +; CHECK-LE-NEXT: strmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r1, #28 +; CHECK-LE-NEXT: ittt mi +; CHECK-LE-NEXT: vstrmi s3, [sp] +; CHECK-LE-NEXT: ldrmi r1, [sp] +; CHECK-LE-NEXT: strmi r1, [r0, #12] +; CHECK-LE-NEXT: add sp, #20 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f32_align1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #20 +; CHECK-BE-NEXT: sub sp, #20 +; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: add r1, sp, #16 +; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp, #16] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: ittt ne +; CHECK-BE-NEXT: vstrne s4, [sp, #12] +; CHECK-BE-NEXT: ldrne r2, [sp, #12] +; CHECK-BE-NEXT: strne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: ittt mi +; CHECK-BE-NEXT: vstrmi s5, [sp, #8] +; CHECK-BE-NEXT: ldrmi r2, [sp, #8] +; CHECK-BE-NEXT: strmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: ittt mi +; CHECK-BE-NEXT: vstrmi s6, [sp, #4] +; CHECK-BE-NEXT: ldrmi r2, [sp, #4] +; CHECK-BE-NEXT: strmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r1, #28 +; CHECK-BE-NEXT: ittt mi +; CHECK-BE-NEXT: vstrmi s7, [sp] +; CHECK-BE-NEXT: ldrmi r1, [sp] +; CHECK-BE-NEXT: strmi r1, [r0, #12] +; CHECK-BE-NEXT: add sp, #20 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp ugt <4 x i32> %b, zeroinitializer + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %a, <4 x float>* %dest, i32 1, <4 x i1> %c) + ret void +} + +define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4f32_pre: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vldr d1, [sp, #8] +; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: add r2, sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vstrne s0, [r0] +; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi s1, [r0, #4] +; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi s2, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r2, #28 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi s3, [r0, #12] +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f32_pre: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vldr d1, [sp, #8] +; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: add r2, sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: vstrne s0, [r0] +; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi s1, [r0, #4] +; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi s2, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi s3, [r0, #12] +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x float>* + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + %c = icmp sgt <4 x i32> %a, zeroinitializer + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) { +; CHECK-LE-LABEL: masked_v4f32_post: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vldr d1, [sp, #8] +; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: add r2, sp, #4 +; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vstrne s0, [r0] +; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi s1, [r0, #4] +; CHECK-LE-NEXT: adds r1, r0, #4 +; CHECK-LE-NEXT: lsls r3, r2, #29 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi s2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r2, #28 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi s3, [r0, #12] +; CHECK-LE-NEXT: mov r0, r1 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v4f32_post: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vldr d1, [sp, #8] +; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: add r2, sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: vstrne s0, [r0] +; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi s1, [r0, #4] +; CHECK-BE-NEXT: adds r1, r0, #4 +; CHECK-BE-NEXT: lsls r3, r2, #29 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi s2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r2, #28 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi s3, [r0, #12] +; CHECK-BE-NEXT: mov r0, r1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x float>* + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %y to <4 x float>* + %c = icmp sgt <4 x i32> %a, zeroinitializer + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + + +define arm_aapcs_vfpcc void @masked_v8f16(<8 x half> *%dest, <8 x half> %a, <8 x i16> %b) { +; CHECK-LE-LABEL: masked_v8f16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: bne .LBB15_9 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB15_10 +; CHECK-LE-NEXT: .LBB15_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB15_11 +; CHECK-LE-NEXT: .LBB15_3: @ %else4 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bmi .LBB15_12 +; CHECK-LE-NEXT: .LBB15_4: @ %else6 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bmi .LBB15_13 +; CHECK-LE-NEXT: .LBB15_5: @ %else8 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bmi .LBB15_14 +; CHECK-LE-NEXT: .LBB15_6: @ %else10 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bmi .LBB15_15 +; CHECK-LE-NEXT: .LBB15_7: @ %else12 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bmi .LBB15_16 +; CHECK-LE-NEXT: .LBB15_8: @ %else14 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB15_9: @ %cond.store +; CHECK-LE-NEXT: vstr.16 s0, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB15_2 +; CHECK-LE-NEXT: .LBB15_10: @ %cond.store1 +; CHECK-LE-NEXT: vmovx.f16 s4, s0 +; CHECK-LE-NEXT: vstr.16 s4, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bpl .LBB15_3 +; CHECK-LE-NEXT: .LBB15_11: @ %cond.store3 +; CHECK-LE-NEXT: vstr.16 s1, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bpl .LBB15_4 +; CHECK-LE-NEXT: .LBB15_12: @ %cond.store5 +; CHECK-LE-NEXT: vmovx.f16 s4, s1 +; CHECK-LE-NEXT: vstr.16 s4, [r0, #6] +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bpl .LBB15_5 +; CHECK-LE-NEXT: .LBB15_13: @ %cond.store7 +; CHECK-LE-NEXT: vstr.16 s2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bpl .LBB15_6 +; CHECK-LE-NEXT: .LBB15_14: @ %cond.store9 +; CHECK-LE-NEXT: vmovx.f16 s4, s2 +; CHECK-LE-NEXT: vstr.16 s4, [r0, #10] +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bpl .LBB15_7 +; CHECK-LE-NEXT: .LBB15_15: @ %cond.store11 +; CHECK-LE-NEXT: vstr.16 s3, [r0, #12] +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bpl .LBB15_8 +; CHECK-LE-NEXT: .LBB15_16: @ %cond.store13 +; CHECK-LE-NEXT: vmovx.f16 s0, s3 +; CHECK-LE-NEXT: vstr.16 s0, [r0, #14] +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8f16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q2, q1 +; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: bne .LBB15_9 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB15_10 +; CHECK-BE-NEXT: .LBB15_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB15_11 +; CHECK-BE-NEXT: .LBB15_3: @ %else4 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bmi .LBB15_12 +; CHECK-BE-NEXT: .LBB15_4: @ %else6 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bmi .LBB15_13 +; CHECK-BE-NEXT: .LBB15_5: @ %else8 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bmi .LBB15_14 +; CHECK-BE-NEXT: .LBB15_6: @ %else10 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bmi .LBB15_15 +; CHECK-BE-NEXT: .LBB15_7: @ %else12 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bmi .LBB15_16 +; CHECK-BE-NEXT: .LBB15_8: @ %else14 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB15_9: @ %cond.store +; CHECK-BE-NEXT: vstr.16 s4, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB15_2 +; CHECK-BE-NEXT: .LBB15_10: @ %cond.store1 +; CHECK-BE-NEXT: vmovx.f16 s0, s4 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bpl .LBB15_3 +; CHECK-BE-NEXT: .LBB15_11: @ %cond.store3 +; CHECK-BE-NEXT: vstr.16 s5, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bpl .LBB15_4 +; CHECK-BE-NEXT: .LBB15_12: @ %cond.store5 +; CHECK-BE-NEXT: vmovx.f16 s0, s5 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #6] +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bpl .LBB15_5 +; CHECK-BE-NEXT: .LBB15_13: @ %cond.store7 +; CHECK-BE-NEXT: vstr.16 s6, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bpl .LBB15_6 +; CHECK-BE-NEXT: .LBB15_14: @ %cond.store9 +; CHECK-BE-NEXT: vmovx.f16 s0, s6 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #10] +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bpl .LBB15_7 +; CHECK-BE-NEXT: .LBB15_15: @ %cond.store11 +; CHECK-BE-NEXT: vstr.16 s7, [r0, #12] +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bpl .LBB15_8 +; CHECK-BE-NEXT: .LBB15_16: @ %cond.store13 +; CHECK-BE-NEXT: vmovx.f16 s0, s7 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #14] +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp ugt <8 x i16> %b, zeroinitializer + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %a, <8 x half>* %dest, i32 2, <8 x i1> %c) + ret void +} + +define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> %a, <8 x i16> %b) { +; CHECK-LE-LABEL: masked_v8f16_align1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #40 +; CHECK-LE-NEXT: sub sp, #40 +; CHECK-LE-NEXT: add r1, sp, #32 +; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr +; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp, #32] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: bne .LBB16_9 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB16_10 +; CHECK-LE-NEXT: .LBB16_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB16_11 +; CHECK-LE-NEXT: .LBB16_3: @ %else4 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bmi .LBB16_12 +; CHECK-LE-NEXT: .LBB16_4: @ %else6 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bmi .LBB16_13 +; CHECK-LE-NEXT: .LBB16_5: @ %else8 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bmi .LBB16_14 +; CHECK-LE-NEXT: .LBB16_6: @ %else10 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bmi .LBB16_15 +; CHECK-LE-NEXT: .LBB16_7: @ %else12 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bmi .LBB16_16 +; CHECK-LE-NEXT: .LBB16_8: @ %else14 +; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB16_9: @ %cond.store +; CHECK-LE-NEXT: vstr.16 s0, [sp, #28] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #28] +; CHECK-LE-NEXT: strh r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB16_2 +; CHECK-LE-NEXT: .LBB16_10: @ %cond.store1 +; CHECK-LE-NEXT: vmovx.f16 s4, s0 +; CHECK-LE-NEXT: vstr.16 s4, [sp, #24] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #24] +; CHECK-LE-NEXT: strh r2, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bpl .LBB16_3 +; CHECK-LE-NEXT: .LBB16_11: @ %cond.store3 +; CHECK-LE-NEXT: vstr.16 s1, [sp, #20] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #20] +; CHECK-LE-NEXT: strh r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bpl .LBB16_4 +; CHECK-LE-NEXT: .LBB16_12: @ %cond.store5 +; CHECK-LE-NEXT: vmovx.f16 s4, s1 +; CHECK-LE-NEXT: vstr.16 s4, [sp, #16] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #16] +; CHECK-LE-NEXT: strh r2, [r0, #6] +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bpl .LBB16_5 +; CHECK-LE-NEXT: .LBB16_13: @ %cond.store7 +; CHECK-LE-NEXT: vstr.16 s2, [sp, #12] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #12] +; CHECK-LE-NEXT: strh r2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bpl .LBB16_6 +; CHECK-LE-NEXT: .LBB16_14: @ %cond.store9 +; CHECK-LE-NEXT: vmovx.f16 s4, s2 +; CHECK-LE-NEXT: vstr.16 s4, [sp, #8] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #8] +; CHECK-LE-NEXT: strh r2, [r0, #10] +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bpl .LBB16_7 +; CHECK-LE-NEXT: .LBB16_15: @ %cond.store11 +; CHECK-LE-NEXT: vstr.16 s3, [sp, #4] +; CHECK-LE-NEXT: ldrh.w r2, [sp, #4] +; CHECK-LE-NEXT: strh r2, [r0, #12] +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bpl .LBB16_8 +; CHECK-LE-NEXT: .LBB16_16: @ %cond.store13 +; CHECK-LE-NEXT: vmovx.f16 s0, s3 +; CHECK-LE-NEXT: vstr.16 s0, [sp] +; CHECK-LE-NEXT: ldrh.w r1, [sp] +; CHECK-LE-NEXT: strh r1, [r0, #14] +; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8f16_align1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #40 +; CHECK-BE-NEXT: sub sp, #40 +; CHECK-BE-NEXT: vrev64.16 q2, q1 +; CHECK-BE-NEXT: add r1, sp, #32 +; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vstr p0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp, #32] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: bne .LBB16_9 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB16_10 +; CHECK-BE-NEXT: .LBB16_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB16_11 +; CHECK-BE-NEXT: .LBB16_3: @ %else4 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bmi .LBB16_12 +; CHECK-BE-NEXT: .LBB16_4: @ %else6 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bmi .LBB16_13 +; CHECK-BE-NEXT: .LBB16_5: @ %else8 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bmi .LBB16_14 +; CHECK-BE-NEXT: .LBB16_6: @ %else10 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bmi .LBB16_15 +; CHECK-BE-NEXT: .LBB16_7: @ %else12 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bmi .LBB16_16 +; CHECK-BE-NEXT: .LBB16_8: @ %else14 +; CHECK-BE-NEXT: add sp, #40 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB16_9: @ %cond.store +; CHECK-BE-NEXT: vstr.16 s4, [sp, #28] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #28] +; CHECK-BE-NEXT: strh r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB16_2 +; CHECK-BE-NEXT: .LBB16_10: @ %cond.store1 +; CHECK-BE-NEXT: vmovx.f16 s0, s4 +; CHECK-BE-NEXT: vstr.16 s0, [sp, #24] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #24] +; CHECK-BE-NEXT: strh r2, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bpl .LBB16_3 +; CHECK-BE-NEXT: .LBB16_11: @ %cond.store3 +; CHECK-BE-NEXT: vstr.16 s5, [sp, #20] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #20] +; CHECK-BE-NEXT: strh r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bpl .LBB16_4 +; CHECK-BE-NEXT: .LBB16_12: @ %cond.store5 +; CHECK-BE-NEXT: vmovx.f16 s0, s5 +; CHECK-BE-NEXT: vstr.16 s0, [sp, #16] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #16] +; CHECK-BE-NEXT: strh r2, [r0, #6] +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bpl .LBB16_5 +; CHECK-BE-NEXT: .LBB16_13: @ %cond.store7 +; CHECK-BE-NEXT: vstr.16 s6, [sp, #12] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #12] +; CHECK-BE-NEXT: strh r2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bpl .LBB16_6 +; CHECK-BE-NEXT: .LBB16_14: @ %cond.store9 +; CHECK-BE-NEXT: vmovx.f16 s0, s6 +; CHECK-BE-NEXT: vstr.16 s0, [sp, #8] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #8] +; CHECK-BE-NEXT: strh r2, [r0, #10] +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bpl .LBB16_7 +; CHECK-BE-NEXT: .LBB16_15: @ %cond.store11 +; CHECK-BE-NEXT: vstr.16 s7, [sp, #4] +; CHECK-BE-NEXT: ldrh.w r2, [sp, #4] +; CHECK-BE-NEXT: strh r2, [r0, #12] +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bpl .LBB16_8 +; CHECK-BE-NEXT: .LBB16_16: @ %cond.store13 +; CHECK-BE-NEXT: vmovx.f16 s0, s7 +; CHECK-BE-NEXT: vstr.16 s0, [sp] +; CHECK-BE-NEXT: ldrh.w r1, [sp] +; CHECK-BE-NEXT: strh r1, [r0, #14] +; CHECK-BE-NEXT: add sp, #40 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp ugt <8 x i16> %b, zeroinitializer + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %a, <8 x half>* %dest, i32 1, <8 x i1> %c) + ret void +} + +define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8f16_pre: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vldr d1, [sp, #8] +; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: bne .LBB17_9 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bmi .LBB17_10 +; CHECK-LE-NEXT: .LBB17_2: @ %else2 +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bmi .LBB17_11 +; CHECK-LE-NEXT: .LBB17_3: @ %else4 +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bmi .LBB17_12 +; CHECK-LE-NEXT: .LBB17_4: @ %else6 +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bmi .LBB17_13 +; CHECK-LE-NEXT: .LBB17_5: @ %else8 +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bmi .LBB17_14 +; CHECK-LE-NEXT: .LBB17_6: @ %else10 +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bmi .LBB17_15 +; CHECK-LE-NEXT: .LBB17_7: @ %else12 +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bmi .LBB17_16 +; CHECK-LE-NEXT: .LBB17_8: @ %else14 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB17_9: @ %cond.store +; CHECK-LE-NEXT: vstr.16 s0, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 +; CHECK-LE-NEXT: bpl .LBB17_2 +; CHECK-LE-NEXT: .LBB17_10: @ %cond.store1 +; CHECK-LE-NEXT: vmovx.f16 s4, s0 +; CHECK-LE-NEXT: vstr.16 s4, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 +; CHECK-LE-NEXT: bpl .LBB17_3 +; CHECK-LE-NEXT: .LBB17_11: @ %cond.store3 +; CHECK-LE-NEXT: vstr.16 s1, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #28 +; CHECK-LE-NEXT: bpl .LBB17_4 +; CHECK-LE-NEXT: .LBB17_12: @ %cond.store5 +; CHECK-LE-NEXT: vmovx.f16 s4, s1 +; CHECK-LE-NEXT: vstr.16 s4, [r0, #6] +; CHECK-LE-NEXT: lsls r2, r1, #27 +; CHECK-LE-NEXT: bpl .LBB17_5 +; CHECK-LE-NEXT: .LBB17_13: @ %cond.store7 +; CHECK-LE-NEXT: vstr.16 s2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r1, #26 +; CHECK-LE-NEXT: bpl .LBB17_6 +; CHECK-LE-NEXT: .LBB17_14: @ %cond.store9 +; CHECK-LE-NEXT: vmovx.f16 s4, s2 +; CHECK-LE-NEXT: vstr.16 s4, [r0, #10] +; CHECK-LE-NEXT: lsls r2, r1, #25 +; CHECK-LE-NEXT: bpl .LBB17_7 +; CHECK-LE-NEXT: .LBB17_15: @ %cond.store11 +; CHECK-LE-NEXT: vstr.16 s3, [r0, #12] +; CHECK-LE-NEXT: lsls r1, r1, #24 +; CHECK-LE-NEXT: bpl .LBB17_8 +; CHECK-LE-NEXT: .LBB17_16: @ %cond.store13 +; CHECK-LE-NEXT: vmovx.f16 s0, s3 +; CHECK-LE-NEXT: vstr.16 s0, [r0, #14] +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v8f16_pre: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vldr d1, [sp, #8] +; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: vldrh.u16 q0, [r1] +; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: bne .LBB17_9 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bmi .LBB17_10 +; CHECK-BE-NEXT: .LBB17_2: @ %else2 +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bmi .LBB17_11 +; CHECK-BE-NEXT: .LBB17_3: @ %else4 +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bmi .LBB17_12 +; CHECK-BE-NEXT: .LBB17_4: @ %else6 +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bmi .LBB17_13 +; CHECK-BE-NEXT: .LBB17_5: @ %else8 +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bmi .LBB17_14 +; CHECK-BE-NEXT: .LBB17_6: @ %else10 +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bmi .LBB17_15 +; CHECK-BE-NEXT: .LBB17_7: @ %else12 +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bmi .LBB17_16 +; CHECK-BE-NEXT: .LBB17_8: @ %else14 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB17_9: @ %cond.store +; CHECK-BE-NEXT: vstr.16 s0, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 +; CHECK-BE-NEXT: bpl .LBB17_2 +; CHECK-BE-NEXT: .LBB17_10: @ %cond.store1 +; CHECK-BE-NEXT: vmovx.f16 s4, s0 +; CHECK-BE-NEXT: vstr.16 s4, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 +; CHECK-BE-NEXT: bpl .LBB17_3 +; CHECK-BE-NEXT: .LBB17_11: @ %cond.store3 +; CHECK-BE-NEXT: vstr.16 s1, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #28 +; CHECK-BE-NEXT: bpl .LBB17_4 +; CHECK-BE-NEXT: .LBB17_12: @ %cond.store5 +; CHECK-BE-NEXT: vmovx.f16 s4, s1 +; CHECK-BE-NEXT: vstr.16 s4, [r0, #6] +; CHECK-BE-NEXT: lsls r2, r1, #27 +; CHECK-BE-NEXT: bpl .LBB17_5 +; CHECK-BE-NEXT: .LBB17_13: @ %cond.store7 +; CHECK-BE-NEXT: vstr.16 s2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r1, #26 +; CHECK-BE-NEXT: bpl .LBB17_6 +; CHECK-BE-NEXT: .LBB17_14: @ %cond.store9 +; CHECK-BE-NEXT: vmovx.f16 s4, s2 +; CHECK-BE-NEXT: vstr.16 s4, [r0, #10] +; CHECK-BE-NEXT: lsls r2, r1, #25 +; CHECK-BE-NEXT: bpl .LBB17_7 +; CHECK-BE-NEXT: .LBB17_15: @ %cond.store11 +; CHECK-BE-NEXT: vstr.16 s3, [r0, #12] +; CHECK-BE-NEXT: lsls r1, r1, #24 +; CHECK-BE-NEXT: bpl .LBB17_8 +; CHECK-BE-NEXT: .LBB17_16: @ %cond.store13 +; CHECK-BE-NEXT: vmovx.f16 s0, s3 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #14] +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 4 + %2 = bitcast i8* %z to <8 x half>* + %c = icmp sgt <8 x i16> %a, zeroinitializer + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) { +; CHECK-LE-LABEL: masked_v8f16_post: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vldr d1, [sp, #8] +; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr +; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: bne .LBB18_12 +; CHECK-LE-NEXT: @ %bb.1: @ %else +; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: bmi .LBB18_13 +; CHECK-LE-NEXT: .LBB18_2: @ %else2 +; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: bmi .LBB18_14 +; CHECK-LE-NEXT: .LBB18_3: @ %else4 +; CHECK-LE-NEXT: lsls r1, r2, #28 +; CHECK-LE-NEXT: bmi .LBB18_15 +; CHECK-LE-NEXT: .LBB18_4: @ %else6 +; CHECK-LE-NEXT: lsls r1, r2, #27 +; CHECK-LE-NEXT: bmi .LBB18_16 +; CHECK-LE-NEXT: .LBB18_5: @ %else8 +; CHECK-LE-NEXT: lsls r1, r2, #26 +; CHECK-LE-NEXT: bpl .LBB18_7 +; CHECK-LE-NEXT: .LBB18_6: @ %cond.store9 +; CHECK-LE-NEXT: vmovx.f16 s4, s2 +; CHECK-LE-NEXT: vstr.16 s4, [r0, #10] +; CHECK-LE-NEXT: .LBB18_7: @ %else10 +; CHECK-LE-NEXT: adds r1, r0, #4 +; CHECK-LE-NEXT: lsls r3, r2, #25 +; CHECK-LE-NEXT: bpl .LBB18_9 +; CHECK-LE-NEXT: @ %bb.8: @ %cond.store11 +; CHECK-LE-NEXT: vstr.16 s3, [r0, #12] +; CHECK-LE-NEXT: .LBB18_9: @ %else12 +; CHECK-LE-NEXT: lsls r2, r2, #24 +; CHECK-LE-NEXT: bpl .LBB18_11 +; CHECK-LE-NEXT: @ %bb.10: @ %cond.store13 +; CHECK-LE-NEXT: vmovx.f16 s0, s3 +; CHECK-LE-NEXT: vstr.16 s0, [r0, #14] +; CHECK-LE-NEXT: .LBB18_11: @ %else14 +; CHECK-LE-NEXT: mov r0, r1 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .LBB18_12: @ %cond.store +; CHECK-LE-NEXT: vstr.16 s0, [r0] +; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: bpl .LBB18_2 +; CHECK-LE-NEXT: .LBB18_13: @ %cond.store1 +; CHECK-LE-NEXT: vmovx.f16 s4, s0 +; CHECK-LE-NEXT: vstr.16 s4, [r0, #2] +; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: bpl .LBB18_3 +; CHECK-LE-NEXT: .LBB18_14: @ %cond.store3 +; CHECK-LE-NEXT: vstr.16 s1, [r0, #4] +; CHECK-LE-NEXT: lsls r1, r2, #28 +; CHECK-LE-NEXT: bpl .LBB18_4 +; CHECK-LE-NEXT: .LBB18_15: @ %cond.store5 +; CHECK-LE-NEXT: vmovx.f16 s4, s1 +; CHECK-LE-NEXT: vstr.16 s4, [r0, #6] +; CHECK-LE-NEXT: lsls r1, r2, #27 +; CHECK-LE-NEXT: bpl .LBB18_5 +; CHECK-LE-NEXT: .LBB18_16: @ %cond.store7 +; CHECK-LE-NEXT: vstr.16 s2, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r2, #26 +; CHECK-LE-NEXT: bmi .LBB18_6 +; CHECK-LE-NEXT: b .LBB18_7 +; +; CHECK-BE-LABEL: masked_v8f16_post: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vldr d1, [sp, #8] +; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr +; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: vldrh.u16 q0, [r1] +; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: bne .LBB18_12 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: bmi .LBB18_13 +; CHECK-BE-NEXT: .LBB18_2: @ %else2 +; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: bmi .LBB18_14 +; CHECK-BE-NEXT: .LBB18_3: @ %else4 +; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: bmi .LBB18_15 +; CHECK-BE-NEXT: .LBB18_4: @ %else6 +; CHECK-BE-NEXT: lsls r1, r2, #27 +; CHECK-BE-NEXT: bmi .LBB18_16 +; CHECK-BE-NEXT: .LBB18_5: @ %else8 +; CHECK-BE-NEXT: lsls r1, r2, #26 +; CHECK-BE-NEXT: bpl .LBB18_7 +; CHECK-BE-NEXT: .LBB18_6: @ %cond.store9 +; CHECK-BE-NEXT: vmovx.f16 s4, s2 +; CHECK-BE-NEXT: vstr.16 s4, [r0, #10] +; CHECK-BE-NEXT: .LBB18_7: @ %else10 +; CHECK-BE-NEXT: adds r1, r0, #4 +; CHECK-BE-NEXT: lsls r3, r2, #25 +; CHECK-BE-NEXT: bpl .LBB18_9 +; CHECK-BE-NEXT: @ %bb.8: @ %cond.store11 +; CHECK-BE-NEXT: vstr.16 s3, [r0, #12] +; CHECK-BE-NEXT: .LBB18_9: @ %else12 +; CHECK-BE-NEXT: lsls r2, r2, #24 +; CHECK-BE-NEXT: bpl .LBB18_11 +; CHECK-BE-NEXT: @ %bb.10: @ %cond.store13 +; CHECK-BE-NEXT: vmovx.f16 s0, s3 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #14] +; CHECK-BE-NEXT: .LBB18_11: @ %else14 +; CHECK-BE-NEXT: mov r0, r1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB18_12: @ %cond.store +; CHECK-BE-NEXT: vstr.16 s0, [r0] +; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: bpl .LBB18_2 +; CHECK-BE-NEXT: .LBB18_13: @ %cond.store1 +; CHECK-BE-NEXT: vmovx.f16 s4, s0 +; CHECK-BE-NEXT: vstr.16 s4, [r0, #2] +; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: bpl .LBB18_3 +; CHECK-BE-NEXT: .LBB18_14: @ %cond.store3 +; CHECK-BE-NEXT: vstr.16 s1, [r0, #4] +; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: bpl .LBB18_4 +; CHECK-BE-NEXT: .LBB18_15: @ %cond.store5 +; CHECK-BE-NEXT: vmovx.f16 s4, s1 +; CHECK-BE-NEXT: vstr.16 s4, [r0, #6] +; CHECK-BE-NEXT: lsls r1, r2, #27 +; CHECK-BE-NEXT: bpl .LBB18_5 +; CHECK-BE-NEXT: .LBB18_16: @ %cond.store7 +; CHECK-BE-NEXT: vstr.16 s2, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r2, #26 +; CHECK-BE-NEXT: bmi .LBB18_6 +; CHECK-BE-NEXT: b .LBB18_7 +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 4 + %2 = bitcast i8* %y to <8 x half>* + %c = icmp sgt <8 x i16> %a, zeroinitializer + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + + +define arm_aapcs_vfpcc void @masked_v2i64(<2 x i64> *%dest, <2 x i64> %a) { +; CHECK-LE-LABEL: masked_v2i64: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: vmov r1, s1 +; CHECK-LE-NEXT: vmov r12, s3 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: vmov r2, s2 +; CHECK-LE-NEXT: sbcs.w r1, r3, r1 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r3, r12 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r3, #1 +; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r3, #1 +; CHECK-LE-NEXT: bfi r3, r1, #0, #1 +; CHECK-LE-NEXT: and r1, r3, #3 +; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: ittt ne +; CHECK-LE-NEXT: vmovne r2, s1 +; CHECK-LE-NEXT: vmovne r3, s0 +; CHECK-LE-NEXT: strdne r3, r2, [r0] +; CHECK-LE-NEXT: lsls r1, r1, #30 +; CHECK-LE-NEXT: ittt mi +; CHECK-LE-NEXT: vmovmi r1, s3 +; CHECK-LE-NEXT: vmovmi r2, s2 +; CHECK-LE-NEXT: strdmi r2, r1, [r0, #8] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v2i64: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: movs r3, #0 +; CHECK-BE-NEXT: vmov r2, s7 +; CHECK-BE-NEXT: vmov r1, s6 +; CHECK-BE-NEXT: vmov r12, s4 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: vmov r2, s5 +; CHECK-BE-NEXT: sbcs.w r1, r3, r1 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r1, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r3, r12 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r3, #1 +; CHECK-BE-NEXT: bfi r3, r1, #0, #1 +; CHECK-BE-NEXT: and r1, r3, #3 +; CHECK-BE-NEXT: lsls r2, r3, #31 +; CHECK-BE-NEXT: bne .LBB19_3 +; CHECK-BE-NEXT: @ %bb.1: @ %else +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: bmi .LBB19_4 +; CHECK-BE-NEXT: .LBB19_2: @ %else2 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .LBB19_3: @ %cond.store +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmov r2, s5 +; CHECK-BE-NEXT: vmov r3, s4 +; CHECK-BE-NEXT: strd r3, r2, [r0] +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: bpl .LBB19_2 +; CHECK-BE-NEXT: .LBB19_4: @ %cond.store1 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmov r1, s7 +; CHECK-BE-NEXT: vmov r2, s6 +; CHECK-BE-NEXT: strd r2, r1, [r0, #8] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <2 x i64> %a, zeroinitializer + call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %a, <2 x i64>* %dest, i32 8, <2 x i1> %c) + ret void +} + +define arm_aapcs_vfpcc void @masked_v2f64(<2 x double> *%dest, <2 x double> %a, <2 x i64> %b) { +; CHECK-LE-LABEL: masked_v2f64: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: vmov r1, s5 +; CHECK-LE-NEXT: vmov r12, s7 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: vmov r2, s6 +; CHECK-LE-NEXT: sbcs.w r1, r3, r1 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r3, r12 +; CHECK-LE-NEXT: it lt +; CHECK-LE-NEXT: movlt r3, #1 +; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r3, #1 +; CHECK-LE-NEXT: bfi r3, r1, #0, #1 +; CHECK-LE-NEXT: and r1, r3, #3 +; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vstrne d0, [r0] +; CHECK-LE-NEXT: lsls r1, r1, #30 +; CHECK-LE-NEXT: it mi +; CHECK-LE-NEXT: vstrmi d1, [r0, #8] +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: masked_v2f64: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: movs r3, #0 +; CHECK-BE-NEXT: vmov r2, s11 +; CHECK-BE-NEXT: vmov r1, s10 +; CHECK-BE-NEXT: vmov r12, s8 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: vmov r2, s9 +; CHECK-BE-NEXT: sbcs.w r1, r3, r1 +; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r1, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: sbcs.w r2, r3, r12 +; CHECK-BE-NEXT: it lt +; CHECK-BE-NEXT: movlt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r3, #1 +; CHECK-BE-NEXT: bfi r3, r1, #0, #1 +; CHECK-BE-NEXT: and r1, r3, #3 +; CHECK-BE-NEXT: lsls r2, r3, #31 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: vstrne d0, [r0] +; CHECK-BE-NEXT: lsls r1, r1, #30 +; CHECK-BE-NEXT: it mi +; CHECK-BE-NEXT: vstrmi d1, [r0, #8] +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp sgt <2 x i64> %b, zeroinitializer + call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %a, <2 x double>* %dest, i32 8, <2 x i1> %c) + ret void +} + + +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) +declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll index 4aa91c9f2eb1b..0b950ad62df62 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll @@ -384,4 +384,24 @@ entry: ret <2 x i64> %s } - +define arm_aapcs_vfpcc <4 x i32> @vpnot_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: vpnot_v4i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s32 lt, q0, zr +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.s32 gt, q1, zr +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.i32 eq, q2, zr +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c1 = icmp slt <4 x i32> %a, zeroinitializer + %c2 = icmp sgt <4 x i32> %b, zeroinitializer + %c3 = icmp eq <4 x i32> %c, zeroinitializer + %o1 = and <4 x i1> %c1, %c2 + %o2 = xor <4 x i1> %o1, + %o = and <4 x i1> %c3, %o2 + %s = select <4 x i1> %o, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block.mir index 07f102cbc5668..1fc13c8aba0c5 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-block.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass arm-mve-vpt %s -o - | FileCheck %s --- | @@ -61,9 +62,14 @@ body: | bb.0.entry: liveins: $q0, $q1, $q2, $r0 - ; CHECK: MVE_VPST 8, implicit-def $p0 - ; CHECK-NEXT: $q0 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 - + ; CHECK-LABEL: name: test_vminnmq_m_f32_v2 + ; CHECK: liveins: $q0, $q1, $q2, $r0 + ; CHECK: $vpr = VMSR_P0 killed $r0, 14, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q1, implicit killed $q2, implicit killed $vpr, implicit killed $q0 { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + ; CHECK: tBX_RET 14, $noreg, implicit internal $q0 + ; CHECK: } $vpr = VMSR_P0 killed $r0, 14, $noreg renamable $q0 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 tBX_RET 14, $noreg, implicit $q0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block2.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block2.mir index 7146ecd927f69..1e14336f4c943 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-block2.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block2.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass arm-mve-vpt %s -o - | FileCheck %s --- | @@ -64,10 +65,16 @@ body: | bb.0.entry: liveins: $q0, $q1, $q2, $q3, $r0 - ; CHECK: MVE_VPST 4, implicit-def $p0 - ; CHECK-NEXT: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 - ; CHECK-NEXT: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 - + ; CHECK-LABEL: name: test_vminnmq_m_f32_v2 + ; CHECK: liveins: $q0, $q1, $q2, $q3, $r0 + ; CHECK: $vpr = VMSR_P0 killed $r0, 14, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit-def dead $q1, implicit-def $d2, implicit-def $s4, implicit-def $s5, implicit-def $d3, implicit-def $s6, implicit-def $s7, implicit killed $q2, implicit killed $q3, implicit killed $vpr, implicit killed $q0, implicit killed $q1 { + ; CHECK: MVE_VPST 4, implicit-def $p0 + ; CHECK: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q2, renamable $q3, 1, renamable $vpr, killed renamable $q0 + ; CHECK: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q0, killed renamable $q3, 1, killed renamable $vpr, killed renamable $q1 + ; CHECK: $q0 = MVE_VORR internal killed $q1, internal killed $q1, 0, $noreg, internal undef $q0 + ; CHECK: } + ; CHECK: tBX_RET 14, $noreg, implicit $q0 $vpr = VMSR_P0 killed $r0, 14, $noreg renamable $q0 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q2, renamable $q3, 1, renamable $vpr, killed renamable $q0 renamable $q1 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q0, killed renamable $q3, 1, killed renamable $vpr, killed renamable $q1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block3.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block3.mir index 2b8a4dda66373..319846942f45b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-block3.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block3.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass arm-mve-vpt %s -o - | FileCheck %s --- | @@ -65,12 +66,18 @@ body: | bb.0.entry: liveins: $q0, $q1, $q2, $q3, $r0 - ; CHECK: MVE_VPST 1, implicit-def $p0 - ; CHECK-NEXT: renamable $q2 = nnan ninf nsz MVE_VMINNMf32 - ; CHECK-NEXT: renamable $q2 = nnan ninf nsz MVE_VMINNMf32 - ; CHECK-NEXT: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 - ; CHECK-NEXT: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 - + ; CHECK-LABEL: name: test_vminnmq_m_f32_v2 + ; CHECK: liveins: $q0, $q1, $q2, $q3, $r0 + ; CHECK: $vpr = VMSR_P0 killed $r0, 14, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $q2, implicit-def $d4, implicit-def $s8, implicit-def $s9, implicit-def $d5, implicit-def $s10, implicit-def $s11, implicit-def dead $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit-def $q1, implicit-def $d2, implicit-def $s4, implicit-def $s5, implicit-def $d3, implicit-def $s6, implicit-def $s7, implicit killed $q2, implicit killed $q3, implicit killed $vpr, implicit killed $q0, implicit killed $q1 { + ; CHECK: MVE_VPST 1, implicit-def $p0 + ; CHECK: renamable $q2 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q2, renamable $q3, 1, renamable $vpr, undef renamable $q2 + ; CHECK: renamable $q2 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q2, internal renamable $q2, 1, renamable $vpr, internal undef renamable $q2 + ; CHECK: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q2, renamable $q3, 1, renamable $vpr, killed renamable $q0 + ; CHECK: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q0, killed renamable $q3, 1, killed renamable $vpr, killed renamable $q1 + ; CHECK: } + ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 + ; CHECK: tBX_RET 14, $noreg, implicit $q0 $vpr = VMSR_P0 killed $r0, 14, $noreg renamable $q2 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q2, renamable $q3, 1, renamable $vpr, undef renamable $q2 renamable $q2 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q2, renamable $q2, 1, renamable $vpr, undef renamable $q2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block4.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block4.mir index 389fb341fd91d..efc30a2a84e7d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-block4.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block4.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass arm-mve-vpt %s -o - | FileCheck %s --- | @@ -66,18 +67,22 @@ body: | bb.0.entry: liveins: $q0, $q1, $q2, $q3, $r0 - ; CHECK: MVE_VPST 1, implicit-def $p0 - ; CHECK-NEXT: renamable $q2 = nnan ninf nsz MVE_VMINNMf32 - ; CHECK-NEXT: renamable $q2 = nnan ninf nsz MVE_VMINNMf32 - ; CHECK-NEXT: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 - ; CHECK-NEXT: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 - ; CHECK-NEXT: } - ; CHECK-NEXT: BUNDLE {{.*}} { - ; CHECK-NEXT: MVE_VPST 8, implicit-def $p0 - ; CHECK-NEXT: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 - ; CHECK-NEXT: $q0 = MVE_VORR - ; CHECK-NEXT: } - + ; CHECK-LABEL: name: test_vminnmq_m_f32_v2 + ; CHECK: liveins: $q0, $q1, $q2, $q3, $r0 + ; CHECK: $vpr = VMSR_P0 killed $r0, 14, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $q2, implicit-def $d4, implicit-def $s8, implicit-def $s9, implicit-def $d5, implicit-def $s10, implicit-def $s11, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q2, implicit $q3, implicit $vpr, implicit killed $q0 { + ; CHECK: MVE_VPST 1, implicit-def $p0 + ; CHECK: renamable $q2 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q2, renamable $q3, 1, renamable $vpr, undef renamable $q2 + ; CHECK: renamable $q2 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q2, internal renamable $q2, 1, renamable $vpr, internal undef renamable $q2 + ; CHECK: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q2, renamable $q3, 1, renamable $vpr, killed renamable $q0 + ; CHECK: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q0, renamable $q3, 1, renamable $vpr, internal undef renamable $q0 + ; CHECK: } + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $q1, implicit-def $d2, implicit-def $s4, implicit-def $s5, implicit-def $d3, implicit-def $s6, implicit-def $s7, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q0, implicit killed $q3, implicit killed $vpr, implicit killed $q1 { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q0, killed renamable $q3, 1, killed renamable $vpr, killed renamable $q1 + ; CHECK: $q0 = MVE_VORR internal killed $q1, internal killed $q1, 0, $noreg, undef $q0 + ; CHECK: } + ; CHECK: tBX_RET 14, $noreg, implicit $q0 $vpr = VMSR_P0 killed $r0, 14, $noreg renamable $q2 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q2, renamable $q3, 1, renamable $vpr, undef renamable $q2 renamable $q2 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q2, renamable $q2, 1, renamable $vpr, undef renamable $q2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block5.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block5.mir index c0c90903640e6..cf127f8e12761 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-block5.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block5.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass arm-mve-vpt %s -o - | FileCheck %s --- | @@ -63,19 +64,22 @@ body: | bb.0.entry: liveins: $q0, $q1, $q2, $r0 - ; CHECK: BUNDLE {{.*}} { - ; CHECK-NEXT: MVE_VPST 4, implicit-def $p0 - ; CHECK-NEXT: renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 - ; CHECK-NEXT: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q3, internal renamable $q3, 1, renamable $vpr, undef renamable $q1 - ; CHECK-NEXT: $q3 = MVE_VORR $q0, $q0, 0, $noreg, internal undef $q3 - ; CHECK-NEXT: } - ; CHECK-NEXT: BUNDLE {{.*}} { - ; CHECK-NEXT: MVE_VPST 4, implicit-def $p0 - ; CHECK-NEXT: renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 - ; CHECK-NEXT: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q3, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 - ; CHECK-NEXT: tBX_RET 14, $noreg, implicit internal $q0 - ; CHECK-NEXT: } - + ; CHECK-LABEL: name: test_vminnmq_m_f32_v2 + ; CHECK: liveins: $q0, $q1, $q2, $r0 + ; CHECK: $vpr = VMSR_P0 killed $r0, 14, $noreg + ; CHECK: $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def $p0, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit-def $q1, implicit-def $d2, implicit-def $s4, implicit-def $s5, implicit-def $d3, implicit-def $s6, implicit-def $s7, implicit killed $q1, implicit $q2, implicit $vpr, implicit killed $q3, implicit $q0 { + ; CHECK: MVE_VPST 4, implicit-def $p0 + ; CHECK: renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 + ; CHECK: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q3, internal renamable $q3, 1, renamable $vpr, undef renamable $q1 + ; CHECK: $q3 = MVE_VORR $q0, $q0, 0, $noreg, internal undef $q3 + ; CHECK: } + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q1, implicit killed $q2, implicit killed $vpr, implicit killed $q3, implicit killed $q0 { + ; CHECK: MVE_VPST 4, implicit-def $p0 + ; CHECK: renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 + ; CHECK: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q3, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + ; CHECK: tBX_RET 14, $noreg, implicit internal $q0 + ; CHECK: } $vpr = VMSR_P0 killed $r0, 14, $noreg $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block6.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block6.mir index 7060cdca3e355..c42457a4b7687 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-block6.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block6.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass arm-mve-vpt %s -o - | FileCheck %s --- | @@ -63,17 +64,20 @@ body: | bb.0.entry: liveins: $q0, $q1, $q2, $r0, $r1 - ; CHECK: BUNDLE {{.*}} { - ; CHECK-NEXT: MVE_VPST 8, implicit-def $p0 - ; CHECK-NEXT: renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, killed renamable $vpr, killed renamable $q3 - ; CHECK-NEXT: $vpr = VMSR_P0 killed $r1, 14, $noreg - ; CHECK-NEXT: } - ; CHECK-NEXT: BUNDLE {{.*}} { - ; CHECK-NEXT: MVE_VPST 8, implicit-def $p0 - ; CHECK-NEXT: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q3, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 - ; CHECK-NEXT: tBX_RET 14, $noreg, implicit internal $q0 - ; CHECK-NEXT: } - + ; CHECK-LABEL: name: test_vminnmq_m_f32_v2 + ; CHECK: liveins: $q0, $q1, $q2, $r0, $r1 + ; CHECK: $vpr = VMSR_P0 killed $r0, 14, $noreg + ; CHECK: $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def $p0, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit-def $vpr, implicit killed $q1, implicit $q2, implicit killed $vpr, implicit killed $q3, implicit killed $r1 { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, killed renamable $vpr, killed renamable $q3 + ; CHECK: $vpr = VMSR_P0 killed $r1, 14, $noreg + ; CHECK: } + ; CHECK: BUNDLE implicit-def $p0, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q3, implicit killed $q2, implicit killed $vpr, implicit killed $q0 { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q3, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + ; CHECK: tBX_RET 14, $noreg, implicit internal $q0 + ; CHECK: } $vpr = VMSR_P0 killed $r0, 14, $noreg $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, killed renamable $vpr, killed renamable $q3 diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block7.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block7.mir new file mode 100644 index 0000000000000..79e3d1e573779 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block7.mir @@ -0,0 +1,102 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass arm-mve-vpt %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-arm-none-eabi" + + define hidden arm_aapcs_vfpcc <4 x float> @test_vminnmq_m_f32_v2(<4 x float> %inactive1, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 { + entry: + %conv.i = zext i16 %p to i32 + %0 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> %inactive1, <4 x float> %a, <4 x float> %b, i32 %conv.i) #2 + %1 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> undef, <4 x float> %0, <4 x float> %0, i32 %conv.i) #2 + br label %bb2 + bb2: + %2 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> %inactive1, <4 x float> %1, <4 x float> %b, i32 %conv.i) #2 + %3 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> %inactive1, <4 x float> %2, <4 x float> %b, i32 %conv.i) #2 + ret <4 x float> %3 + } + + declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1 + + attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { nounwind readnone } + attributes #2 = { nounwind } + +... +--- +name: test_vminnmq_m_f32_v2 +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$q0', virtual-reg: '' } + - { reg: '$q1', virtual-reg: '' } + - { reg: '$q2', virtual-reg: '' } + - { reg: '$r0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +constants: [] +body: | + ; CHECK-LABEL: name: test_vminnmq_m_f32_v2 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $q0, $q1, $q2, $r0 + ; CHECK: $vpr = VMSR_P0 killed $r0, 14, $noreg + ; CHECK: $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit-def $q1, implicit-def $d2, implicit-def $s4, implicit-def $s5, implicit-def $d3, implicit-def $s6, implicit-def $s7, implicit-def $vpr, implicit killed $q1, implicit $q2, implicit killed $vpr, implicit killed $q3 { + ; CHECK: MVE_VPST 4, implicit-def $p0 + ; CHECK: renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 + ; CHECK: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q3, internal renamable $q3, 1, renamable $vpr, undef renamable $q1 + ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + ; CHECK: } + ; CHECK: bb.1.bb2: + ; CHECK: liveins: $q0, $q1, $q2, $q3, $vpr + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q1, implicit killed $q2, implicit killed $vpr, implicit killed $q3, implicit killed $q0 { + ; CHECK: MVE_VPST 4, implicit-def $p0 + ; CHECK: renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 + ; CHECK: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q3, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + ; CHECK: tBX_RET 14, $noreg, implicit internal $q0 + ; CHECK: } + bb.0.entry: + liveins: $q0, $q1, $q2, $r0 + + $vpr = VMSR_P0 killed $r0, 14, $noreg + $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 + renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 + renamable $q1 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q3, renamable $q3, 1, renamable $vpr, undef renamable $q1 + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + + bb.1.bb2: + liveins: $q0, $q1, $q2, $q3, $vpr + + renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 + renamable $q0 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q3, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + tBX_RET 14, $noreg, implicit $q0 + +... diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block8.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block8.mir new file mode 100644 index 0000000000000..4cd2f793bc7a9 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block8.mir @@ -0,0 +1,97 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass arm-mve-vpt %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-arm-none-eabi" + + define hidden arm_aapcs_vfpcc <4 x float> @test_vminnmq_m_f32_v2(<4 x float> %inactive1, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 { + entry: + %conv.i = zext i16 %p to i32 + %0 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> %inactive1, <4 x float> %a, <4 x float> %b, i32 %conv.i) #2 + %1 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> undef, <4 x float> %0, <4 x float> %0, i32 %conv.i) #2 + %2 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> %inactive1, <4 x float> %1, <4 x float> %b, i32 %conv.i) #2 + %3 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> %inactive1, <4 x float> %2, <4 x float> %b, i32 %conv.i) #2 + ret <4 x float> %3 + } + + declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1 + + attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { nounwind readnone } + attributes #2 = { nounwind } + +... +--- +name: test_vminnmq_m_f32_v2 +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$q0', virtual-reg: '' } + - { reg: '$q1', virtual-reg: '' } + - { reg: '$q2', virtual-reg: '' } + - { reg: '$r0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +constants: [] +body: | + bb.0.entry: + liveins: $q0, $q1, $q2, $r0 + + ; CHECK-LABEL: name: test_vminnmq_m_f32_v2 + ; CHECK: liveins: $q0, $q1, $q2, $r0 + ; CHECK: $vpr = VMSR_P0 killed $r0, 14, $noreg + ; CHECK: $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def $p0, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit-def $vpr, implicit killed $q1, implicit $q2, implicit killed $vpr, implicit killed $q3 { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 + ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + ; CHECK: } + ; CHECK: BUNDLE implicit-def $p0, implicit-def $q1, implicit-def $d2, implicit-def $s4, implicit-def $s5, implicit-def $d3, implicit-def $s6, implicit-def $s7, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit killed $q3, implicit $vpr, implicit undef $q1, implicit $q0 { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q3, renamable $q3, 1, renamable $vpr, undef renamable $q1 + ; CHECK: $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 + ; CHECK: } + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q1, implicit killed $q2, implicit killed $vpr, implicit killed $q3, implicit killed $q0 { + ; CHECK: MVE_VPST 4, implicit-def $p0 + ; CHECK: renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 + ; CHECK: renamable $q0 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q3, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + ; CHECK: tBX_RET 14, $noreg, implicit internal $q0 + ; CHECK: } + $vpr = VMSR_P0 killed $r0, 14, $noreg + $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 + renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $q1 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q3, renamable $q3, 1, renamable $vpr, undef renamable $q1 + $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 + renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 + renamable $q0 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q3, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + tBX_RET 14, $noreg, implicit $q0 + +... diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-nots.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-nots.mir new file mode 100644 index 0000000000000..26ef61fef2b6f --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-nots.mir @@ -0,0 +1,272 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1-m.main-none-eabi -mattr=+mve -run-pass arm-mve-vpt %s -o - | FileCheck %s + +--- | + + define arm_aapcs_vfpcc <4 x i32> @vpnot(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + entry: + %0 = icmp slt <4 x i32> %a, zeroinitializer + %c2 = icmp sgt <4 x i32> %b, zeroinitializer + %c3 = icmp eq <4 x i32> %c, zeroinitializer + %o1 = and <4 x i1> %0, %c2 + %o2 = xor <4 x i1> %o1, + %o = and <4 x i1> %c3, %o2 + %s = select <4 x i1> %o, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s + } + + define arm_aapcs_vfpcc <4 x i32> @vpnot_end(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + entry: + %0 = icmp slt <4 x i32> %a, zeroinitializer + %c2 = icmp sgt <4 x i32> %b, zeroinitializer + %c3 = icmp eq <4 x i32> %c, zeroinitializer + %o1 = and <4 x i1> %0, %c2 + %o2 = xor <4 x i1> %o1, + br label %bb2 + bb2: + %o = and <4 x i1> %c3, %o2 + %s = select <4 x i1> %o, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s + } + + define arm_aapcs_vfpcc <4 x i32> @vpnot_two(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + entry: + unreachable + } + define arm_aapcs_vfpcc <4 x i32> @vpnot_lots(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + entry: + unreachable + } + define arm_aapcs_vfpcc <4 x i32> @vpnot_first(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + entry: + unreachable + } + define arm_aapcs_vfpcc <4 x i32> @vpnot_many(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + entry: + unreachable + } + +... +--- +name: vpnot +alignment: 2 +tracksRegLiveness: true +liveins: + - { reg: '$q0', virtual-reg: '' } + - { reg: '$q1', virtual-reg: '' } + - { reg: '$q2', virtual-reg: '' } +body: | + bb.0.entry: + liveins: $q0, $q1, $q2 + + ; CHECK-LABEL: name: vpnot + ; CHECK: liveins: $q0, $q1, $q2 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def $vpr, implicit $q1, implicit $zr, implicit killed $vpr { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + ; CHECK: renamable $vpr = MVE_VPNOT internal killed renamable $vpr, 0, $noreg + ; CHECK: } + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $vpr, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q2, implicit $zr, implicit killed $vpr, implicit killed $q0, implicit killed $q1 { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, internal killed renamable $vpr + ; CHECK: } + ; CHECK: tBX_RET 14, $noreg, implicit $q0 + renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + tBX_RET 14, $noreg, implicit $q0 + +... +--- +name: vpnot_end +alignment: 2 +tracksRegLiveness: true +liveins: + - { reg: '$q0', virtual-reg: '' } + - { reg: '$q1', virtual-reg: '' } + - { reg: '$q2', virtual-reg: '' } +body: | + ; CHECK-LABEL: name: vpnot_end + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $q0, $q1, $q2 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def $vpr, implicit $q1, implicit $zr, implicit killed $vpr { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + ; CHECK: renamable $vpr = MVE_VPNOT internal killed renamable $vpr, 0, $noreg + ; CHECK: } + ; CHECK: bb.1.bb2: + ; CHECK: liveins: $q0, $q1, $q2, $vpr + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $vpr, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q2, implicit $zr, implicit killed $vpr, implicit killed $q0, implicit killed $q1 { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, internal killed renamable $vpr + ; CHECK: } + ; CHECK: tBX_RET 14, $noreg, implicit $q0 + bb.0.entry: + liveins: $q0, $q1, $q2 + + renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + + bb.1.bb2: + liveins: $q0, $q1, $q2, $vpr + + renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + tBX_RET 14, $noreg, implicit $q0 + +... +--- +name: vpnot_two +alignment: 2 +tracksRegLiveness: true +liveins: + - { reg: '$q0', virtual-reg: '' } + - { reg: '$q1', virtual-reg: '' } + - { reg: '$q2', virtual-reg: '' } +body: | + bb.0.entry: + liveins: $q0, $q1, $q2 + + ; CHECK-LABEL: name: vpnot_two + ; CHECK: liveins: $q0, $q1, $q2 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def $vpr, implicit $q1, implicit $zr, implicit killed $vpr { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + ; CHECK: renamable $vpr = MVE_VPNOT internal killed renamable $vpr, 0, $noreg + ; CHECK: } + ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $vpr, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q2, implicit $zr, implicit killed $vpr, implicit killed $q0, implicit killed $q1 { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, internal killed renamable $vpr + ; CHECK: } + ; CHECK: tBX_RET 14, $noreg, implicit $q0 + renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + tBX_RET 14, $noreg, implicit $q0 + +... +--- +name: vpnot_lots +alignment: 2 +tracksRegLiveness: true +liveins: + - { reg: '$q0', virtual-reg: '' } + - { reg: '$q1', virtual-reg: '' } + - { reg: '$q2', virtual-reg: '' } +body: | + bb.0.entry: + liveins: $q0, $q1, $q2 + + ; CHECK-LABEL: name: vpnot_lots + ; CHECK: liveins: $q0, $q1, $q2 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def $vpr, implicit $q1, implicit $zr, implicit killed $vpr { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + ; CHECK: renamable $vpr = MVE_VPNOT internal killed renamable $vpr, 0, $noreg + ; CHECK: } + ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $vpr, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q2, implicit $zr, implicit killed $vpr, implicit killed $q0, implicit killed $q1 { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, internal killed renamable $vpr + ; CHECK: } + ; CHECK: tBX_RET 14, $noreg, implicit $q0 + renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + tBX_RET 14, $noreg, implicit $q0 + +... +--- +name: vpnot_first +alignment: 2 +tracksRegLiveness: true +liveins: + - { reg: '$q0', virtual-reg: '' } + - { reg: '$q1', virtual-reg: '' } + - { reg: '$q2', virtual-reg: '' } +body: | + bb.0.entry: + liveins: $q0, $q1, $q2 + + ; CHECK-LABEL: name: vpnot_first + ; CHECK: liveins: $q0, $q1, $q2 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def dead $vpr, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $q1, implicit $zr, implicit killed $vpr, implicit killed $q2, implicit killed $q0 { + ; CHECK: MVE_VPST 4, implicit-def $p0 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + ; CHECK: renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, internal killed renamable $vpr + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, internal killed renamable $vpr + ; CHECK: } + ; CHECK: tBX_RET 14, $noreg, implicit $q0 + renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + tBX_RET 14, $noreg, implicit $q0 + +... +--- +name: vpnot_many +alignment: 2 +tracksRegLiveness: true +liveins: + - { reg: '$q0', virtual-reg: '' } + - { reg: '$q1', virtual-reg: '' } + - { reg: '$q2', virtual-reg: '' } +body: | + bb.0.entry: + liveins: $q0, $q1, $q2 + + ; CHECK-LABEL: name: vpnot_many + ; CHECK: liveins: $q0, $q1, $q2 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + ; CHECK: BUNDLE implicit-def $p0, implicit-def $vpr, implicit $q1, implicit $zr, implicit killed $vpr { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + ; CHECK: renamable $vpr = MVE_VPNOT internal killed renamable $vpr, 0, $noreg + ; CHECK: } + ; CHECK: BUNDLE implicit-def $p0, implicit-def $vpr, implicit killed $q2, implicit $zr, implicit killed $vpr { + ; CHECK: MVE_VPST 8, implicit-def $p0 + ; CHECK: renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr + ; CHECK: renamable $vpr = MVE_VPNOT internal killed renamable $vpr, 0, $noreg + ; CHECK: } + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + ; CHECK: tBX_RET 14, $noreg, implicit $q0 + renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + tBX_RET 14, $noreg, implicit $q0 + +... diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index e3e80ef545786..100a014619add 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -1898,9 +1898,7 @@ define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) { ; AVX512-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 ; AVX512-NEXT: vpextrd $2, %xmm2, %eax ; AVX512-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vpextrd $3, %xmm0, %eax -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-NEXT: vpextrd $1, %xmm2, %eax ; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX512-NEXT: vpextrd $3, %xmm2, %eax diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 949af8c34df43..df2c2f55af45e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2821,67 +2821,28 @@ define <4 x i32> @PR41545(<4 x i32> %a0, <16 x i8> %a1) { } define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) { -; SSE2-LABEL: shuffle_extract_insert: -; SSE2: # %bb.0: -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: pextrw $4, %xmm0, %r8d -; SSE2-NEXT: pextrw $5, %xmm0, %edx -; SSE2-NEXT: pextrw $6, %xmm0, %esi -; SSE2-NEXT: movd %xmm0, %edi -; SSE2-NEXT: pextrw $7, %xmm0, %ecx -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pinsrw $2, %edi, %xmm0 -; SSE2-NEXT: pinsrw $3, %eax, %xmm0 -; SSE2-NEXT: pinsrw $4, %esi, %xmm0 -; SSE2-NEXT: pinsrw $5, %edx, %xmm0 -; SSE2-NEXT: pinsrw $6, %r8d, %xmm0 -; SSE2-NEXT: pinsrw $7, %ecx, %xmm0 -; SSE2-NEXT: retq +; SSE-LABEL: shuffle_extract_insert: +; SSE: # %bb.0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: retq ; -; SSSE3-LABEL: shuffle_extract_insert: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pextrw $3, %xmm0, %eax -; SSSE3-NEXT: pextrw $4, %xmm0, %r8d -; SSSE3-NEXT: pextrw $5, %xmm0, %edx -; SSSE3-NEXT: pextrw $6, %xmm0, %esi -; SSSE3-NEXT: movd %xmm0, %edi -; SSSE3-NEXT: pextrw $7, %xmm0, %ecx -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSSE3-NEXT: pinsrw $2, %edi, %xmm0 -; SSSE3-NEXT: pinsrw $3, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $4, %esi, %xmm0 -; SSSE3-NEXT: pinsrw $5, %edx, %xmm0 -; SSSE3-NEXT: pinsrw $6, %r8d, %xmm0 -; SSSE3-NEXT: pinsrw $7, %ecx, %xmm0 -; SSSE3-NEXT: retq +; AVX1-LABEL: shuffle_extract_insert: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; AVX1-NEXT: retq ; -; SSE41-LABEL: shuffle_extract_insert: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrw $4, %xmm0, %eax -; SSE41-NEXT: pextrw $6, %xmm0, %ecx -; SSE41-NEXT: movd %xmm0, %edx -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,1,2,3,4,5,6,7] -; SSE41-NEXT: pinsrw $2, %edx, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] -; SSE41-NEXT: pinsrw $4, %ecx, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] -; SSE41-NEXT: pinsrw $6, %eax, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] -; SSE41-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_extract_insert: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; AVX2-SLOW-NEXT: retq ; -; AVX-LABEL: shuffle_extract_insert: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $4, %xmm0, %eax -; AVX-NEXT: vpextrw $6, %xmm0, %ecx -; AVX-NEXT: vmovd %xmm0, %edx -; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] -; AVX-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] -; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] -; AVX-NEXT: retq +; AVX2-FAST-LABEL: shuffle_extract_insert: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15] +; AVX2-FAST-NEXT: retq %a0 = extractelement <8 x i16> %a, i32 0 %a1 = extractelement <8 x i16> %a, i32 1 %a3 = extractelement <8 x i16> %a, i32 3 @@ -2903,68 +2864,36 @@ define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) { define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_extract_insert_double: ; SSE2: # %bb.0: -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: pextrw $4, %xmm0, %r8d -; SSE2-NEXT: pextrw $6, %xmm0, %edx -; SSE2-NEXT: pextrw $3, %xmm1, %esi -; SSE2-NEXT: pextrw $5, %xmm1, %edi -; SSE2-NEXT: pextrw $7, %xmm1, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: pinsrw $2, %eax, %xmm0 -; SSE2-NEXT: pinsrw $3, %esi, %xmm0 -; SSE2-NEXT: pinsrw $4, %edx, %xmm0 -; SSE2-NEXT: pinsrw $5, %edi, %xmm0 -; SSE2-NEXT: pinsrw $6, %r8d, %xmm0 -; SSE2-NEXT: pinsrw $7, %ecx, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_extract_insert_double: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: pextrw $4, %xmm0, %r8d -; SSSE3-NEXT: pextrw $6, %xmm0, %edx -; SSSE3-NEXT: pextrw $3, %xmm1, %esi -; SSSE3-NEXT: pextrw $5, %xmm1, %edi -; SSSE3-NEXT: pextrw $7, %xmm1, %ecx -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,8,9,12,13,12,13,14,15] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: pinsrw $2, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $3, %esi, %xmm0 -; SSSE3-NEXT: pinsrw $4, %edx, %xmm0 -; SSSE3-NEXT: pinsrw $5, %edi, %xmm0 -; SSSE3-NEXT: pinsrw $6, %r8d, %xmm0 -; SSSE3-NEXT: pinsrw $7, %ecx, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_extract_insert_double: ; SSE41: # %bb.0: -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: pextrw $4, %xmm0, %ecx -; SSE41-NEXT: pextrw $6, %xmm0, %edx -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,8,9,12,13,12,13,14,15] ; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE41-NEXT: pinsrw $2, %eax, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; SSE41-NEXT: pinsrw $4, %edx, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; SSE41-NEXT: pinsrw $6, %ecx, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_extract_insert_double: ; AVX: # %bb.0: -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpextrw $4, %xmm0, %ecx -; AVX-NEXT: vpextrw $6, %xmm0, %edx -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,8,9,12,13,12,13,14,15] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] ; AVX-NEXT: retq %a0 = extractelement <8 x i16> %a, i32 0 %a4 = extractelement <8 x i16> %a, i32 4 diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-lshr.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-lshr.ll index 56d4ee78f0826..118186f98a9e1 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-lshr.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-lshr.ll @@ -42,15 +42,9 @@ define i1 @n0(i32 %x, i64 %y, i32 %len) { ; New shift amount would be 16, %x has 16 leading zeros - can fold. define i1 @t1(i64 %y, i32 %len) { ; CHECK-LABEL: @t1( -; CHECK-NEXT: [[T0:%.*]] = sub i32 32, [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl i32 65535, [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add i32 [[LEN]], -16 -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext i32 [[T2]] to i64 -; CHECK-NEXT: [[T3:%.*]] = lshr i64 [[Y:%.*]], [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc i64 [[T3]] to i32 -; CHECK-NEXT: [[T4:%.*]] = and i32 [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne i32 [[T4]], 0 -; CHECK-NEXT: ret i1 [[T5]] +; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[Y:%.*]], 4294901760 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[TMP2]] ; %t0 = sub i32 32, %len %t1 = shl i32 65535, %t0 @@ -65,15 +59,9 @@ define i1 @t1(i64 %y, i32 %len) { ; Note that we indeed look at leading zeros! define i1 @t1_single_bit(i64 %y, i32 %len) { ; CHECK-LABEL: @t1_single_bit( -; CHECK-NEXT: [[T0:%.*]] = sub i32 32, [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl i32 32768, [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add i32 [[LEN]], -16 -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext i32 [[T2]] to i64 -; CHECK-NEXT: [[T3:%.*]] = lshr i64 [[Y:%.*]], [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc i64 [[T3]] to i32 -; CHECK-NEXT: [[T4:%.*]] = and i32 [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne i32 [[T4]], 0 -; CHECK-NEXT: ret i1 [[T5]] +; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[Y:%.*]], 2147483648 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[TMP2]] ; %t0 = sub i32 32, %len %t1 = shl i32 32768, %t0 @@ -112,15 +100,9 @@ define i1 @n2(i64 %y, i32 %len) { ; New shift amount would be 16, %y has 47 leading zeros - can fold. define i1 @t3(i32 %x, i32 %len) { ; CHECK-LABEL: @t3( -; CHECK-NEXT: [[T0:%.*]] = sub i32 32, [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl i32 [[X:%.*]], [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add i32 [[LEN]], -16 -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext i32 [[T2]] to i64 -; CHECK-NEXT: [[T3:%.*]] = lshr i64 131071, [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc i64 [[T3]] to i32 -; CHECK-NEXT: [[T4:%.*]] = and i32 [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne i32 [[T4]], 0 -; CHECK-NEXT: ret i1 [[T5]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[TMP2]] ; %t0 = sub i32 32, %len %t1 = shl i32 %x, %t0 @@ -135,15 +117,9 @@ define i1 @t3(i32 %x, i32 %len) { ; Note that we indeed look at leading zeros! define i1 @t3_singlebit(i32 %x, i32 %len) { ; CHECK-LABEL: @t3_singlebit( -; CHECK-NEXT: [[T0:%.*]] = sub i32 32, [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl i32 [[X:%.*]], [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add i32 [[LEN]], -16 -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext i32 [[T2]] to i64 -; CHECK-NEXT: [[T3:%.*]] = lshr i64 65536, [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc i64 [[T3]] to i32 -; CHECK-NEXT: [[T4:%.*]] = and i32 [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne i32 [[T4]], 0 -; CHECK-NEXT: ret i1 [[T5]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[TMP2]] ; %t0 = sub i32 32, %len %t1 = shl i32 %x, %t0 @@ -189,15 +165,10 @@ define i1 @n4(i32 %x, i32 %len) { ; New shift amount would be 16, minimal count of leading zeros in %x is 16. Ok. define <2 x i1> @t5_vec(<2 x i64> %y, <2 x i32> %len) { ; CHECK-LABEL: @t5_vec( -; CHECK-NEXT: [[T0:%.*]] = sub <2 x i32> , [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl <2 x i32> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <2 x i32> [[LEN]], -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext <2 x i32> [[T2]] to <2 x i64> -; CHECK-NEXT: [[T3:%.*]] = lshr <2 x i64> [[Y:%.*]], [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc <2 x i64> [[T3]] to <2 x i32> -; CHECK-NEXT: [[T4:%.*]] = and <2 x i32> [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne <2 x i32> [[T4]], zeroinitializer -; CHECK-NEXT: ret <2 x i1> [[T5]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[Y:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: ret <2 x i1> [[TMP3]] ; %t0 = sub <2 x i32> , %len %t1 = shl <2 x i32> , %t0 @@ -236,15 +207,9 @@ define <2 x i1> @n6_vec(<2 x i64> %y, <2 x i32> %len) { ; New shift amount would be 16, minimal count of leading zeros in %x is 47. Ok. define <2 x i1> @t7_vec(<2 x i32> %x, <2 x i32> %len) { ; CHECK-LABEL: @t7_vec( -; CHECK-NEXT: [[T0:%.*]] = sub <2 x i32> , [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl <2 x i32> [[X:%.*]], [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <2 x i32> [[LEN]], -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext <2 x i32> [[T2]] to <2 x i64> -; CHECK-NEXT: [[T3:%.*]] = lshr <2 x i64> , [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc <2 x i64> [[T3]] to <2 x i32> -; CHECK-NEXT: [[T4:%.*]] = and <2 x i32> [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne <2 x i32> [[T4]], zeroinitializer -; CHECK-NEXT: ret <2 x i1> [[T5]] +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: ret <2 x i1> [[TMP2]] ; %t0 = sub <2 x i32> , %len %t1 = shl <2 x i32> %x, %t0 @@ -285,15 +250,11 @@ define <2 x i1> @n8_vec(<2 x i32> %x, <2 x i32> %len) { ; Ok if the final shift amount is exactly one less than widest bit width. define i1 @t9_highest_bit(i32 %x, i64 %y, i32 %len) { ; CHECK-LABEL: @t9_highest_bit( -; CHECK-NEXT: [[T0:%.*]] = sub i32 64, [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl i32 [[X:%.*]], [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add i32 [[LEN]], -1 -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext i32 [[T2]] to i64 -; CHECK-NEXT: [[T3:%.*]] = lshr i64 [[Y:%.*]], [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc i64 [[T3]] to i32 -; CHECK-NEXT: [[T4:%.*]] = and i32 [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne i32 [[T4]], 0 -; CHECK-NEXT: ret i1 [[T5]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[Y:%.*]], 63 +; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: ret i1 [[TMP4]] ; %t0 = sub i32 64, %len %t1 = shl i32 %x, %t0 @@ -332,15 +293,10 @@ define i1 @t10_almost_highest_bit(i32 %x, i64 %y, i32 %len) { ; Ok if the final shift amount is zero. define i1 @t11_no_shift(i32 %x, i64 %y, i32 %len) { ; CHECK-LABEL: @t11_no_shift( -; CHECK-NEXT: [[T0:%.*]] = sub i32 64, [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl i32 [[X:%.*]], [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add i32 [[LEN]], -64 -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext i32 [[T2]] to i64 -; CHECK-NEXT: [[T3:%.*]] = lshr i64 [[Y:%.*]], [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc i64 [[T3]] to i32 -; CHECK-NEXT: [[T4:%.*]] = and i32 [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne i32 [[T4]], 0 -; CHECK-NEXT: ret i1 [[T5]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: ret i1 [[TMP3]] ; %t0 = sub i32 64, %len %t1 = shl i32 %x, %t0 @@ -427,15 +383,9 @@ define <2 x i1> @n12_bad(<2 x i32> %x, <2 x i64> %y, <2 x i32> %len) { ; Ok if one of the values being shifted is 1 define i1 @t13_x_is_one(i64 %y, i32 %len) { ; CHECK-LABEL: @t13_x_is_one( -; CHECK-NEXT: [[T0:%.*]] = sub i32 32, [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl i32 1, [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add i32 [[LEN]], -16 -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext i32 [[T2]] to i64 -; CHECK-NEXT: [[T3:%.*]] = lshr i64 [[Y:%.*]], [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc i64 [[T3]] to i32 -; CHECK-NEXT: [[T4:%.*]] = and i32 [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne i32 [[T4]], 0 -; CHECK-NEXT: ret i1 [[T5]] +; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[Y:%.*]], 65536 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[TMP2]] ; %t0 = sub i32 32, %len %t1 = shl i32 1, %t0 @@ -449,15 +399,7 @@ define i1 @t13_x_is_one(i64 %y, i32 %len) { } define i1 @t14_x_is_one(i32 %x, i32 %len) { ; CHECK-LABEL: @t14_x_is_one( -; CHECK-NEXT: [[T0:%.*]] = sub i32 32, [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl i32 [[X:%.*]], [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add i32 [[LEN]], -16 -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext i32 [[T2]] to i64 -; CHECK-NEXT: [[T3:%.*]] = lshr i64 1, [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc i64 [[T3]] to i32 -; CHECK-NEXT: [[T4:%.*]] = and i32 [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne i32 [[T4]], 0 -; CHECK-NEXT: ret i1 [[T5]] +; CHECK-NEXT: ret i1 false ; %t0 = sub i32 32, %len %t1 = shl i32 %x, %t0 @@ -472,15 +414,10 @@ define i1 @t14_x_is_one(i32 %x, i32 %len) { define <2 x i1> @t15_vec_x_is_one_or_zero(<2 x i64> %y, <2 x i32> %len) { ; CHECK-LABEL: @t15_vec_x_is_one_or_zero( -; CHECK-NEXT: [[T0:%.*]] = sub <2 x i32> , [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl <2 x i32> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <2 x i32> [[LEN]], -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext <2 x i32> [[T2]] to <2 x i64> -; CHECK-NEXT: [[T3:%.*]] = lshr <2 x i64> [[Y:%.*]], [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc <2 x i64> [[T3]] to <2 x i32> -; CHECK-NEXT: [[T4:%.*]] = and <2 x i32> [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne <2 x i32> [[T4]], zeroinitializer -; CHECK-NEXT: ret <2 x i1> [[T5]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[Y:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: ret <2 x i1> [[TMP3]] ; %t0 = sub <2 x i32> , %len %t1 = shl <2 x i32> , %t0 @@ -494,15 +431,7 @@ define <2 x i1> @t15_vec_x_is_one_or_zero(<2 x i64> %y, <2 x i32> %len) { } define <2 x i1> @t16_vec_y_is_one_or_zero(<2 x i32> %x, <2 x i32> %len) { ; CHECK-LABEL: @t16_vec_y_is_one_or_zero( -; CHECK-NEXT: [[T0:%.*]] = sub <2 x i32> , [[LEN:%.*]] -; CHECK-NEXT: [[T1:%.*]] = shl <2 x i32> [[X:%.*]], [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <2 x i32> [[LEN]], -; CHECK-NEXT: [[T2_WIDE:%.*]] = zext <2 x i32> [[T2]] to <2 x i64> -; CHECK-NEXT: [[T3:%.*]] = lshr <2 x i64> , [[T2_WIDE]] -; CHECK-NEXT: [[T3_TRUNC:%.*]] = trunc <2 x i64> [[T3]] to <2 x i32> -; CHECK-NEXT: [[T4:%.*]] = and <2 x i32> [[T1]], [[T3_TRUNC]] -; CHECK-NEXT: [[T5:%.*]] = icmp ne <2 x i32> [[T4]], zeroinitializer -; CHECK-NEXT: ret <2 x i1> [[T5]] +; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %t0 = sub <2 x i32> , %len %t1 = shl <2 x i32> %x, %t0 @@ -524,15 +453,8 @@ define <2 x i1> @t16_vec_y_is_one_or_zero(<2 x i32> %x, <2 x i32> %len) { ; And that's the main motivational pattern: define i1 @rawspeed_signbit(i64 %storage, i32 %nbits) { ; CHECK-LABEL: @rawspeed_signbit( -; CHECK-NEXT: [[SKIPNBITS:%.*]] = sub nsw i32 64, [[NBITS:%.*]] -; CHECK-NEXT: [[SKIPNBITSWIDE:%.*]] = zext i32 [[SKIPNBITS]] to i64 -; CHECK-NEXT: [[DATAWIDE:%.*]] = lshr i64 [[STORAGE:%.*]], [[SKIPNBITSWIDE]] -; CHECK-NEXT: [[DATA:%.*]] = trunc i64 [[DATAWIDE]] to i32 -; CHECK-NEXT: [[NBITSMINUSONE:%.*]] = add nsw i32 [[NBITS]], -1 -; CHECK-NEXT: [[BITMASK:%.*]] = shl i32 1, [[NBITSMINUSONE]] -; CHECK-NEXT: [[BITMASKED:%.*]] = and i32 [[BITMASK]], [[DATA]] -; CHECK-NEXT: [[ISBITUNSET:%.*]] = icmp eq i32 [[BITMASKED]], 0 -; CHECK-NEXT: ret i1 [[ISBITUNSET]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[STORAGE:%.*]], -1 +; CHECK-NEXT: ret i1 [[TMP1]] ; %skipnbits = sub nsw i32 64, %nbits %skipnbitswide = zext i32 %skipnbits to i64 diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-shl.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-shl.ll index fbad6d1d65190..0c3ee460948c0 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-shl.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-shl.ll @@ -6,8 +6,8 @@ ; we should move shifts to the same hand of 'and', i.e. e.g. rewrite as ; icmp eq/ne (and (((x shift Q) shift K), y)), 0 ; We are only interested in opposite logical shifts here. -; We still can handle the case where there is a truncation between a shift -; and an 'and', but for now only if it's 'shl' - simpler legality check. +; We still can handle the case where there is a truncation between a shift and +; an 'and'. If it's trunc-of-shl - no extra legality check is needed. ;------------------------------------------------------------------------------- ; Basic scalar tests