diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp index 460883b84391fd..c4ecc5286ba9e6 100644 --- a/clang-tools-extra/clangd/ClangdServer.cpp +++ b/clang-tools-extra/clangd/ClangdServer.cpp @@ -80,6 +80,11 @@ struct UpdateIndexCallbacks : public ParsingCallbacks { }); } + void onFailedAST(PathRef Path, std::vector Diags, + PublishFn Publish) override { + Publish([&]() { DiagConsumer.onDiagnosticsReady(Path, Diags); }); + } + void onFileUpdated(PathRef File, const TUStatus &Status) override { DiagConsumer.onFileUpdated(File, Status); } diff --git a/clang-tools-extra/clangd/ClangdUnit.cpp b/clang-tools-extra/clangd/ClangdUnit.cpp index f85cac200da38a..fd5202f3751737 100644 --- a/clang-tools-extra/clangd/ClangdUnit.cpp +++ b/clang-tools-extra/clangd/ClangdUnit.cpp @@ -292,7 +292,8 @@ void dumpAST(ParsedAST &AST, llvm::raw_ostream &OS) { } llvm::Optional -ParsedAST::build(std::unique_ptr CI, +ParsedAST::build(std::unique_ptr CI, + llvm::ArrayRef CompilerInvocationDiags, std::shared_ptr Preamble, std::unique_ptr Buffer, llvm::IntrusiveRefCntPtr VFS, @@ -459,10 +460,15 @@ ParsedAST::build(std::unique_ptr CI, // So just inform the preprocessor of EOF, while keeping everything alive. Clang->getPreprocessor().EndSourceFile(); - std::vector Diags = ASTDiags.take(CTContext.getPointer()); + std::vector Diags = CompilerInvocationDiags; // Add diagnostics from the preamble, if any. if (Preamble) - Diags.insert(Diags.begin(), Preamble->Diags.begin(), Preamble->Diags.end()); + Diags.insert(Diags.end(), Preamble->Diags.begin(), Preamble->Diags.end()); + // Finally, add diagnostics coming from the AST. 
+ { + std::vector D = ASTDiags.take(CTContext.getPointer()); + Diags.insert(Diags.end(), D.begin(), D.end()); + } return ParsedAST(std::move(Preamble), std::move(Clang), std::move(Action), std::move(Tokens), std::move(ParsedDecls), std::move(Diags), std::move(Includes), std::move(CanonIncludes)); @@ -646,6 +652,7 @@ buildPreamble(PathRef FileName, CompilerInvocation &CI, llvm::Optional buildAST(PathRef FileName, std::unique_ptr Invocation, + llvm::ArrayRef CompilerInvocationDiags, const ParseInputs &Inputs, std::shared_ptr Preamble) { trace::Span Tracer("BuildAST"); @@ -661,7 +668,8 @@ buildAST(PathRef FileName, std::unique_ptr Invocation, } return ParsedAST::build( - std::make_unique(*Invocation), Preamble, + std::make_unique(*Invocation), + CompilerInvocationDiags, Preamble, llvm::MemoryBuffer::getMemBufferCopy(Inputs.Contents, FileName), std::move(VFS), Inputs.Index, Inputs.Opts); } diff --git a/clang-tools-extra/clangd/ClangdUnit.h b/clang-tools-extra/clangd/ClangdUnit.h index f5b18f97387f9d..3af34b019648de 100644 --- a/clang-tools-extra/clangd/ClangdUnit.h +++ b/clang-tools-extra/clangd/ClangdUnit.h @@ -25,6 +25,7 @@ #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Core/Replacement.h" #include "clang/Tooling/Syntax/Tokens.h" +#include "llvm/ADT/ArrayRef.h" #include #include #include @@ -76,10 +77,11 @@ class ParsedAST { /// it is reused during parsing. static llvm::Optional build(std::unique_ptr CI, + llvm::ArrayRef CompilerInvocationDiags, std::shared_ptr Preamble, std::unique_ptr Buffer, - IntrusiveRefCntPtr VFS, const SymbolIndex *Index, - const ParseOptions &Opts); + llvm::IntrusiveRefCntPtr VFS, + const SymbolIndex *Index, const ParseOptions &Opts); ParsedAST(ParsedAST &&Other); ParsedAST &operator=(ParsedAST &&Other); @@ -174,6 +176,7 @@ buildPreamble(PathRef FileName, CompilerInvocation &CI, /// result of calling buildPreamble. 
llvm::Optional buildAST(PathRef FileName, std::unique_ptr Invocation, + llvm::ArrayRef CompilerInvocationDiags, const ParseInputs &Inputs, std::shared_ptr Preamble); diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index b5304dbffe74a6..045320fc543f34 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -1053,7 +1053,9 @@ bool semaCodeComplete(std::unique_ptr Consumer, ParseInput.FS = VFS; ParseInput.Contents = Input.Contents; ParseInput.Opts = ParseOptions(); - auto CI = buildCompilerInvocation(ParseInput); + + IgnoreDiagnostics IgnoreDiags; + auto CI = buildCompilerInvocation(ParseInput, IgnoreDiags); if (!CI) { elog("Couldn't create CompilerInvocation"); return false; @@ -1084,12 +1086,11 @@ bool semaCodeComplete(std::unique_ptr Consumer, bool CompletingInPreamble = PreambleRegion.Size > Input.Offset; // NOTE: we must call BeginSourceFile after prepareCompilerInstance. Otherwise // the remapped buffers do not get freed. - IgnoreDiagnostics DummyDiagsConsumer; auto Clang = prepareCompilerInstance( std::move(CI), (Input.Preamble && !CompletingInPreamble) ? 
&Input.Preamble->Preamble : nullptr, - std::move(ContentsBuffer), std::move(VFS), DummyDiagsConsumer); + std::move(ContentsBuffer), std::move(VFS), IgnoreDiags); Clang->getPreprocessorOpts().SingleFileParseMode = CompletingInPreamble; Clang->setCodeCompletionConsumer(Consumer.release()); diff --git a/clang-tools-extra/clangd/Compiler.cpp b/clang-tools-extra/clangd/Compiler.cpp index 7080e20e879e76..e0801433319076 100644 --- a/clang-tools-extra/clangd/Compiler.cpp +++ b/clang-tools-extra/clangd/Compiler.cpp @@ -41,7 +41,8 @@ void IgnoreDiagnostics::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, } std::unique_ptr -buildCompilerInvocation(const ParseInputs &Inputs) { +buildCompilerInvocation(const ParseInputs &Inputs, + clang::DiagnosticConsumer &D) { std::vector ArgStrs; for (const auto &S : Inputs.CompileCommand.CommandLine) ArgStrs.push_back(S.c_str()); @@ -52,12 +53,8 @@ buildCompilerInvocation(const ParseInputs &Inputs) { // dirs. } - // FIXME(ibiryukov): store diagnostics from CommandLine when we start - // reporting them. - IgnoreDiagnostics IgnoreDiagnostics; llvm::IntrusiveRefCntPtr CommandLineDiagsEngine = - CompilerInstance::createDiagnostics(new DiagnosticOptions, - &IgnoreDiagnostics, false); + CompilerInstance::createDiagnostics(new DiagnosticOptions, &D, false); std::unique_ptr CI = createInvocationFromCommandLine( ArgStrs, CommandLineDiagsEngine, Inputs.FS, /*ShouldRecoverOnErrors=*/true); diff --git a/clang-tools-extra/clangd/Compiler.h b/clang-tools-extra/clangd/Compiler.h index c24ea3546c5c46..689514ab4801c8 100644 --- a/clang-tools-extra/clangd/Compiler.h +++ b/clang-tools-extra/clangd/Compiler.h @@ -52,7 +52,8 @@ struct ParseInputs { /// Builds compiler invocation that could be used to build AST or preamble. 
std::unique_ptr -buildCompilerInvocation(const ParseInputs &Inputs); +buildCompilerInvocation(const ParseInputs &Inputs, + clang::DiagnosticConsumer &D); /// Creates a compiler instance, configured so that: /// - Contents of the parsed file are remapped to \p MainFile. diff --git a/clang-tools-extra/clangd/Diagnostics.cpp b/clang-tools-extra/clangd/Diagnostics.cpp index 7f1ab06db9d1d3..c9e1ed6bc6872c 100644 --- a/clang-tools-extra/clangd/Diagnostics.cpp +++ b/clang-tools-extra/clangd/Diagnostics.cpp @@ -16,11 +16,13 @@ #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticIDs.h" #include "clang/Basic/FileManager.h" +#include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Lex/Lexer.h" #include "clang/Lex/Token.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/Capacity.h" @@ -393,6 +395,9 @@ int getSeverity(DiagnosticsEngine::Level L) { } std::vector StoreDiags::take(const clang::tidy::ClangTidyContext *Tidy) { + // Do not forget to emit a pending diagnostic if there is one. + flushLastDiag(); + // Fill in name/source now that we have all the context needed to map them. for (auto &Diag : Output) { if (const char *ClangDiag = getDiagnosticCode(Diag.ID)) { @@ -448,7 +453,6 @@ void StoreDiags::BeginSourceFile(const LangOptions &Opts, } void StoreDiags::EndSourceFile() { - flushLastDiag(); LangOpts = None; } @@ -467,10 +471,46 @@ static void writeCodeToFixMessage(llvm::raw_ostream &OS, llvm::StringRef Code) { OS << "…"; } +/// Fills \p D with all information, except the location-related bits. +/// Also note that ID and Name are not part of clangd::DiagBase and should be +/// set elsewhere. 
+static void fillNonLocationData(DiagnosticsEngine::Level DiagLevel, + const clang::Diagnostic &Info, + clangd::DiagBase &D) { + llvm::SmallString<64> Message; + Info.FormatDiagnostic(Message); + + D.Message = Message.str(); + D.Severity = DiagLevel; + D.Category = DiagnosticIDs::getCategoryNameFromID( + DiagnosticIDs::getCategoryNumberForDiag(Info.getID())) + .str(); +} + void StoreDiags::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, const clang::Diagnostic &Info) { DiagnosticConsumer::HandleDiagnostic(DiagLevel, Info); + if (Info.getLocation().isInvalid()) { + // Handle diagnostics coming from command-line arguments. The source manager + // is *not* available at this point, so we cannot use it. + if (DiagLevel < DiagnosticsEngine::Level::Error) { + IgnoreDiagnostics::log(DiagLevel, Info); + return; // non-errors add too much noise, do not show them. + } + + flushLastDiag(); + + LastDiag = Diag(); + LastDiag->ID = Info.getID(); + fillNonLocationData(DiagLevel, Info, *LastDiag); + LastDiag->InsideMainFile = true; + // Put it at the start of the main file, for a lack of a better place. 
+ LastDiag->Range.start = Position{0, 0}; + LastDiag->Range.end = Position{0, 0}; + return; + } + if (!LangOpts || !Info.hasSourceManager()) { IgnoreDiagnostics::log(DiagLevel, Info); return; @@ -480,18 +520,13 @@ void StoreDiags::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, SourceManager &SM = Info.getSourceManager(); auto FillDiagBase = [&](DiagBase &D) { - D.Range = diagnosticRange(Info, *LangOpts); - llvm::SmallString<64> Message; - Info.FormatDiagnostic(Message); - D.Message = Message.str(); + fillNonLocationData(DiagLevel, Info, D); + D.InsideMainFile = InsideMainFile; + D.Range = diagnosticRange(Info, *LangOpts); D.File = SM.getFilename(Info.getLocation()); D.AbsFile = getCanonicalPath( SM.getFileEntryForID(SM.getFileID(Info.getLocation())), SM); - D.Severity = DiagLevel; - D.Category = DiagnosticIDs::getCategoryNameFromID( - DiagnosticIDs::getCategoryNumberForDiag(Info.getID())) - .str(); return D; }; @@ -564,7 +599,6 @@ void StoreDiags::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, LastDiag = Diag(); LastDiag->ID = Info.getID(); FillDiagBase(*LastDiag); - LastDiagWasAdjusted = false; if (!InsideMainFile) LastDiagWasAdjusted = adjustDiagFromHeader(*LastDiag, Info, *LangOpts); @@ -617,6 +651,7 @@ void StoreDiags::flushLastDiag() { vlog("Dropped diagnostic: {0}: {1}", LastDiag->File, LastDiag->Message); } LastDiag.reset(); + LastDiagWasAdjusted = false; } } // namespace clangd diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp index a09bf3f6a43d77..7052feceb35b9e 100644 --- a/clang-tools-extra/clangd/TUScheduler.cpp +++ b/clang-tools-extra/clangd/TUScheduler.cpp @@ -44,6 +44,7 @@ #include "TUScheduler.h" #include "Cancellation.h" #include "Compiler.h" +#include "Diagnostics.h" #include "GlobalCompilationDatabase.h" #include "Logger.h" #include "Trace.h" @@ -365,6 +366,14 @@ ASTWorker::~ASTWorker() { void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) { llvm::StringRef TaskName = 
"Update"; auto Task = [=]() mutable { + auto RunPublish = [&](llvm::function_ref Publish) { + // Ensure we only publish results from the worker if the file was not + // removed, making sure there are not race conditions. + std::lock_guard Lock(PublishMu); + if (CanPublishResults) + Publish(); + }; + // Get the actual command as `Inputs` does not have a command. // FIXME: some build systems like Bazel will take time to preparing // environment to build the file, it would be nice if we could emit a @@ -394,8 +403,11 @@ void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) { Inputs.CompileCommand.Directory, llvm::join(Inputs.CompileCommand.CommandLine, " ")); // Rebuild the preamble and the AST. + StoreDiags CompilerInvocationDiagConsumer; std::unique_ptr Invocation = - buildCompilerInvocation(Inputs); + buildCompilerInvocation(Inputs, CompilerInvocationDiagConsumer); + std::vector CompilerInvocationDiags = + CompilerInvocationDiagConsumer.take(); if (!Invocation) { elog("Could not build CompilerInvocation for file {0}", FileName); // Remove the old AST if it's still in cache. @@ -403,6 +415,9 @@ void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) { TUStatus::BuildDetails Details; Details.BuildFailed = true; emitTUStatus({TUAction::BuildingPreamble, TaskName}, &Details); + // Report the diagnostics we collected when parsing the command line. + Callbacks.onFailedAST(FileName, std::move(CompilerInvocationDiags), + RunPublish); // Make sure anyone waiting for the preamble gets notified it could not // be built. PreambleWasBuilt.notify(); @@ -468,7 +483,8 @@ void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) { llvm::Optional> AST = IdleASTs.take(this); if (!AST) { llvm::Optional NewAST = - buildAST(FileName, std::move(Invocation), Inputs, NewPreamble); + buildAST(FileName, std::move(Invocation), CompilerInvocationDiags, + Inputs, NewPreamble); AST = NewAST ? 
std::make_unique(std::move(*NewAST)) : nullptr; if (!(*AST)) { // buildAST fails. TUStatus::BuildDetails Details; @@ -481,22 +497,22 @@ void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) { Details.ReuseAST = true; emitTUStatus({TUAction::BuildingFile, TaskName}, &Details); } + // We want to report the diagnostics even if this update was cancelled. // It seems more useful than making the clients wait indefinitely if they // spam us with updates. // Note *AST can still be null if buildAST fails. if (*AST) { trace::Span Span("Running main AST callback"); - auto RunPublish = [&](llvm::function_ref Publish) { - // Ensure we only publish results from the worker if the file was not - // removed, making sure there are not race conditions. - std::lock_guard Lock(PublishMu); - if (CanPublishResults) - Publish(); - }; Callbacks.onMainAST(FileName, **AST, RunPublish); RanASTCallback = true; + } else { + // Failed to build the AST, at least report diagnostics from the command + // line if there were any. + // FIXME: we might have got more errors while trying to build the AST, + // surface them too. + Callbacks.onFailedAST(FileName, CompilerInvocationDiags, RunPublish); } // Stash the AST in the cache for further use. IdleASTs.put(this, std::move(*AST)); @@ -513,14 +529,16 @@ void ASTWorker::runWithAST( llvm::Optional> AST = IdleASTs.take(this); auto CurrentInputs = getCurrentFileInputs(); if (!AST) { - std::unique_ptr Invocation = - buildCompilerInvocation(*CurrentInputs); + StoreDiags CompilerInvocationDiagConsumer; + std::unique_ptr Invocation = buildCompilerInvocation( + *CurrentInputs, CompilerInvocationDiagConsumer); // Try rebuilding the AST. llvm::Optional NewAST = Invocation ? buildAST(FileName, std::make_unique(*Invocation), - *CurrentInputs, getPossiblyStalePreamble()) + CompilerInvocationDiagConsumer.take(), *CurrentInputs, + getPossiblyStalePreamble()) : None; AST = NewAST ? 
std::make_unique(std::move(*NewAST)) : nullptr; } diff --git a/clang-tools-extra/clangd/TUScheduler.h b/clang-tools-extra/clangd/TUScheduler.h index d6f530a751d428..e02250d6e6f7ac 100644 --- a/clang-tools-extra/clangd/TUScheduler.h +++ b/clang-tools-extra/clangd/TUScheduler.h @@ -10,8 +10,10 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_TUSCHEDULER_H #include "ClangdUnit.h" +#include "Diagnostics.h" #include "Function.h" #include "GlobalCompilationDatabase.h" +#include "Path.h" #include "Threading.h" #include "index/CanonicalIncludes.h" #include "llvm/ADT/Optional.h" @@ -125,6 +127,11 @@ class ParsingCallbacks { /// Publish() may never run in this case). virtual void onMainAST(PathRef Path, ParsedAST &AST, PublishFn Publish) {} + /// Called whenever the AST fails to build. \p Diags will have the diagnostics + /// that led to failure. + virtual void onFailedAST(PathRef Path, std::vector Diags, + PublishFn Publish) {} + /// Called whenever the TU status is updated. virtual void onFileUpdated(PathRef File, const TUStatus &Status) {} }; diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp index 6d2360e62ad002..b58236ef7d8a7c 100644 --- a/clang-tools-extra/clangd/index/Background.cpp +++ b/clang-tools-extra/clangd/index/Background.cpp @@ -369,11 +369,11 @@ llvm::Error BackgroundIndex::index(tooling::CompileCommand Cmd) { Inputs.FS = std::move(FS); Inputs.FS->setCurrentWorkingDirectory(Cmd.Directory); Inputs.CompileCommand = std::move(Cmd); - auto CI = buildCompilerInvocation(Inputs); + IgnoreDiagnostics IgnoreDiags; + auto CI = buildCompilerInvocation(Inputs, IgnoreDiags); if (!CI) return llvm::createStringError(llvm::inconvertibleErrorCode(), "Couldn't build compiler invocation"); - IgnoreDiagnostics IgnoreDiags; auto Clang = prepareCompilerInstance(std::move(CI), /*Preamble=*/nullptr, std::move(*Buf), Inputs.FS, IgnoreDiags); if (!Clang) diff --git a/clang-tools-extra/clangd/unittests/ClangdUnitTests.cpp 
b/clang-tools-extra/clangd/unittests/ClangdUnitTests.cpp index 7fe57025dc7704..430a056c1ea1bd 100644 --- a/clang-tools-extra/clangd/unittests/ClangdUnitTests.cpp +++ b/clang-tools-extra/clangd/unittests/ClangdUnitTests.cpp @@ -10,6 +10,7 @@ #include "Annotations.h" #include "ClangdUnit.h" #include "Compiler.h" +#include "Diagnostics.h" #include "SourceCode.h" #include "TestFS.h" #include "TestTU.h" @@ -252,12 +253,13 @@ TEST(ClangdUnitTest, CanBuildInvocationWithUnknownArgs) { Inputs.FS = buildTestFS({{testPath("foo.cpp"), "void test() {}"}}); Inputs.CompileCommand.CommandLine = {"clang", "-fsome-unknown-flag", testPath("foo.cpp")}; - EXPECT_NE(buildCompilerInvocation(Inputs), nullptr); + IgnoreDiagnostics IgnoreDiags; + EXPECT_NE(buildCompilerInvocation(Inputs, IgnoreDiags), nullptr); // Unknown forwarded to -cc1 should not a failure either. Inputs.CompileCommand.CommandLine = { "clang", "-Xclang", "-fsome-unknown-flag", testPath("foo.cpp")}; - EXPECT_NE(buildCompilerInvocation(Inputs), nullptr); + EXPECT_NE(buildCompilerInvocation(Inputs, IgnoreDiags), nullptr); } } // namespace diff --git a/clang-tools-extra/clangd/unittests/FileIndexTests.cpp b/clang-tools-extra/clangd/unittests/FileIndexTests.cpp index f1f304f935e0c7..f7b5ecafb8adc1 100644 --- a/clang-tools-extra/clangd/unittests/FileIndexTests.cpp +++ b/clang-tools-extra/clangd/unittests/FileIndexTests.cpp @@ -9,6 +9,7 @@ #include "AST.h" #include "Annotations.h" #include "ClangdUnit.h" +#include "Compiler.h" #include "SyncAPI.h" #include "TestFS.h" #include "TestTU.h" @@ -280,7 +281,8 @@ TEST(FileIndexTest, RebuildWithPreamble) { )cpp"; // Rebuild the file. 
- auto CI = buildCompilerInvocation(PI); + IgnoreDiagnostics IgnoreDiags; + auto CI = buildCompilerInvocation(PI, IgnoreDiags); FileIndex Index; bool IndexUpdated = false; diff --git a/clang-tools-extra/clangd/unittests/HeadersTests.cpp b/clang-tools-extra/clangd/unittests/HeadersTests.cpp index da701309e35cdb..d07312ca5884fe 100644 --- a/clang-tools-extra/clangd/unittests/HeadersTests.cpp +++ b/clang-tools-extra/clangd/unittests/HeadersTests.cpp @@ -46,7 +46,7 @@ class HeadersTest : public ::testing::Test { ParseInputs PI; PI.CompileCommand = *Cmd; PI.FS = VFS; - auto CI = buildCompilerInvocation(PI); + auto CI = buildCompilerInvocation(PI, IgnoreDiags); EXPECT_TRUE(static_cast(CI)); // The diagnostic options must be set before creating a CompilerInstance. CI->getDiagnosticOpts().IgnoreWarnings = true; diff --git a/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp b/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp index b605c940360f5d..274c07f99cd701 100644 --- a/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp +++ b/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp @@ -14,6 +14,9 @@ #include "Path.h" #include "TUScheduler.h" #include "TestFS.h" +#include "Threading.h" +#include "clang/Basic/DiagnosticDriver.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "gmock/gmock.h" @@ -28,6 +31,9 @@ namespace { using ::testing::AnyOf; using ::testing::Each; using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::Field; +using ::testing::IsEmpty; using ::testing::Pointee; using ::testing::UnorderedElementsAre; @@ -60,12 +66,22 @@ class TUSchedulerTests : public ::testing::Test { /// in updateWithDiags. 
static std::unique_ptr captureDiags() { class CaptureDiags : public ParsingCallbacks { + public: void onMainAST(PathRef File, ParsedAST &AST, PublishFn Publish) override { - auto Diags = AST.getDiagnostics(); + reportDiagnostics(File, AST.getDiagnostics(), Publish); + } + + void onFailedAST(PathRef File, std::vector Diags, + PublishFn Publish) override { + reportDiagnostics(File, Diags, Publish); + } + + private: + void reportDiagnostics(PathRef File, llvm::ArrayRef Diags, + PublishFn Publish) { auto D = Context::current().get(DiagsCallbackKey); if (!D) return; - Publish([&]() { const_cast< llvm::unique_function)> &> (*D)( @@ -720,6 +736,53 @@ TEST_F(TUSchedulerTests, TUStatus) { TUState(TUAction::Idle, /*No action*/ ""))); } +TEST_F(TUSchedulerTests, CommandLineErrors) { + // We should see errors from command-line parsing inside the main file. + CDB.ExtraClangFlags = {"-fsome-unknown-flag"}; + + TUScheduler S(CDB, /*AsyncThreadsCount=*/getDefaultAsyncThreadsCount(), + /*StorePreambleInMemory=*/true, /*ASTCallbacks=*/captureDiags(), + /*UpdateDebounce=*/std::chrono::steady_clock::duration::zero(), + ASTRetentionPolicy()); + + Notification Ready; + std::vector Diagnostics; + updateWithDiags(S, testPath("foo.cpp"), "void test() {}", + WantDiagnostics::Yes, [&](std::vector D) { + Diagnostics = std::move(D); + Ready.notify(); + }); + Ready.wait(); + + EXPECT_THAT( + Diagnostics, + ElementsAre(AllOf( + Field(&Diag::ID, Eq(diag::err_drv_unknown_argument)), + Field(&Diag::Name, Eq("drv_unknown_argument")), + Field(&Diag::Message, "unknown argument: '-fsome-unknown-flag'")))); +} + +TEST_F(TUSchedulerTests, CommandLineWarnings) { + // We should not see warnings from command-line parsing. 
+ CDB.ExtraClangFlags = {"-Wsome-unknown-warning"}; + + TUScheduler S(CDB, /*AsyncThreadsCount=*/getDefaultAsyncThreadsCount(), + /*StorePreambleInMemory=*/true, /*ASTCallbacks=*/captureDiags(), + /*UpdateDebounce=*/std::chrono::steady_clock::duration::zero(), + ASTRetentionPolicy()); + + Notification Ready; + std::vector Diagnostics; + updateWithDiags(S, testPath("foo.cpp"), "void test() {}", + WantDiagnostics::Yes, [&](std::vector D) { + Diagnostics = std::move(D); + Ready.notify(); + }); + Ready.wait(); + + EXPECT_THAT(Diagnostics, IsEmpty()); +} + } // namespace } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/clangd/unittests/TestTU.cpp b/clang-tools-extra/clangd/unittests/TestTU.cpp index 0c1727eccad6cf..75393f1415b17f 100644 --- a/clang-tools-extra/clangd/unittests/TestTU.cpp +++ b/clang-tools-extra/clangd/unittests/TestTU.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "TestTU.h" +#include "Compiler.h" +#include "Diagnostics.h" #include "TestFS.h" #include "index/FileIndex.h" #include "index/MemIndex.h" @@ -59,14 +61,16 @@ ParsedAST TestTU::build() const { Inputs.Index = ExternalIndex; if (Inputs.Index) Inputs.Opts.SuggestMissingIncludes = true; - auto CI = buildCompilerInvocation(Inputs); + StoreDiags Diags; + auto CI = buildCompilerInvocation(Inputs, Diags); assert(CI && "Failed to build compilation invocation."); auto Preamble = buildPreamble(FullFilename, *CI, /*OldPreamble=*/nullptr, /*OldCompileCommand=*/Inputs.CompileCommand, Inputs, /*StoreInMemory=*/true, /*PreambleCallback=*/nullptr); - auto AST = buildAST(FullFilename, std::move(CI), Inputs, Preamble); + auto AST = + buildAST(FullFilename, std::move(CI), Diags.take(), Inputs, Preamble); if (!AST.hasValue()) { ADD_FAILURE() << "Failed to build code:\n" << Code; llvm_unreachable("Failed to build TestTU!"); diff --git a/clang/.gitattributes b/clang/.gitattributes index 1f6a5a1132903e..b48a3e3911adba 100644 --- 
a/clang/.gitattributes +++ b/clang/.gitattributes @@ -1,4 +1,3 @@ # Windows line ending tests test/Lexer/minimize_source_to_dependency_directives_invalid_error.c text eol=crlf test/FixIt/fixit-newline-style.c text eol=crlf -test/Frontend/system-header-line-directive-ms-lineendings.c text eol=crlf diff --git a/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_error.c b/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_error.c deleted file mode 100644 index c4a4cf3d97526e..00000000000000 --- a/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_error.c +++ /dev/null @@ -1,16 +0,0 @@ -// Test CF+LF are properly handled along with quoted, multi-line #error -// RUN: %clang_cc1 -DOTHER -print-dependency-directives-minimized-source %s 2>&1 | FileCheck %s - -#ifndef TEST -#error "message \ - more message \ - even more" -#endif - -#ifdef OTHER -#include -#endif - -// CHECK: #ifdef OTHER -// CHECK-NEXT: #include -// CHECK-NEXT: #endif diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 3a372f5736c640..2bd9dc2d34166d 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -3506,23 +3506,6 @@ bool PPC64LongBranchTargetSection::isNeeded() const { return !finalized || !entries.empty(); } -RISCVSdataSection::RISCVSdataSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, 1, ".sdata") {} - -bool RISCVSdataSection::isNeeded() const { - if (!ElfSym::riscvGlobalPointer) - return false; - - // __global_pointer$ is defined relative to .sdata . If the section does not - // exist, create a dummy one. - for (BaseCommand *base : getParent()->sectionCommands) - if (auto *isd = dyn_cast(base)) - for (InputSection *isec : isd->sections) - if (isec != this) - return false; - return true; -} - static uint8_t getAbiVersion() { // MIPS non-PIC executable gets ABI version 1. 
if (config->emachine == EM_MIPS) { diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index 70ec36c4420d80..6846397895066e 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -1100,15 +1100,6 @@ class PartitionIndexSection : public SyntheticSection { void writeTo(uint8_t *buf) override; }; -// Create a dummy .sdata for __global_pointer$ if .sdata does not exist. -class RISCVSdataSection final : public SyntheticSection { -public: - RISCVSdataSection(); - size_t getSize() const override { return 0; } - bool isNeeded() const override; - void writeTo(uint8_t *buf) override {} -}; - InputSection *createInterpSection(); MergeInputSection *createCommentSection(); template void splitSections(); @@ -1173,7 +1164,6 @@ struct InStruct { PltSection *plt; PltSection *iplt; PPC32Got2Section *ppc32Got2; - RISCVSdataSection *riscvSdata; RelocationBaseSection *relaPlt; RelocationBaseSection *relaIplt; StringTableSection *shStrTab; diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 1be73d6011f94d..4de22eff38e9ae 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -474,11 +474,6 @@ template static void createSyntheticSections() { add(in.ppc64LongBranchTarget); } - if (config->emachine == EM_RISCV) { - in.riscvSdata = make(); - add(in.riscvSdata); - } - in.gotPlt = make(); add(in.gotPlt); in.igotPlt = make(); @@ -1701,12 +1696,16 @@ template void Writer::finalizeSections() { // Define __rel[a]_iplt_{start,end} symbols if needed. addRelIpltSymbols(); - // RISC-V's gp can address +/- 2 KiB, set it to .sdata + 0x800 if not defined. - // This symbol should only be defined in an executable. - if (config->emachine == EM_RISCV && !config->shared) + // RISC-V's gp can address +/- 2 KiB, set it to .sdata + 0x800. This symbol + // should only be defined in an executable. If .sdata does not exist, its + // value/section does not matter but it has to be relative, so set its + // st_shndx arbitrarily to 1 (Out::elfHeader). 
+ if (config->emachine == EM_RISCV && !config->shared) { + OutputSection *sec = findSection(".sdata"); ElfSym::riscvGlobalPointer = - addOptionalRegular("__global_pointer$", findSection(".sdata"), 0x800, - STV_DEFAULT, STB_GLOBAL); + addOptionalRegular("__global_pointer$", sec ? sec : Out::elfHeader, + 0x800, STV_DEFAULT, STB_GLOBAL); + } if (config->emachine == EM_X86_64) { // On targets that support TLSDESC, _TLS_MODULE_BASE_ is defined in such a @@ -1881,7 +1880,6 @@ template void Writer::finalizeSections() { finalizeSynthetic(in.plt); finalizeSynthetic(in.iplt); finalizeSynthetic(in.ppc32Got2); - finalizeSynthetic(in.riscvSdata); finalizeSynthetic(in.partIndex); // Dynamic section must be the last one in this list and dynamic @@ -2226,10 +2224,9 @@ template void Writer::fixSectionAlignments() { // and non-executable segments. // // TODO Enable this technique on all targets. - bool enable = config->emachine == EM_386 || - config->emachine == EM_AARCH64 || - config->emachine == EM_ARM || config->emachine == EM_PPC || - config->emachine == EM_PPC64; + bool enable = + config->emachine != EM_HEXAGON && config->emachine != EM_MIPS && + config->emachine != EM_RISCV && config->emachine != EM_X86_64; if (!enable || (config->zSeparateCode && prev && (prev->p_flags & PF_X) != (p->p_flags & PF_X))) diff --git a/lld/test/ELF/amdgpu-relocs.s b/lld/test/ELF/amdgpu-relocs.s index 88b5c5ec270fc1..caee617cf85e78 100644 --- a/lld/test/ELF/amdgpu-relocs.s +++ b/lld/test/ELF/amdgpu-relocs.s @@ -94,7 +94,7 @@ foo: # linker. 
# CHECK: Relocations [ # CHECK: .rela.dyn { -# CHECK-NEXT: R_AMDGPU_RELATIVE64 - 0x3008 +# CHECK-NEXT: R_AMDGPU_RELATIVE64 - 0x3928 # CHECK-NEXT: R_AMDGPU_ABS64 common_var0 0x0 # CHECK-NEXT: R_AMDGPU_ABS64 common_var1 0x0 # CHECK-NEXT: R_AMDGPU_ABS64 common_var2 0x0 @@ -114,16 +114,16 @@ foo: # CHECK-NEXT: } # CHECK-NEXT: ] -# NM: 0000000000003010 B common_var0 -# NM: 0000000000003410 B common_var1 -# NM: 0000000000003810 B common_var2 -# NM: 0000000000003008 d temp2 +# NM: 0000000000003930 B common_var0 +# NM: 0000000000003d30 B common_var1 +# NM: 0000000000004130 B common_var2 +# NM: 0000000000003928 d temp2 -# temp2 - foo = 0x3008-0x768 = 0x28a0 +# temp2 - foo = 0x3928-0x768 = 0x31c0 # HEX: section '.rodata': -# HEX-NEXT: 0x00000768 a0280000 00000000 +# HEX-NEXT: 0x00000768 c0310000 00000000 # common_var2+4, common_var1+8, and common_var0+12. # HEX: section 'nonalloc': -# HEX-NEXT: 0x00000000 00000000 14380000 00000000 18340000 -# HEX-NEXT: 0x00000010 00000000 1c300000 +# HEX-NEXT: 0x00000000 00000000 34410000 00000000 383d0000 +# HEX-NEXT: 0x00000010 00000000 3c390000 diff --git a/lld/test/ELF/basic-sparcv9.s b/lld/test/ELF/basic-sparcv9.s index 031ce7b1e8fdfc..820dba556f5a12 100644 --- a/lld/test/ELF/basic-sparcv9.s +++ b/lld/test/ELF/basic-sparcv9.s @@ -26,7 +26,7 @@ _start: # CHECK-NEXT: Version: 1 # CHECK-NEXT: Entry: [[ENTRY:0x[0-9A-F]+]] # CHECK-NEXT: ProgramHeaderOffset: 0x40 -# CHECK-NEXT: SectionHeaderOffset: 0x100080 +# CHECK-NEXT: SectionHeaderOffset: 0x1A0 # CHECK-NEXT: Flags [ (0x0) # CHECK-NEXT: ] # CHECK-NEXT: HeaderSize: 64 @@ -59,8 +59,8 @@ _start: # CHECK-NEXT: SHF_ALLOC (0x2) # CHECK-NEXT: SHF_EXECINSTR (0x4) # CHECK-NEXT: ] -# CHECK-NEXT: Address: 0x200000 -# CHECK-NEXT: Offset: 0x100000 +# CHECK-NEXT: Address: 0x200120 +# CHECK-NEXT: Offset: 0x120 # CHECK-NEXT: Size: 12 # CHECK-NEXT: Link: 0 # CHECK-NEXT: Info: 0 @@ -76,7 +76,7 @@ _start: # CHECK-NEXT: SHF_STRINGS (0x20) # CHECK-NEXT: ] # CHECK-NEXT: Address: 0x0 -# CHECK-NEXT: Offset: 
0x10000C +# CHECK-NEXT: Offset: 0x12C # CHECK-NEXT: Size: 8 # CHECK-NEXT: Link: 0 # CHECK-NEXT: Info: 0 @@ -90,7 +90,7 @@ _start: # CHECK-NEXT: Flags [ (0x0) # CHECK-NEXT: ] # CHECK-NEXT: Address: 0x0 -# CHECK-NEXT: Offset: 0x100018 +# CHECK-NEXT: Offset: 0x138 # CHECK-NEXT: Size: 48 # CHECK-NEXT: Link: 5 # CHECK-NEXT: Info: 1 @@ -104,7 +104,7 @@ _start: # CHECK-NEXT: Flags [ (0x0) # CHECK-NEXT: ] # CHECK-NEXT: Address: 0x0 -# CHECK-NEXT: Offset: 0x100048 +# CHECK-NEXT: Offset: 0x168 # CHECK-NEXT: Size: 42 # CHECK-NEXT: Link: 0 # CHECK-NEXT: Info: 0 @@ -118,7 +118,7 @@ _start: # CHECK-NEXT: Flags [ (0x0) # CHECK-NEXT: ] # CHECK-NEXT: Address: 0x0 -# CHECK-NEXT: Offset: 0x100072 +# CHECK-NEXT: Offset: 0x192 # CHECK-NEXT: Size: 8 # CHECK-NEXT: Link: 0 # CHECK-NEXT: Info: 0 @@ -150,8 +150,8 @@ _start: # CHECK-NEXT: ProgramHeader { # CHECK-NEXT: Type: PT_PHDR (0x6) # CHECK-NEXT: Offset: 0x40 -# CHECK-NEXT: VirtualAddress: 0x100040 -# CHECK-NEXT: PhysicalAddress: 0x100040 +# CHECK-NEXT: VirtualAddress: 0x200040 +# CHECK-NEXT: PhysicalAddress: 0x200040 # CHECK-NEXT: FileSize: 224 # CHECK-NEXT: MemSize: 224 # CHECK-NEXT: Flags [ (0x4) @@ -162,8 +162,8 @@ _start: # CHECK-NEXT: ProgramHeader { # CHECK-NEXT: Type: PT_LOAD (0x1) # CHECK-NEXT: Offset: 0x0 -# CHECK-NEXT: VirtualAddress: 0x100000 -# CHECK-NEXT: PhysicalAddress: 0x100000 +# CHECK-NEXT: VirtualAddress: 0x200000 +# CHECK-NEXT: PhysicalAddress: 0x200000 # CHECK-NEXT: FileSize: 288 # CHECK-NEXT: MemSize: 288 # CHECK-NEXT: Flags [ @@ -173,9 +173,9 @@ _start: # CHECK-NEXT: } # CHECK-NEXT: ProgramHeader { # CHECK-NEXT: Type: PT_LOAD (0x1) -# CHECK-NEXT: Offset: 0x100000 -# CHECK-NEXT: VirtualAddress: 0x200000 -# CHECK-NEXT: PhysicalAddress: 0x200000 +# CHECK-NEXT: Offset: 0x120 +# CHECK-NEXT: VirtualAddress: 0x200120 +# CHECK-NEXT: PhysicalAddress: 0x200120 # CHECK-NEXT: FileSize: 12 # CHECK-NEXT: MemSize: 12 # CHECK-NEXT: Flags [ (0x5) diff --git a/lld/test/ELF/riscv-gp-dummy-sdata.s 
b/lld/test/ELF/riscv-gp-dummy-sdata.s deleted file mode 100644 index e04b170d5b2b97..00000000000000 --- a/lld/test/ELF/riscv-gp-dummy-sdata.s +++ /dev/null @@ -1,25 +0,0 @@ -# REQUIRES: riscv -# RUN: llvm-mc -filetype=obj -triple=riscv32 %s -o %t.32.o -# RUN: ld.lld -pie %t.32.o -o %t.32 -# RUN: llvm-readelf -S %t.32 | FileCheck --check-prefix=SEC %s -# RUN: llvm-readelf -s %t.32 | FileCheck --check-prefix=SYM %s - -# RUN: llvm-mc -filetype=obj -triple=riscv64 %s -o %t.64.o -# RUN: ld.lld -pie %t.64.o -o %t.64 -# RUN: llvm-readelf -S %t.64 | FileCheck --check-prefix=SEC %s -# RUN: llvm-readelf -s %t.64 | FileCheck --check-prefix=SYM %s - -## If there is an undefined reference to __global_pointer$ but .sdata doesn't -## exist, create a dummy one. - -## __global_pointer$ = .sdata+0x800 -# SEC: [ 7] .sdata PROGBITS {{0*}}00003000 -# SYM: {{0*}}00003800 0 NOTYPE GLOBAL DEFAULT 7 __global_pointer$ - -## If __global_pointer$ is not used, don't create .sdata . - -# RUN: llvm-mc -filetype=obj -triple=riscv32 /dev/null -o %t.32.o -# RUN: ld.lld -pie %t.32.o -o %t.32 -# RUN: llvm-readelf -S %t.32 | FileCheck --implicit-check-not=.sdata /dev/null - -lla gp, __global_pointer$ diff --git a/lld/test/ELF/riscv-gp-no-sdata.s b/lld/test/ELF/riscv-gp-no-sdata.s new file mode 100644 index 00000000000000..ee86438ec4f350 --- /dev/null +++ b/lld/test/ELF/riscv-gp-no-sdata.s @@ -0,0 +1,15 @@ +# REQUIRES: riscv +# RUN: llvm-mc -filetype=obj -triple=riscv32 %s -o %t.32.o +# RUN: ld.lld -pie %t.32.o -o %t.32 +# RUN: llvm-readelf -s %t.32 | FileCheck --check-prefix=SYM %s + +# RUN: llvm-mc -filetype=obj -triple=riscv64 %s -o %t.64.o +# RUN: ld.lld -pie %t.64.o -o %t.64 +# RUN: llvm-readelf -s %t.64 | FileCheck --check-prefix=SYM %s + +## If there is an undefined reference to __global_pointer$ but .sdata doesn't +## exist, define __global_pointer$ and set its st_shndx arbitrarily to 1. 
+ +# SYM: {{0*}}00000800 0 NOTYPE GLOBAL DEFAULT 1 __global_pointer$ + +lla gp, __global_pointer$ diff --git a/lldb/include/lldb/Interpreter/CommandObject.h b/lldb/include/lldb/Interpreter/CommandObject.h index 28635f820170f1..e526c1db26b229 100644 --- a/lldb/include/lldb/Interpreter/CommandObject.h +++ b/lldb/include/lldb/Interpreter/CommandObject.h @@ -228,25 +228,15 @@ class CommandObject { /// /// \param[in/out] request /// The completion request that needs to be answered. - /// - /// FIXME: This is the wrong return value, since we also need to make a - /// distinction between - /// total number of matches, and the window the user wants returned. virtual void HandleCompletion(CompletionRequest &request); - /// The input array contains a parsed version of the line. The insertion - /// point is given by cursor_index (the index in input of the word containing - /// the cursor) and cursor_char_position (the position of the cursor in that - /// word.) + /// The input array contains a parsed version of the line. + /// /// We've constructed the map of options and their arguments as well if that /// is helpful for the completion. /// /// \param[in/out] request /// The completion request that needs to be answered. - /// - /// FIXME: This is the wrong return value, since we also need to make a - /// distinction between - /// total number of matches, and the window the user wants returned. 
virtual void HandleArgumentCompletion(CompletionRequest &request, OptionElementVector &opt_element_vector) {} diff --git a/lldb/source/Interpreter/Options.cpp b/lldb/source/Interpreter/Options.cpp index 6c528b119fe1ed..2829a22b7e497c 100644 --- a/lldb/source/Interpreter/Options.cpp +++ b/lldb/source/Interpreter/Options.cpp @@ -652,8 +652,7 @@ bool Options::HandleOptionCompletion(CompletionRequest &request, auto opt_defs = GetDefinitions(); - std::string cur_opt_std_str = request.GetCursorArgumentPrefix().str(); - const char *cur_opt_str = cur_opt_std_str.c_str(); + llvm::StringRef cur_opt_str = request.GetCursorArgumentPrefix(); for (size_t i = 0; i < opt_element_vector.size(); i++) { int opt_pos = opt_element_vector[i].opt_pos; @@ -667,7 +666,7 @@ bool Options::HandleOptionCompletion(CompletionRequest &request, // FIXME: We should scan the other options provided and only complete // options // within the option group they belong to. - char opt_str[3] = {'-', 'a', '\0'}; + std::string opt_str = "-a"; for (auto &def : opt_defs) { if (!def.short_option) @@ -685,7 +684,7 @@ bool Options::HandleOptionCompletion(CompletionRequest &request, full_name.erase(full_name.begin() + 2, full_name.end()); full_name.append(def.long_option); - request.AddCompletion(full_name.c_str()); + request.AddCompletion(full_name); } return true; } else if (opt_defs_index != OptionArgElement::eUnrecognizedArg) { @@ -693,17 +692,13 @@ bool Options::HandleOptionCompletion(CompletionRequest &request, // anyway (getopt_long_only is happy with shortest unique string, but // it's still a nice thing to do.) Otherwise return The string so the // upper level code will know this is a full match and add the " ". 
- if (cur_opt_str && strlen(cur_opt_str) > 2 && cur_opt_str[0] == '-' && - cur_opt_str[1] == '-' && - strcmp(opt_defs[opt_defs_index].long_option, cur_opt_str) != 0) { - std::string full_name("--"); - full_name.append(opt_defs[opt_defs_index].long_option); - request.AddCompletion(full_name.c_str()); + llvm::StringRef long_option = opt_defs[opt_defs_index].long_option; + if (cur_opt_str.startswith("--") && cur_opt_str != long_option) { + request.AddCompletion("--" + long_option.str()); return true; - } else { + } else request.AddCompletion(request.GetCursorArgument()); - return true; - } + return true; } else { // FIXME - not handling wrong options yet: // Check to see if they are writing a long option & complete it. @@ -712,16 +707,15 @@ bool Options::HandleOptionCompletion(CompletionRequest &request, // that are not unique up to this point. getopt_long_only does // shortest unique match for long options already. - if (cur_opt_str && strlen(cur_opt_str) > 2 && cur_opt_str[0] == '-' && - cur_opt_str[1] == '-') { + if (cur_opt_str.startswith("--")) { for (auto &def : opt_defs) { if (!def.long_option) continue; - if (strstr(def.long_option, cur_opt_str + 2) == def.long_option) { + if (cur_opt_str.startswith(def.long_option)) { std::string full_name("--"); full_name.append(def.long_option); - request.AddCompletion(full_name.c_str()); + request.AddCompletion(full_name); } } } diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index b4dc91bc3f3412..c8f794b4cc3c90 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -140,7 +140,16 @@ unsigned LLVMGetLastEnumAttributeKind(void) { LLVMAttributeRef LLVMCreateEnumAttribute(LLVMContextRef C, unsigned KindID, uint64_t Val) { - return wrap(Attribute::get(*unwrap(C), (Attribute::AttrKind)KindID, Val)); + auto &Ctx = *unwrap(C); + auto AttrKind = (Attribute::AttrKind)KindID; + + if (AttrKind == Attribute::AttrKind::ByVal) { + // After r362128, byval attributes need to have a type attribute. 
Provide a + // NULL one until a proper API is added for this. + return wrap(Attribute::getWithByValType(Ctx, NULL)); + } else { + return wrap(Attribute::get(Ctx, AttrKind, Val)); + } } unsigned LLVMGetEnumAttributeKind(LLVMAttributeRef A) { diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp index 5717a7102b692e..212c5a397b85d9 100644 --- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -1,4 +1,4 @@ -//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===// +//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -18,13 +18,10 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/NoFolder.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" @@ -61,6 +58,7 @@ namespace { Value* RHS; bool Exchange = false; bool ReadOnly = true; + bool Paired = false; SmallVector VecLd; // Container for loads to widen. MulCandidate(Instruction *I, Value *lhs, Value *rhs) : @@ -71,7 +69,7 @@ namespace { } LoadInst *getBaseLoad() const { - return cast(LHS); + return VecLd.front(); } }; @@ -82,7 +80,7 @@ namespace { Value *Acc = nullptr; MulCandList Muls; MulPairList MulPairs; - SmallPtrSet Adds; + SetVector Adds; public: Reduction() = delete; @@ -92,10 +90,35 @@ namespace { /// Record an Add instruction that is a part of the this reduction. 
void InsertAdd(Instruction *I) { Adds.insert(I); } - /// Record a MulCandidate, rooted at a Mul instruction, that is a part of - /// this reduction. - void InsertMul(Instruction *I, Value *LHS, Value *RHS) { - Muls.push_back(std::make_unique(I, LHS, RHS)); + /// Create MulCandidates, each rooted at a Mul instruction, that is a part + /// of this reduction. + void InsertMuls() { + auto GetMulOperand = [](Value *V) -> Instruction* { + if (auto *SExt = dyn_cast(V)) { + if (auto *I = dyn_cast(SExt->getOperand(0))) + if (I->getOpcode() == Instruction::Mul) + return I; + } else if (auto *I = dyn_cast(V)) { + if (I->getOpcode() == Instruction::Mul) + return I; + } + return nullptr; + }; + + auto InsertMul = [this](Instruction *I) { + Value *LHS = cast(I->getOperand(0))->getOperand(0); + Value *RHS = cast(I->getOperand(1))->getOperand(0); + Muls.push_back(std::make_unique(I, LHS, RHS)); + }; + + for (auto *Add : Adds) { + if (Add == Acc) + continue; + if (auto *Mul = GetMulOperand(Add->getOperand(0))) + InsertMul(Mul); + if (auto *Mul = GetMulOperand(Add->getOperand(1))) + InsertMul(Mul); + } } /// Add the incoming accumulator value, returns true if a value had not @@ -110,7 +133,15 @@ namespace { /// Set two MulCandidates, rooted at muls, that can be executed as a single /// parallel operation. - void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1) { + void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1, + bool Exchange = false) { + LLVM_DEBUG(dbgs() << "Pairing:\n" + << *Mul0->Root << "\n" + << *Mul1->Root << "\n"); + Mul0->Paired = true; + Mul1->Paired = true; + if (Exchange) + Mul1->Exchange = true; MulPairs.push_back(std::make_pair(Mul0, Mul1)); } @@ -127,7 +158,7 @@ namespace { Value *getAccumulator() { return Acc; } /// Return the set of adds that comprise the reduction. - SmallPtrSetImpl &getAdds() { return Adds; } + SetVector &getAdds() { return Adds; } /// Return the MulCandidate, rooted at mul instruction, that comprise the /// the reduction. 
@@ -141,6 +172,18 @@ namespace { void UpdateRoot(Instruction *SMLAD) { Root->replaceAllUsesWith(SMLAD); } + + void dump() { + LLVM_DEBUG(dbgs() << "Reduction:\n"; + for (auto *Add : Adds) + LLVM_DEBUG(dbgs() << *Add << "\n"); + for (auto &Mul : Muls) + LLVM_DEBUG(dbgs() << *Mul->Root << "\n" + << " " << *Mul->LHS << "\n" + << " " << *Mul->RHS << "\n"); + LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n") + ); + } }; class WidenedLoad { @@ -158,13 +201,11 @@ namespace { } }; - class ARMParallelDSP : public LoopPass { + class ARMParallelDSP : public FunctionPass { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; DominatorTree *DT; - LoopInfo *LI; - Loop *L; const DataLayout *DL; Module *M; std::map LoadPairs; @@ -172,8 +213,8 @@ namespace { std::map> WideLoads; template - bool IsNarrowSequence(Value *V, Value *&Src); - + bool IsNarrowSequence(Value *V); + bool Search(Value *V, BasicBlock *BB, Reduction &R); bool RecordMemoryOps(BasicBlock *BB); void InsertParallelMACs(Reduction &Reduction); bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem); @@ -185,63 +226,38 @@ namespace { /// products to a 32-bit accumulate operand. Optionally, the instruction can /// exchange the halfwords of the second operand before performing the /// arithmetic. 
- bool MatchSMLAD(Loop *L); + bool MatchSMLAD(Function &F); public: static char ID; - ARMParallelDSP() : LoopPass(ID) { } - - bool doInitialization(Loop *L, LPPassManager &LPM) override { - LoadPairs.clear(); - WideLoads.clear(); - return true; - } + ARMParallelDSP() : FunctionPass(ID) { } void getAnalysisUsage(AnalysisUsage &AU) const override { - LoopPass::getAnalysisUsage(AU); + FunctionPass::getAnalysisUsage(AU); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); AU.setPreservesCFG(); } - bool runOnLoop(Loop *TheLoop, LPPassManager &) override { + bool runOnFunction(Function &F) override { if (DisableParallelDSP) return false; - if (skipLoop(TheLoop)) + if (skipFunction(F)) return false; - L = TheLoop; SE = &getAnalysis().getSE(); AA = &getAnalysis().getAAResults(); TLI = &getAnalysis().getTLI(); DT = &getAnalysis().getDomTree(); - LI = &getAnalysis().getLoopInfo(); auto &TPC = getAnalysis(); - BasicBlock *Header = TheLoop->getHeader(); - if (!Header) - return false; - - // TODO: We assume the loop header and latch to be the same block. - // This is not a fundamental restriction, but lifting this would just - // require more work to do the transformation and then patch up the CFG. 
- if (Header != TheLoop->getLoopLatch()) { - LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not " - "running pass ARMParallelDSP\n"); - return false; - } - - if (!TheLoop->getLoopPreheader()) - InsertPreheaderForLoop(L, DT, LI, nullptr, true); - - Function &F = *Header->getParent(); M = F.getParent(); DL = &M->getDataLayout(); @@ -266,17 +282,10 @@ namespace { return false; } - LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI); - LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n"); LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n"); - if (!RecordMemoryOps(Header)) { - LLVM_DEBUG(dbgs() << " - No sequential loads found.\n"); - return false; - } - - bool Changes = MatchSMLAD(L); + bool Changes = MatchSMLAD(F); return Changes; } }; @@ -315,18 +324,14 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, // TODO: we currently only collect i16, and will support i8 later, so that's // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth. template -bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) { +bool ARMParallelDSP::IsNarrowSequence(Value *V) { if (auto *SExt = dyn_cast(V)) { if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) return false; if (auto *Ld = dyn_cast(SExt->getOperand(0))) { - // Check that these load could be paired. - if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld)) - return false; - - Src = Ld; - return true; + // Check that this load could be paired. + return LoadPairs.count(Ld) || OffsetLoads.count(Ld); } } return false; @@ -337,6 +342,8 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) { bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { SmallVector Loads; SmallVector Writes; + LoadPairs.clear(); + WideLoads.clear(); // Collect loads and instruction that may write to memory. For now we only // record loads which are simple, sign-extended and have a single user. 
@@ -414,7 +421,54 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { return LoadPairs.size() > 1; } -// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector +// Search recursively back through the operands to find a tree of values that +// form a multiply-accumulate chain. The search records the Add and Mul +// instructions that form the reduction and allows us to find a single value +// to be used as the initial input to the accumlator. +bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) { + // If we find a non-instruction, try to use it as the initial accumulator + // value. This may have already been found during the search in which case + // this function will return false, signaling a search fail. + auto *I = dyn_cast(V); + if (!I) + return R.InsertAcc(V); + + if (I->getParent() != BB) + return false; + + switch (I->getOpcode()) { + default: + break; + case Instruction::PHI: + // Could be the accumulator value. + return R.InsertAcc(V); + case Instruction::Add: { + // Adds should be adding together two muls, or another add and a mul to + // be within the mac chain. One of the operands may also be the + // accumulator value at which point we should stop searching. + R.InsertAdd(I); + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + bool ValidLHS = Search(LHS, BB, R); + bool ValidRHS = Search(RHS, BB, R); + + if (ValidLHS && ValidRHS) + return true; + + return R.InsertAcc(I); + } + case Instruction::Mul: { + Value *MulOp0 = I->getOperand(0); + Value *MulOp1 = I->getOperand(1); + return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1); + } + case Instruction::SExt: + return Search(I->getOperand(0), BB, R); + } + return false; +} + +// The pass needs to identify integer add/sub reductions of 16-bit vector // multiplications. 
// To use SMLAD: // 1) we first need to find integer add then look for this pattern: @@ -445,88 +499,39 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { // If loop invariants are used instead of loads, these need to be packed // before the loop begins. // -bool ARMParallelDSP::MatchSMLAD(Loop *L) { - // Search recursively back through the operands to find a tree of values that - // form a multiply-accumulate chain. The search records the Add and Mul - // instructions that form the reduction and allows us to find a single value - // to be used as the initial input to the accumlator. - std::function Search = [&] - (Value *V, Reduction &R) -> bool { - - // If we find a non-instruction, try to use it as the initial accumulator - // value. This may have already been found during the search in which case - // this function will return false, signaling a search fail. - auto *I = dyn_cast(V); - if (!I) - return R.InsertAcc(V); - - switch (I->getOpcode()) { - default: - break; - case Instruction::PHI: - // Could be the accumulator value. - return R.InsertAcc(V); - case Instruction::Add: { - // Adds should be adding together two muls, or another add and a mul to - // be within the mac chain. One of the operands may also be the - // accumulator value at which point we should stop searching. 
- bool ValidLHS = Search(I->getOperand(0), R); - bool ValidRHS = Search(I->getOperand(1), R); - if (!ValidLHS && !ValidLHS) - return false; - else if (ValidLHS && ValidRHS) { - R.InsertAdd(I); - return true; - } else { - R.InsertAdd(I); - return R.InsertAcc(I); - } - } - case Instruction::Mul: { - Value *MulOp0 = I->getOperand(0); - Value *MulOp1 = I->getOperand(1); - if (isa(MulOp0) && isa(MulOp1)) { - Value *LHS = nullptr; - Value *RHS = nullptr; - if (IsNarrowSequence<16>(MulOp0, LHS) && - IsNarrowSequence<16>(MulOp1, RHS)) { - R.InsertMul(I, LHS, RHS); - return true; - } - } - return false; - } - case Instruction::SExt: - return Search(I->getOperand(0), R); - } - return false; - }; - +bool ARMParallelDSP::MatchSMLAD(Function &F) { bool Changed = false; - SmallPtrSet AllAdds; - BasicBlock *Latch = L->getLoopLatch(); - for (Instruction &I : reverse(*Latch)) { - if (I.getOpcode() != Instruction::Add) + for (auto &BB : F) { + SmallPtrSet AllAdds; + if (!RecordMemoryOps(&BB)) continue; - if (AllAdds.count(&I)) - continue; + for (Instruction &I : reverse(BB)) { + if (I.getOpcode() != Instruction::Add) + continue; - const auto *Ty = I.getType(); - if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) - continue; + if (AllAdds.count(&I)) + continue; - Reduction R(&I); - if (!Search(&I, R)) - continue; + const auto *Ty = I.getType(); + if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) + continue; - if (!CreateParallelPairs(R)) - continue; + Reduction R(&I); + if (!Search(&I, &BB, R)) + continue; + + R.InsertMuls(); + LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump()); - InsertParallelMACs(R); - Changed = true; - AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + if (!CreateParallelPairs(R)) + continue; + + InsertParallelMACs(R); + Changed = true; + AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + } } return Changed; @@ -554,12 +559,6 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { auto Ld2 = static_cast(PMul0->RHS); auto Ld3 = 
static_cast(PMul1->RHS); - LLVM_DEBUG(dbgs() << "Loads:\n" - << " - " << *Ld0 << "\n" - << " - " << *Ld1 << "\n" - << " - " << *Ld2 << "\n" - << " - " << *Ld3 << "\n"); - if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); @@ -568,8 +567,7 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); - PMul1->Exchange = true; - R.AddMulPair(PMul0, PMul1); + R.AddMulPair(PMul0, PMul1, true); return true; } } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) && @@ -577,9 +575,8 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n"); LLVM_DEBUG(dbgs() << " and swapping muls\n"); - PMul0->Exchange = true; // Only the second operand can be exchanged, so swap the muls. 
- R.AddMulPair(PMul1, PMul0); + R.AddMulPair(PMul1, PMul0, true); return true; } return false; @@ -587,10 +584,9 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { MulCandList &Muls = R.getMuls(); const unsigned Elems = Muls.size(); - SmallPtrSet Paired; for (unsigned i = 0; i < Elems; ++i) { MulCandidate *PMul0 = static_cast(Muls[i].get()); - if (Paired.count(PMul0->Root)) + if (PMul0->Paired) continue; for (unsigned j = 0; j < Elems; ++j) { @@ -598,7 +594,7 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { continue; MulCandidate *PMul1 = static_cast(Muls[j].get()); - if (Paired.count(PMul1->Root)) + if (PMul1->Paired) continue; const Instruction *Mul0 = PMul0->Root; @@ -608,11 +604,8 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { assert(PMul0 != PMul1 && "expected different chains"); - if (CanPair(R, PMul0, PMul1)) { - Paired.insert(Mul0); - Paired.insert(Mul1); + if (CanPair(R, PMul0, PMul1)) break; - } } } return !R.getMulPairs().empty(); @@ -646,18 +639,33 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) { Instruction *InsertAfter = R.getRoot(); Value *Acc = R.getAccumulator(); + + // For any muls that were discovered but not paired, accumulate their values + // as before. 
+ IRBuilder Builder(InsertAfter->getParent(), + ++BasicBlock::iterator(InsertAfter)); + MulCandList &MulCands = R.getMuls(); + for (auto &MulCand : MulCands) { + if (MulCand->Paired) + continue; + + LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *MulCand->Root + << "\n"); + if (!Acc) { + Acc = MulCand->Root; + continue; + } + Acc = Builder.CreateAdd(MulCand->Root, Acc); + InsertAfter = cast(Acc); + } + if (!Acc) Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0); IntegerType *Ty = IntegerType::get(M->getContext(), 32); - LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n" - << "Acc: " << *Acc << "\n"); for (auto &Pair : R.getMulPairs()) { MulCandidate *LHSMul = Pair.first; MulCandidate *RHSMul = Pair.second; - LLVM_DEBUG(dbgs() << "Muls:\n" - << "- " << *LHSMul->Root << "\n" - << "- " << *RHSMul->Root << "\n"); LoadInst *BaseLHS = LHSMul->getBaseLoad(); LoadInst *BaseRHS = RHSMul->getBaseLoad(); LoadInst *WideLHS = WideLoads.count(BaseLHS) ? @@ -724,14 +732,25 @@ LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads, // Loads[0] needs trunc while Loads[1] needs a lshr and trunc. // TODO: Support big-endian as well. 
Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType()); - BaseSExt->setOperand(0, Bottom); + Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType()); + BaseSExt->replaceAllUsesWith(NewBaseSExt); IntegerType *OffsetTy = cast(Offset->getType()); Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth()); Value *Top = IRB.CreateLShr(WideLoad, ShiftVal); Value *Trunc = IRB.CreateTrunc(Top, OffsetTy); - OffsetSExt->setOperand(0, Trunc); - + Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType()); + OffsetSExt->replaceAllUsesWith(NewOffsetSExt); + + LLVM_DEBUG(dbgs() << "From Base and Offset:\n" + << *Base << "\n" << *Offset << "\n" + << "Created Wide Load:\n" + << *WideLoad << "\n" + << *Bottom << "\n" + << *NewBaseSExt << "\n" + << *Top << "\n" + << *Trunc << "\n" + << *NewOffsetSExt << "\n"); WideLoads.emplace(std::make_pair(Base, std::make_unique(Loads, WideLoad))); return WideLoad; @@ -744,6 +763,6 @@ Pass *llvm::createARMParallelDSPPass() { char ARMParallelDSP::ID = 0; INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", - "Transform loops to use DSP intrinsics", false, false) + "Transform functions to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp", - "Transform loops to use DSP intrinsics", false, false) + "Transform functions to use DSP intrinsics", false, false) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 7ea842baa5e1d3..21e75d55a8c032 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1176,18 +1176,17 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { return false; } - // TODO: handle reductions when tail is folded by masking. 
- if (!Reductions.empty()) { - reportVectorizationFailure( - "Loop has reductions, cannot fold tail by masking", - "Cannot fold tail by masking in the presence of reductions.", - "ReductionFoldingTailByMasking", ORE, TheLoop); - return false; - } + SmallPtrSet ReductionLiveOuts; - // TODO: handle outside users when tail is folded by masking. + for (auto &Reduction : *getReductionVars()) + ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); + + // TODO: handle non-reduction outside users when tail is folded by masking. for (auto *AE : AllowedExit) { - // Check that all users of allowed exit values are inside the loop. + // Check that all users of allowed exit values are inside the loop or + // are the live-out of a reduction. + if (ReductionLiveOuts.count(AE)) + continue; for (User *U : AE->users()) { Instruction *UI = cast(U); if (TheLoop->contains(UI)) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 870ac70057107c..478174f8251c1c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3678,6 +3678,26 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { setDebugLocFromInst(Builder, LoopExitInst); + // If tail is folded by masking, the vector value to leave the loop should be + // a Select choosing between the vectorized LoopExitInst and vectorized Phi, + // instead of the former. 
+ if (Cost->foldTailByMasking()) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *VecLoopExitInst = + VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *Sel = nullptr; + for (User *U : VecLoopExitInst->users()) { + if (isa(U)) { + assert(!Sel && "Reduction exit feeding two selects"); + Sel = U; + } else + assert(isa(U) && "Reduction exit must feed Phi's or select"); + } + assert(Sel && "Reduction exit feeds no select"); + VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); + } + } + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. @@ -6939,8 +6959,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, // If the tail is to be folded by masking, the primary induction variable // needs to be represented in VPlan for it to model early-exit masking. - if (CM.foldTailByMasking()) + // Also, both the Phi and the live-out instruction of each reduction are + // required in order to introduce a select between them in VPlan. + if (CM.foldTailByMasking()) { NeedDef.insert(Legal->getPrimaryInduction()); + for (auto &Reduction : *Legal->getReductionVars()) { + NeedDef.insert(Reduction.first); + NeedDef.insert(Reduction.second.getLoopExitInstr()); + } + } // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For @@ -7067,6 +7094,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // Finally, if tail is folded by masking, introduce selects between the phi + // and the live-out instruction of each reduction, at the end of the latch. 
+ if (CM.foldTailByMasking()) { + Builder.setInsertPoint(VPBB); + auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); + for (auto &Reduction : *Legal->getReductionVars()) { + VPValue *Phi = Plan->getVPValue(Reduction.first); + VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); + Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); + } + } + std::string PlanName; raw_string_ostream RSO(PlanName); unsigned VF = Range.Start; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 517d759d7bfce4..14adb478cd8636 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -309,6 +309,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, V, Part); break; } + case Instruction::Select: { + Value *Cond = State.get(getOperand(0), Part); + Value *Op1 = State.get(getOperand(1), Part); + Value *Op2 = State.get(getOperand(2), Part); + Value *V = Builder.CreateSelect(Cond, Op1, Op2); + State.set(this, V, Part); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index ec96f055a056d1..c528f5d0ceee92 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -37,8 +37,7 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Loop Pass Manager -; CHECK-NEXT: Transform loops to use DSP intrinsics +; CHECK-NEXT: Transform functions to use DSP intrinsics ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: ARM IR optimizations ; CHECK-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll new file mode 100644 index 00000000000000..d9dbd960974248 --- /dev/null +++ 
b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll @@ -0,0 +1,79 @@ +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s + +; CHECK-LABEL: single_block +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc) +define i32 @single_block(i16* %a, i16* %b, i32 %acc) { +entry: + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.1 = load i16, i16* %addr.a.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: multi_block +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0) +define i32 @multi_block(i16* %a, i16* %b, i32 %acc) { +entry: + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.1 = load i16, i16* %addr.a.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + br label %bb.1 + +bb.1: + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: 
multi_block_1 +; CHECK-NOT: call i32 @llvm.arm.smlad +define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) { +entry: + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + br label %bb.1 + +bb.1: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.1 = load i16, i16* %addr.a.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll new file mode 100644 index 00000000000000..c072df49cdf2dc --- /dev/null +++ b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll @@ -0,0 +1,329 @@ +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s + +; CHECK-LABEL: exchange_1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.1 + %mul.1 = mul i32 %sext.a.1, %sext.b.0 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to 
i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.b.1, %sext.a.0 + %mul.1 = mul i32 %sext.b.0, %sext.a.1 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_3 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] +define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.1 + %mul.1 = mul i32 %sext.a.1, %sext.b.0 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_4 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: 
call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] +define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.b.1, %sext.a.0 + %mul.1 = mul i32 %sext.b.0, %sext.a.1 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_multi_use_1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]]) +define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.1 + %mul.1 = mul i32 %sext.a.1, %sext.b.0 + %add = add i32 %mul.0, %mul.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + 
%sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.a.3, %sext.b.1 + %mul.3 = mul i32 %sext.a.2, %sext.b.0 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add, %add.1 + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_multi_use_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]]) +define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.0, %sext.a.3 + %mul.3 = mul i32 %sext.b.1, %sext.a.2 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add, %add.1 + %res = add i32 %add.2, %acc + ret i32 %res +} + +; TODO: Why aren't two intrinsics generated? 
+; CHECK-LABEL: exchange_multi_use_3 +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK-NOT: call i32 @llvm.arm.smlad +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0 +define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.0, %sext.a.3 + %mul.3 = mul i32 %sext.b.1, %sext.a.2 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %sub = sub i32 %add, %add.1 + %res = add i32 %acc, %sub + ret i32 %res +} + +; TODO: Why isn't smladx generated too? 
+; CHECK-LABEL: exchange_multi_use_4 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0 +; CHECK-NOT: call i32 @llvm.arm.smlad +define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.0, %sext.a.3 + %mul.3 = mul i32 %sext.b.1, %sext.a.2 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add.1 = add i32 %mul.2, %mul.3 + %add = add i32 %mul.0, %mul.1 + %sub = sub i32 %add, %add.1 + %res = add i32 %acc, %sub + ret i32 %res +} + +; CHECK-LABEL: exchange_swap +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] +define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = 
load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.1, %sext.b.0 + %mul.1 = mul i32 %sext.a.0, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_swap_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.1, %sext.b.0 + %mul.1 = mul i32 %sext.a.0, %sext.b.1 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_swap_3 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 
%sext.b.0, %sext.a.1 + %mul.1 = mul i32 %sext.b.1, %sext.a.0 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll new file mode 100644 index 00000000000000..a071ec3e748f2b --- /dev/null +++ b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll @@ -0,0 +1,172 @@ +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s + +; CHECK-LABEL: overlap_1 +; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 +; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32* +; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 %acc) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC]]) +; CHECK: ret i32 [[RES]] +define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 
+ %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %mul.2 = mul i32 %sext.a.2, %sext.b.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.1, %mul.2 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: overlap_2 +; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 +; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[ACC1:%[^ ]+]] = add i32 %mul.1, %acc +; CHECK: [[ACC2:%[^ ]+]] = add i32 %mul.2, [[ACC1]] +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC2]]) +; CHECK: ret i32 [[RES]] +define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %mul.2 = mul i32 %sext.b.2, %sext.a.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.1, %mul.2 + %add.2 = add i32 %add, %add.1 + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: overlap_3 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, 
i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.a.2, %sext.b.1 + %mul.3 = mul i32 %sext.a.3, %sext.b.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: overlap_4 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] 
= load i32, i32* [[CAST_B]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.2, %sext.a.2 + %mul.3 = mul i32 %sext.b.1, %sext.a.3 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 %res +} diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/pr42729.ll b/llvm/test/CodeGen/ARM/ParallelDSP/pr42729.ll index e422eadd20c8ce..9f032cd24857c9 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/pr42729.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/pr42729.ll @@ -9,7 +9,7 @@ ; CHECK: [[GEP16:%[^ ]+]] = getelementptr i16, i16* [[CAST_GEP8]], i32 6 ; CHECK: [[CAST_GEP16:%[^ ]+]] = 
bitcast i16* [[GEP16]] to i32* ; CHECK: [[LOAD_UNDEF:%[^ ]+]] = load i32, i32* [[CAST_GEP16]], align 2 -; CHECK: call i32 @llvm.arm.smladx(i32 [[LOAD_A]], i32 [[LOAD_UNDEF]], i32 undef) +; CHECK: call i32 @llvm.arm.smladx(i32 [[LOAD_UNDEF]], i32 [[LOAD_A]], i32 undef) define void @undef_no_return(i16* %a) { entry: %incdec.ptr21 = getelementptr inbounds i16, i16* %a, i32 3 @@ -48,7 +48,7 @@ for.body: ; CHECK: [[GEP16:%[^ ]+]] = getelementptr i16, i16* [[CAST_GEP8]], i32 %iv ; CHECK: [[CAST_GEP16:%[^ ]+]] = bitcast i16* [[GEP16]] to i32* ; CHECK: [[LOAD_B:%[^ ]+]] = load i32, i32* [[CAST_GEP16]], align 2 -; CHECK: [[ACC_NEXT]] = call i32 @llvm.arm.smladx(i32 [[LOAD_A]], i32 [[LOAD_B]], i32 [[ACC]]) +; CHECK: [[ACC_NEXT]] = call i32 @llvm.arm.smladx(i32 [[LOAD_B]], i32 [[LOAD_A]], i32 [[ACC]]) define i32 @return(i16* %a, i8* %b, i32 %N) { entry: %incdec.ptr21 = getelementptr inbounds i16, i16* %a, i32 3 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll new file mode 100644 index 00000000000000..a2f4745c68aef7 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll @@ -0,0 +1,145 @@ +; RUN: opt -mtriple=thumbv7-unknown-linux-gnueabihf -arm-parallel-dsp -dce %s -S -o - | FileCheck %s + +; CHECK-LABEL: first_mul_invalid +; CHECK: [[ADDR_IN_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -1 +; CHECK: [[LD_IN_MINUS_1:%[^ ]+]] = load i16, i16* [[ADDR_IN_MINUS_1]], align 2 +; CHECK: [[IN_MINUS_1:%[^ ]+]] = sext i16 [[LD_IN_MINUS_1]] to i32 +; CHECK: [[ADDR_B_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 1 +; CHECK: [[LD_B_PLUS_1:%[^ ]+]] = load i16, i16* [[ADDR_B_PLUS_1]], align 2 +; CHECK: [[B_PLUS_1:%[^ ]+]] = sext i16 [[LD_B_PLUS_1]] to i32 +; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[B_PLUS_1]], [[IN_MINUS_1]] +; CHECK: [[ADD0:%[^ ]+]] = add i32 [[MUL0]], %call +; CHECK: [[ADDR_IN_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -3 +; CHECK: [[CAST_ADDR_IN_MINUS_3:%[^ ]+]] 
= bitcast i16* [[ADDR_IN_MINUS_3]] to i32* +; CHECK: [[IN_MINUS_3:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_3]], align 2 +; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 +; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* +; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 +; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* +; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 +; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 +; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* +; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ADD0]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]]) +; CHECK: ret i32 [[RES]] +define i32 @first_mul_invalid(i16* nocapture readonly %in, i16* nocapture readonly %b) { +entry: + %0 = load i16, i16* %in, align 2 + %conv = sext i16 %0 to i32 + %1 = load i16, i16* %b, align 2 + %conv2 = sext i16 %1 to i32 + %call = tail call i32 @bar(i32 %conv, i32 %conv2) + %arrayidx3 = getelementptr inbounds i16, i16* %in, i32 -1 + %2 = load i16, i16* %arrayidx3, align 2 + %conv4 = sext i16 %2 to i32 + %arrayidx5 = getelementptr inbounds i16, i16* %b, i32 1 + %3 = load i16, i16* %arrayidx5, align 2 + %conv6 = sext i16 %3 to i32 + %mul = mul nsw i32 %conv6, %conv4 + %add = add i32 %mul, %call + %arrayidx7 = getelementptr inbounds i16, i16* %in, i32 -2 + %4 = load i16, i16* %arrayidx7, align 2 + %conv8 = sext i16 %4 to i32 + %arrayidx9 = getelementptr inbounds i16, i16* %b, i32 2 + %5 = load i16, i16* %arrayidx9, align 2 + %conv10 = sext i16 %5 to i32 + %mul11 = mul nsw i32 %conv10, %conv8 
+ %add12 = add i32 %add, %mul11 + %arrayidx13 = getelementptr inbounds i16, i16* %in, i32 -3 + %6 = load i16, i16* %arrayidx13, align 2 + %conv14 = sext i16 %6 to i32 + %arrayidx15 = getelementptr inbounds i16, i16* %b, i32 3 + %7 = load i16, i16* %arrayidx15, align 2 + %conv16 = sext i16 %7 to i32 + %mul17 = mul nsw i32 %conv16, %conv14 + %add18 = add i32 %add12, %mul17 + %arrayidx19 = getelementptr inbounds i16, i16* %in, i32 -4 + %8 = load i16, i16* %arrayidx19, align 2 + %conv20 = sext i16 %8 to i32 + %arrayidx21 = getelementptr inbounds i16, i16* %b, i32 4 + %9 = load i16, i16* %arrayidx21, align 2 + %conv22 = sext i16 %9 to i32 + %mul23 = mul nsw i32 %conv22, %conv20 + %add24 = add i32 %add18, %mul23 + %arrayidx25 = getelementptr inbounds i16, i16* %in, i32 -5 + %10 = load i16, i16* %arrayidx25, align 2 + %conv26 = sext i16 %10 to i32 + %arrayidx27 = getelementptr inbounds i16, i16* %b, i32 5 + %11 = load i16, i16* %arrayidx27, align 2 + %conv28 = sext i16 %11 to i32 + %mul29 = mul nsw i32 %conv28, %conv26 + %add30 = add i32 %add24, %mul29 + ret i32 %add30 +} + +; CHECK-LABEL: with_no_acc_input +; CHECK: [[ADDR_IN_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -1 +; CHECK: [[LD_IN_MINUS_1:%[^ ]+]] = load i16, i16* [[ADDR_IN_MINUS_1]], align 2 +; CHECK: [[IN_MINUS_1:%[^ ]+]] = sext i16 [[LD_IN_MINUS_1]] to i32 +; CHECK: [[ADDR_B_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 1 +; CHECK: [[LD_B_PLUS_1:%[^ ]+]] = load i16, i16* [[ADDR_B_PLUS_1]], align 2 +; CHECK: [[B_PLUS_1:%[^ ]+]] = sext i16 [[LD_B_PLUS_1]] to i32 +; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[B_PLUS_1]], [[IN_MINUS_1]] +; CHECK: [[ADDR_IN_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -3 +; CHECK: [[CAST_ADDR_IN_MINUS_3:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_3]] to i32* +; CHECK: [[IN_MINUS_3:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_3]], align 2 +; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 +; CHECK: 
[[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* +; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 +; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* +; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 +; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 +; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* +; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[MUL0]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]]) +; CHECK: ret i32 [[RES]] +define i32 @with_no_acc_input(i16* nocapture readonly %in, i16* nocapture readonly %b) { +entry: + %arrayidx3 = getelementptr inbounds i16, i16* %in, i32 -1 + %ld.2 = load i16, i16* %arrayidx3, align 2 + %conv4 = sext i16 %ld.2 to i32 + %arrayidx5 = getelementptr inbounds i16, i16* %b, i32 1 + %ld.3 = load i16, i16* %arrayidx5, align 2 + %conv6 = sext i16 %ld.3 to i32 + %mul = mul nsw i32 %conv6, %conv4 + %arrayidx7 = getelementptr inbounds i16, i16* %in, i32 -2 + %ld.4 = load i16, i16* %arrayidx7, align 2 + %conv8 = sext i16 %ld.4 to i32 + %arrayidx9 = getelementptr inbounds i16, i16* %b, i32 2 + %ld.5 = load i16, i16* %arrayidx9, align 2 + %conv10 = sext i16 %ld.5 to i32 + %mul11 = mul nsw i32 %conv10, %conv8 + %add12 = add i32 %mul, %mul11 + %arrayidx13 = getelementptr inbounds i16, i16* %in, i32 -3 + %ld.6 = load i16, i16* %arrayidx13, align 2 + %conv14 = sext i16 %ld.6 to i32 + %arrayidx15 = getelementptr inbounds i16, i16* %b, i32 3 + %ld.7 = load i16, i16* %arrayidx15, align 2 + %conv16 = sext i16 %ld.7 to i32 + %mul17 = mul nsw i32 %conv16, %conv14 + %add18 = add i32 %add12, %mul17 + %arrayidx19 
= getelementptr inbounds i16, i16* %in, i32 -4 + %ld.8 = load i16, i16* %arrayidx19, align 2 + %conv20 = sext i16 %ld.8 to i32 + %arrayidx21 = getelementptr inbounds i16, i16* %b, i32 4 + %ld.9 = load i16, i16* %arrayidx21, align 2 + %conv22 = sext i16 %ld.9 to i32 + %mul23 = mul nsw i32 %conv22, %conv20 + %add24 = add i32 %add18, %mul23 + %arrayidx25 = getelementptr inbounds i16, i16* %in, i32 -5 + %ld.10 = load i16, i16* %arrayidx25, align 2 + %conv26 = sext i16 %ld.10 to i32 + %arrayidx27 = getelementptr inbounds i16, i16* %b, i32 5 + %ld.11 = load i16, i16* %arrayidx27, align 2 + %conv28 = sext i16 %ld.11 to i32 + %mul29 = mul nsw i32 %conv28, %conv26 + %add30 = add i32 %add24, %mul29 + ret i32 %add30 +} + +declare dso_local i32 @bar(i32, i32) local_unnamed_addr + diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll index b17106e70ed02e..22744be02b0b77 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll @@ -12,8 +12,8 @@ ; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2 ; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32* ; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2 -; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054) -; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[V12]]) +; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 %mac1{{\.}}054) +; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 [[V12]]) ; ; And we don't want to see a 3rd smlad: ; CHECK-NOT: call i32 @llvm.arm.smlad diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlad12.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlad12.ll index d4e09ca3fbb114..637fc3d37046b0 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/smlad12.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlad12.ll @@ -2,7 +2,7 @@ ; ; The loop header is not the loop latch. 
; -; CHECK-NOT: call i32 @llvm.arm.smlad +; CHECK: call i32 @llvm.arm.smlad ; define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { entry: diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll index 971c85f1b665bf..07cc1b41ed26c2 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll @@ -195,8 +195,8 @@ for.cond.cleanup: ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32* ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN1]], i32 [[IN2_2]], i64 [[ACC1]]) +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]]) ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll index eb0b499f512eff..d9db8a243a3a43 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -78,6 +78,62 @@ for.body: br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10 } +; Check that fold tail under optsize passes the reduction live-out value +; through a select. 
+; int reduction_i32(int *A, int *B, int N) { +; int sum = 0; +; for (int i = 0; i < N; ++i) +; sum += (A[i] + B[i]); +; return sum; +; } +; +; NOTE(review): attribute group #0 is defined outside this hunk; presumably it +; carries optsize -- confirm against the rest of the file. +define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B, i32 %N) #0 { +; CHECK-LABEL: @reduction_i32( +; Vector loop: an 8-wide accumulator phi starts at zeroinitializer, and both +; masked loads use the same icmp ule result as their mask. +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ [[ACCUM:%.*]], %vector.body ] +; CHECK: [[ICMPULE:%.*]] = icmp ule <8 x i64> +; CHECK: [[LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef) +; CHECK: [[LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef) +; CHECK-NEXT: [[ADD:%.*]] = add nsw <8 x i32> [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: [[ACCUM]] = add nuw nsw <8 x i32> [[ADD]], [[ACCUM_PHI]] +; The value that leaves the loop is a select on that same mask, choosing per +; lane between the updated sum and the previous accumulator phi. +; CHECK: [[LIVEOUT:%.*]] = select <8 x i1> [[ICMPULE]], <8 x i32> [[ACCUM]], <8 x i32> [[ACCUM_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; The horizontal reduction below (three shufflevector/add steps, then lane 0 +; is extracted) consumes the select result rather than the raw add. +; CHECK: middle.block: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[LIVEOUT]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[LIVEOUT]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF4]] +; CHECK-NEXT: [[RDX_SHUF6:%.*]] = shufflevector <8 x i32> [[BIN_RDX5]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <8 x i32> [[BIN_RDX5]], [[RDX_SHUF6]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[BIN_RDX7]], i32 0 +; CHECK-NEXT: br i1 true, label %for.cond.cleanup, label %scalar.ph +; CHECK: scalar.ph: +; The returned value is a phi that merges the scalar loop's value with the +; scalar extracted in middle.block. +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ {{.*}}, %for.body ], [ [[TMP17]], %middle.block ] +; CHECK-NEXT: ret i32 [[SUM_1_LCSSA]] +;
+entry: + br label %for.body + +for.body: + ; Scalar input loop: %sum.0 carries the running sum and %sum.1 is its updated + ; value, which for.cond.cleanup returns. + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.0 = phi i32 [ %sum.1, %for.body ], [ 0, %entry ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidxA = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidxA, align 4 + %arrayidxB = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %1 = load i32, i32* %arrayidxB, align 4 + %add = add nsw i32 %1, %0 + %sum.1 = add nuw nsw i32 %add, %sum.0 + ; Exit once the incremented induction variable, truncated to i32, equals %N. + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i32 %sum.1 +} + ; CHECK: !0 = distinct !{!0, !1} ; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-NEXT: !2 = distinct !{!2, !3, !1}