diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp
index 460883b84391fd..c4ecc5286ba9e6 100644
--- a/clang-tools-extra/clangd/ClangdServer.cpp
+++ b/clang-tools-extra/clangd/ClangdServer.cpp
@@ -80,6 +80,11 @@ struct UpdateIndexCallbacks : public ParsingCallbacks {
     });
   }
 
+  void onFailedAST(PathRef Path, std::vector<Diag> Diags,
+                   PublishFn Publish) override {
+    Publish([&]() { DiagConsumer.onDiagnosticsReady(Path, Diags); });
+  }
+
   void onFileUpdated(PathRef File, const TUStatus &Status) override {
     DiagConsumer.onFileUpdated(File, Status);
   }
diff --git a/clang-tools-extra/clangd/ClangdUnit.cpp b/clang-tools-extra/clangd/ClangdUnit.cpp
index f85cac200da38a..fd5202f3751737 100644
--- a/clang-tools-extra/clangd/ClangdUnit.cpp
+++ b/clang-tools-extra/clangd/ClangdUnit.cpp
@@ -292,7 +292,8 @@ void dumpAST(ParsedAST &AST, llvm::raw_ostream &OS) {
 }
 
 llvm::Optional<ParsedAST>
-ParsedAST::build(std::unique_ptr<CompilerInvocation> CI,
+ParsedAST::build(std::unique_ptr<clang::CompilerInvocation> CI,
+                 llvm::ArrayRef<Diag> CompilerInvocationDiags,
                  std::shared_ptr<const PreambleData> Preamble,
                  std::unique_ptr<llvm::MemoryBuffer> Buffer,
                  llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
@@ -459,10 +460,15 @@ ParsedAST::build(std::unique_ptr<CompilerInvocation> CI,
   // So just inform the preprocessor of EOF, while keeping everything alive.
   Clang->getPreprocessor().EndSourceFile();
 
-  std::vector<Diag> Diags = ASTDiags.take(CTContext.getPointer());
+  std::vector<Diag> Diags = CompilerInvocationDiags;
   // Add diagnostics from the preamble, if any.
   if (Preamble)
-    Diags.insert(Diags.begin(), Preamble->Diags.begin(), Preamble->Diags.end());
+    Diags.insert(Diags.end(), Preamble->Diags.begin(), Preamble->Diags.end());
+  // Finally, add diagnostics coming from the AST.
+  {
+    std::vector<Diag> D = ASTDiags.take(CTContext.getPointer());
+    Diags.insert(Diags.end(), D.begin(), D.end());
+  }
   return ParsedAST(std::move(Preamble), std::move(Clang), std::move(Action),
                    std::move(Tokens), std::move(ParsedDecls), std::move(Diags),
                    std::move(Includes), std::move(CanonIncludes));
@@ -646,6 +652,7 @@ buildPreamble(PathRef FileName, CompilerInvocation &CI,
 
 llvm::Optional<ParsedAST>
 buildAST(PathRef FileName, std::unique_ptr<CompilerInvocation> Invocation,
+         llvm::ArrayRef<Diag> CompilerInvocationDiags,
          const ParseInputs &Inputs,
          std::shared_ptr<const PreambleData> Preamble) {
   trace::Span Tracer("BuildAST");
@@ -661,7 +668,8 @@ buildAST(PathRef FileName, std::unique_ptr<CompilerInvocation> Invocation,
   }
 
   return ParsedAST::build(
-      std::make_unique<CompilerInvocation>(*Invocation), Preamble,
+      std::make_unique<CompilerInvocation>(*Invocation),
+      CompilerInvocationDiags, Preamble,
       llvm::MemoryBuffer::getMemBufferCopy(Inputs.Contents, FileName),
       std::move(VFS), Inputs.Index, Inputs.Opts);
 }
diff --git a/clang-tools-extra/clangd/ClangdUnit.h b/clang-tools-extra/clangd/ClangdUnit.h
index f5b18f97387f9d..3af34b019648de 100644
--- a/clang-tools-extra/clangd/ClangdUnit.h
+++ b/clang-tools-extra/clangd/ClangdUnit.h
@@ -25,6 +25,7 @@
 #include "clang/Tooling/CompilationDatabase.h"
 #include "clang/Tooling/Core/Replacement.h"
 #include "clang/Tooling/Syntax/Tokens.h"
+#include "llvm/ADT/ArrayRef.h"
 #include <memory>
 #include <string>
 #include <vector>
@@ -76,10 +77,11 @@ class ParsedAST {
   /// it is reused during parsing.
   static llvm::Optional<ParsedAST>
   build(std::unique_ptr<clang::CompilerInvocation> CI,
+        llvm::ArrayRef<Diag> CompilerInvocationDiags,
         std::shared_ptr<const PreambleData> Preamble,
         std::unique_ptr<llvm::MemoryBuffer> Buffer,
-        IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS, const SymbolIndex *Index,
-        const ParseOptions &Opts);
+        llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
+        const SymbolIndex *Index, const ParseOptions &Opts);
 
   ParsedAST(ParsedAST &&Other);
   ParsedAST &operator=(ParsedAST &&Other);
@@ -174,6 +176,7 @@ buildPreamble(PathRef FileName, CompilerInvocation &CI,
 /// result of calling buildPreamble.
 llvm::Optional<ParsedAST>
 buildAST(PathRef FileName, std::unique_ptr<CompilerInvocation> Invocation,
+         llvm::ArrayRef<Diag> CompilerInvocationDiags,
          const ParseInputs &Inputs,
          std::shared_ptr<const PreambleData> Preamble);
 
diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp
index b5304dbffe74a6..045320fc543f34 100644
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@@ -1053,7 +1053,9 @@ bool semaCodeComplete(std::unique_ptr<CodeCompleteConsumer> Consumer,
   ParseInput.FS = VFS;
   ParseInput.Contents = Input.Contents;
   ParseInput.Opts = ParseOptions();
-  auto CI = buildCompilerInvocation(ParseInput);
+
+  IgnoreDiagnostics IgnoreDiags;
+  auto CI = buildCompilerInvocation(ParseInput, IgnoreDiags);
   if (!CI) {
     elog("Couldn't create CompilerInvocation");
     return false;
@@ -1084,12 +1086,11 @@ bool semaCodeComplete(std::unique_ptr<CodeCompleteConsumer> Consumer,
   bool CompletingInPreamble = PreambleRegion.Size > Input.Offset;
   // NOTE: we must call BeginSourceFile after prepareCompilerInstance. Otherwise
   // the remapped buffers do not get freed.
-  IgnoreDiagnostics DummyDiagsConsumer;
   auto Clang = prepareCompilerInstance(
       std::move(CI),
       (Input.Preamble && !CompletingInPreamble) ? &Input.Preamble->Preamble
                                                 : nullptr,
-      std::move(ContentsBuffer), std::move(VFS), DummyDiagsConsumer);
+      std::move(ContentsBuffer), std::move(VFS), IgnoreDiags);
   Clang->getPreprocessorOpts().SingleFileParseMode = CompletingInPreamble;
   Clang->setCodeCompletionConsumer(Consumer.release());
 
diff --git a/clang-tools-extra/clangd/Compiler.cpp b/clang-tools-extra/clangd/Compiler.cpp
index 7080e20e879e76..e0801433319076 100644
--- a/clang-tools-extra/clangd/Compiler.cpp
+++ b/clang-tools-extra/clangd/Compiler.cpp
@@ -41,7 +41,8 @@ void IgnoreDiagnostics::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel,
 }
 
 std::unique_ptr<CompilerInvocation>
-buildCompilerInvocation(const ParseInputs &Inputs) {
+buildCompilerInvocation(const ParseInputs &Inputs,
+                        clang::DiagnosticConsumer &D) {
   std::vector<const char *> ArgStrs;
   for (const auto &S : Inputs.CompileCommand.CommandLine)
     ArgStrs.push_back(S.c_str());
@@ -52,12 +53,8 @@ buildCompilerInvocation(const ParseInputs &Inputs) {
     // dirs.
   }
 
-  // FIXME(ibiryukov): store diagnostics from CommandLine when we start
-  // reporting them.
-  IgnoreDiagnostics IgnoreDiagnostics;
   llvm::IntrusiveRefCntPtr<DiagnosticsEngine> CommandLineDiagsEngine =
-      CompilerInstance::createDiagnostics(new DiagnosticOptions,
-                                          &IgnoreDiagnostics, false);
+      CompilerInstance::createDiagnostics(new DiagnosticOptions, &D, false);
   std::unique_ptr<CompilerInvocation> CI = createInvocationFromCommandLine(
       ArgStrs, CommandLineDiagsEngine, Inputs.FS,
       /*ShouldRecoverOnErrors=*/true);
diff --git a/clang-tools-extra/clangd/Compiler.h b/clang-tools-extra/clangd/Compiler.h
index c24ea3546c5c46..689514ab4801c8 100644
--- a/clang-tools-extra/clangd/Compiler.h
+++ b/clang-tools-extra/clangd/Compiler.h
@@ -52,7 +52,8 @@ struct ParseInputs {
 
 /// Builds compiler invocation that could be used to build AST or preamble.
 std::unique_ptr<CompilerInvocation>
-buildCompilerInvocation(const ParseInputs &Inputs);
+buildCompilerInvocation(const ParseInputs &Inputs,
+                        clang::DiagnosticConsumer &D);
 
 /// Creates a compiler instance, configured so that:
 ///   - Contents of the parsed file are remapped to \p MainFile.
diff --git a/clang-tools-extra/clangd/Diagnostics.cpp b/clang-tools-extra/clangd/Diagnostics.cpp
index 7f1ab06db9d1d3..c9e1ed6bc6872c 100644
--- a/clang-tools-extra/clangd/Diagnostics.cpp
+++ b/clang-tools-extra/clangd/Diagnostics.cpp
@@ -16,11 +16,13 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticIDs.h"
 #include "clang/Basic/FileManager.h"
+#include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/Lexer.h"
 #include "clang/Lex/Token.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/Capacity.h"
@@ -393,6 +395,9 @@ int getSeverity(DiagnosticsEngine::Level L) {
 }
 
 std::vector<Diag> StoreDiags::take(const clang::tidy::ClangTidyContext *Tidy) {
+  // Do not forget to emit a pending diagnostic if there is one.
+  flushLastDiag();
+
   // Fill in name/source now that we have all the context needed to map them.
   for (auto &Diag : Output) {
     if (const char *ClangDiag = getDiagnosticCode(Diag.ID)) {
@@ -448,7 +453,6 @@ void StoreDiags::BeginSourceFile(const LangOptions &Opts,
 }
 
 void StoreDiags::EndSourceFile() {
-  flushLastDiag();
   LangOpts = None;
 }
 
@@ -467,10 +471,46 @@ static void writeCodeToFixMessage(llvm::raw_ostream &OS, llvm::StringRef Code) {
     OS << "…";
 }
 
+/// Fills \p D with all information, except the location-related bits.
+/// Also note that ID and Name are not part of clangd::DiagBase and should be
+/// set elsewhere.
+static void fillNonLocationData(DiagnosticsEngine::Level DiagLevel,
+                                const clang::Diagnostic &Info,
+                                clangd::DiagBase &D) {
+  llvm::SmallString<64> Message;
+  Info.FormatDiagnostic(Message);
+
+  D.Message = Message.str();
+  D.Severity = DiagLevel;
+  D.Category = DiagnosticIDs::getCategoryNameFromID(
+                   DiagnosticIDs::getCategoryNumberForDiag(Info.getID()))
+                   .str();
+}
+
 void StoreDiags::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel,
                                   const clang::Diagnostic &Info) {
   DiagnosticConsumer::HandleDiagnostic(DiagLevel, Info);
 
+  if (Info.getLocation().isInvalid()) {
+    // Handle diagnostics coming from command-line arguments. The source manager
+    // is *not* available at this point, so we cannot use it.
+    if (DiagLevel < DiagnosticsEngine::Level::Error) {
+      IgnoreDiagnostics::log(DiagLevel, Info);
+      return; // non-errors add too much noise, do not show them.
+    }
+
+    flushLastDiag();
+
+    LastDiag = Diag();
+    LastDiag->ID = Info.getID();
+    fillNonLocationData(DiagLevel, Info, *LastDiag);
+    LastDiag->InsideMainFile = true;
+    // Put it at the start of the main file, for lack of a better place.
+    LastDiag->Range.start = Position{0, 0};
+    LastDiag->Range.end = Position{0, 0};
+    return;
+  }
+
   if (!LangOpts || !Info.hasSourceManager()) {
     IgnoreDiagnostics::log(DiagLevel, Info);
     return;
@@ -480,18 +520,13 @@ void StoreDiags::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel,
   SourceManager &SM = Info.getSourceManager();
 
   auto FillDiagBase = [&](DiagBase &D) {
-    D.Range = diagnosticRange(Info, *LangOpts);
-    llvm::SmallString<64> Message;
-    Info.FormatDiagnostic(Message);
-    D.Message = Message.str();
+    fillNonLocationData(DiagLevel, Info, D);
+
     D.InsideMainFile = InsideMainFile;
+    D.Range = diagnosticRange(Info, *LangOpts);
     D.File = SM.getFilename(Info.getLocation());
     D.AbsFile = getCanonicalPath(
         SM.getFileEntryForID(SM.getFileID(Info.getLocation())), SM);
-    D.Severity = DiagLevel;
-    D.Category = DiagnosticIDs::getCategoryNameFromID(
-                     DiagnosticIDs::getCategoryNumberForDiag(Info.getID()))
-                     .str();
     return D;
   };
 
@@ -564,7 +599,6 @@ void StoreDiags::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel,
     LastDiag = Diag();
     LastDiag->ID = Info.getID();
     FillDiagBase(*LastDiag);
-    LastDiagWasAdjusted = false;
     if (!InsideMainFile)
       LastDiagWasAdjusted = adjustDiagFromHeader(*LastDiag, Info, *LangOpts);
 
@@ -617,6 +651,7 @@ void StoreDiags::flushLastDiag() {
     vlog("Dropped diagnostic: {0}: {1}", LastDiag->File, LastDiag->Message);
   }
   LastDiag.reset();
+  LastDiagWasAdjusted = false;
 }
 
 } // namespace clangd
diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp
index a09bf3f6a43d77..7052feceb35b9e 100644
--- a/clang-tools-extra/clangd/TUScheduler.cpp
+++ b/clang-tools-extra/clangd/TUScheduler.cpp
@@ -44,6 +44,7 @@
 #include "TUScheduler.h"
 #include "Cancellation.h"
 #include "Compiler.h"
+#include "Diagnostics.h"
 #include "GlobalCompilationDatabase.h"
 #include "Logger.h"
 #include "Trace.h"
@@ -365,6 +366,14 @@ ASTWorker::~ASTWorker() {
 void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) {
   llvm::StringRef TaskName = "Update";
   auto Task = [=]() mutable {
+    auto RunPublish = [&](llvm::function_ref<void()> Publish) {
+      // Ensure we only publish results from the worker if the file was not
+      // removed, making sure there are no race conditions.
+      std::lock_guard<std::mutex> Lock(PublishMu);
+      if (CanPublishResults)
+        Publish();
+    };
+
     // Get the actual command as `Inputs` does not have a command.
     // FIXME: some build systems like Bazel will take time to preparing
     // environment to build the file, it would be nice if we could emit a
@@ -394,8 +403,11 @@ void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) {
         Inputs.CompileCommand.Directory,
         llvm::join(Inputs.CompileCommand.CommandLine, " "));
     // Rebuild the preamble and the AST.
+    StoreDiags CompilerInvocationDiagConsumer;
     std::unique_ptr<CompilerInvocation> Invocation =
-        buildCompilerInvocation(Inputs);
+        buildCompilerInvocation(Inputs, CompilerInvocationDiagConsumer);
+    std::vector<Diag> CompilerInvocationDiags =
+        CompilerInvocationDiagConsumer.take();
     if (!Invocation) {
       elog("Could not build CompilerInvocation for file {0}", FileName);
       // Remove the old AST if it's still in cache.
@@ -403,6 +415,9 @@ void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) {
       TUStatus::BuildDetails Details;
       Details.BuildFailed = true;
       emitTUStatus({TUAction::BuildingPreamble, TaskName}, &Details);
+      // Report the diagnostics we collected when parsing the command line.
+      Callbacks.onFailedAST(FileName, std::move(CompilerInvocationDiags),
+                            RunPublish);
       // Make sure anyone waiting for the preamble gets notified it could not
       // be built.
       PreambleWasBuilt.notify();
@@ -468,7 +483,8 @@ void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) {
     llvm::Optional<std::unique_ptr<ParsedAST>> AST = IdleASTs.take(this);
     if (!AST) {
       llvm::Optional<ParsedAST> NewAST =
-          buildAST(FileName, std::move(Invocation), Inputs, NewPreamble);
+          buildAST(FileName, std::move(Invocation), CompilerInvocationDiags,
+                   Inputs, NewPreamble);
       AST = NewAST ? std::make_unique<ParsedAST>(std::move(*NewAST)) : nullptr;
       if (!(*AST)) { // buildAST fails.
         TUStatus::BuildDetails Details;
@@ -481,22 +497,22 @@ void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) {
       Details.ReuseAST = true;
       emitTUStatus({TUAction::BuildingFile, TaskName}, &Details);
     }
+
     // We want to report the diagnostics even if this update was cancelled.
     // It seems more useful than making the clients wait indefinitely if they
     // spam us with updates.
     // Note *AST can still be null if buildAST fails.
     if (*AST) {
       trace::Span Span("Running main AST callback");
-      auto RunPublish = [&](llvm::function_ref<void()> Publish) {
-        // Ensure we only publish results from the worker if the file was not
-        // removed, making sure there are not race conditions.
-        std::lock_guard<std::mutex> Lock(PublishMu);
-        if (CanPublishResults)
-          Publish();
-      };
 
       Callbacks.onMainAST(FileName, **AST, RunPublish);
       RanASTCallback = true;
+    } else {
+      // Failed to build the AST, at least report diagnostics from the command
+      // line if there were any.
+      // FIXME: we might have got more errors while trying to build the AST,
+      //        surface them too.
+      Callbacks.onFailedAST(FileName, CompilerInvocationDiags, RunPublish);
     }
     // Stash the AST in the cache for further use.
     IdleASTs.put(this, std::move(*AST));
@@ -513,14 +529,16 @@ void ASTWorker::runWithAST(
     llvm::Optional<std::unique_ptr<ParsedAST>> AST = IdleASTs.take(this);
     auto CurrentInputs = getCurrentFileInputs();
     if (!AST) {
-      std::unique_ptr<CompilerInvocation> Invocation =
-          buildCompilerInvocation(*CurrentInputs);
+      StoreDiags CompilerInvocationDiagConsumer;
+      std::unique_ptr<CompilerInvocation> Invocation = buildCompilerInvocation(
+          *CurrentInputs, CompilerInvocationDiagConsumer);
       // Try rebuilding the AST.
       llvm::Optional<ParsedAST> NewAST =
           Invocation
               ? buildAST(FileName,
                          std::make_unique<CompilerInvocation>(*Invocation),
-                         *CurrentInputs, getPossiblyStalePreamble())
+                         CompilerInvocationDiagConsumer.take(), *CurrentInputs,
+                         getPossiblyStalePreamble())
               : None;
       AST = NewAST ? std::make_unique<ParsedAST>(std::move(*NewAST)) : nullptr;
     }
diff --git a/clang-tools-extra/clangd/TUScheduler.h b/clang-tools-extra/clangd/TUScheduler.h
index d6f530a751d428..e02250d6e6f7ac 100644
--- a/clang-tools-extra/clangd/TUScheduler.h
+++ b/clang-tools-extra/clangd/TUScheduler.h
@@ -10,8 +10,10 @@
 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_TUSCHEDULER_H
 
 #include "ClangdUnit.h"
+#include "Diagnostics.h"
 #include "Function.h"
 #include "GlobalCompilationDatabase.h"
+#include "Path.h"
 #include "Threading.h"
 #include "index/CanonicalIncludes.h"
 #include "llvm/ADT/Optional.h"
@@ -125,6 +127,11 @@ class ParsingCallbacks {
   /// Publish() may never run in this case).
   virtual void onMainAST(PathRef Path, ParsedAST &AST, PublishFn Publish) {}
 
+  /// Called whenever the AST fails to build. \p Diags holds the diagnostics
+  /// collected before the failure, e.g. those from command-line parsing.
+  virtual void onFailedAST(PathRef Path, std::vector<Diag> Diags,
+                           PublishFn Publish) {}
+
   /// Called whenever the TU status is updated.
   virtual void onFileUpdated(PathRef File, const TUStatus &Status) {}
 };
diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp
index 6d2360e62ad002..b58236ef7d8a7c 100644
--- a/clang-tools-extra/clangd/index/Background.cpp
+++ b/clang-tools-extra/clangd/index/Background.cpp
@@ -369,11 +369,11 @@ llvm::Error BackgroundIndex::index(tooling::CompileCommand Cmd) {
   Inputs.FS = std::move(FS);
   Inputs.FS->setCurrentWorkingDirectory(Cmd.Directory);
   Inputs.CompileCommand = std::move(Cmd);
-  auto CI = buildCompilerInvocation(Inputs);
+  IgnoreDiagnostics IgnoreDiags;
+  auto CI = buildCompilerInvocation(Inputs, IgnoreDiags);
   if (!CI)
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                    "Couldn't build compiler invocation");
-  IgnoreDiagnostics IgnoreDiags;
   auto Clang = prepareCompilerInstance(std::move(CI), /*Preamble=*/nullptr,
                                        std::move(*Buf), Inputs.FS, IgnoreDiags);
   if (!Clang)
diff --git a/clang-tools-extra/clangd/unittests/ClangdUnitTests.cpp b/clang-tools-extra/clangd/unittests/ClangdUnitTests.cpp
index 7fe57025dc7704..430a056c1ea1bd 100644
--- a/clang-tools-extra/clangd/unittests/ClangdUnitTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ClangdUnitTests.cpp
@@ -10,6 +10,7 @@
 #include "Annotations.h"
 #include "ClangdUnit.h"
 #include "Compiler.h"
+#include "Diagnostics.h"
 #include "SourceCode.h"
 #include "TestFS.h"
 #include "TestTU.h"
@@ -252,12 +253,13 @@ TEST(ClangdUnitTest, CanBuildInvocationWithUnknownArgs) {
   Inputs.FS = buildTestFS({{testPath("foo.cpp"), "void test() {}"}});
   Inputs.CompileCommand.CommandLine = {"clang", "-fsome-unknown-flag",
                                        testPath("foo.cpp")};
-  EXPECT_NE(buildCompilerInvocation(Inputs), nullptr);
+  IgnoreDiagnostics IgnoreDiags;
+  EXPECT_NE(buildCompilerInvocation(Inputs, IgnoreDiags), nullptr);
 
   // Unknown forwarded to -cc1 should not a failure either.
   Inputs.CompileCommand.CommandLine = {
       "clang", "-Xclang", "-fsome-unknown-flag", testPath("foo.cpp")};
-  EXPECT_NE(buildCompilerInvocation(Inputs), nullptr);
+  EXPECT_NE(buildCompilerInvocation(Inputs, IgnoreDiags), nullptr);
 }
 
 } // namespace
diff --git a/clang-tools-extra/clangd/unittests/FileIndexTests.cpp b/clang-tools-extra/clangd/unittests/FileIndexTests.cpp
index f1f304f935e0c7..f7b5ecafb8adc1 100644
--- a/clang-tools-extra/clangd/unittests/FileIndexTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FileIndexTests.cpp
@@ -9,6 +9,7 @@
 #include "AST.h"
 #include "Annotations.h"
 #include "ClangdUnit.h"
+#include "Compiler.h"
 #include "SyncAPI.h"
 #include "TestFS.h"
 #include "TestTU.h"
@@ -280,7 +281,8 @@ TEST(FileIndexTest, RebuildWithPreamble) {
   )cpp";
 
   // Rebuild the file.
-  auto CI = buildCompilerInvocation(PI);
+  IgnoreDiagnostics IgnoreDiags;
+  auto CI = buildCompilerInvocation(PI, IgnoreDiags);
 
   FileIndex Index;
   bool IndexUpdated = false;
diff --git a/clang-tools-extra/clangd/unittests/HeadersTests.cpp b/clang-tools-extra/clangd/unittests/HeadersTests.cpp
index da701309e35cdb..d07312ca5884fe 100644
--- a/clang-tools-extra/clangd/unittests/HeadersTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HeadersTests.cpp
@@ -46,7 +46,7 @@ class HeadersTest : public ::testing::Test {
     ParseInputs PI;
     PI.CompileCommand = *Cmd;
     PI.FS = VFS;
-    auto CI = buildCompilerInvocation(PI);
+    auto CI = buildCompilerInvocation(PI, IgnoreDiags);
     EXPECT_TRUE(static_cast<bool>(CI));
     // The diagnostic options must be set before creating a CompilerInstance.
     CI->getDiagnosticOpts().IgnoreWarnings = true;
diff --git a/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp b/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp
index b605c940360f5d..274c07f99cd701 100644
--- a/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp
+++ b/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp
@@ -14,6 +14,9 @@
 #include "Path.h"
 #include "TUScheduler.h"
 #include "TestFS.h"
+#include "Threading.h"
+#include "clang/Basic/DiagnosticDriver.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "gmock/gmock.h"
@@ -28,6 +31,9 @@ namespace {
 using ::testing::AnyOf;
 using ::testing::Each;
 using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Field;
+using ::testing::IsEmpty;
 using ::testing::Pointee;
 using ::testing::UnorderedElementsAre;
 
@@ -60,12 +66,22 @@ class TUSchedulerTests : public ::testing::Test {
   /// in updateWithDiags.
   static std::unique_ptr<ParsingCallbacks> captureDiags() {
     class CaptureDiags : public ParsingCallbacks {
+    public:
       void onMainAST(PathRef File, ParsedAST &AST, PublishFn Publish) override {
-        auto Diags = AST.getDiagnostics();
+        reportDiagnostics(File, AST.getDiagnostics(), Publish);
+      }
+
+      void onFailedAST(PathRef File, std::vector<Diag> Diags,
+                       PublishFn Publish) override {
+        reportDiagnostics(File, Diags, Publish);
+      }
+
+    private:
+      void reportDiagnostics(PathRef File, llvm::ArrayRef<Diag> Diags,
+                             PublishFn Publish) {
         auto D = Context::current().get(DiagsCallbackKey);
         if (!D)
           return;
-
         Publish([&]() {
           const_cast<
               llvm::unique_function<void(PathRef, std::vector<Diag>)> &> (*D)(
@@ -720,6 +736,53 @@ TEST_F(TUSchedulerTests, TUStatus) {
                   TUState(TUAction::Idle, /*No action*/ "")));
 }
 
+TEST_F(TUSchedulerTests, CommandLineErrors) {
+  // We should see errors from command-line parsing inside the main file.
+  CDB.ExtraClangFlags = {"-fsome-unknown-flag"};
+
+  TUScheduler S(CDB, /*AsyncThreadsCount=*/getDefaultAsyncThreadsCount(),
+                /*StorePreambleInMemory=*/true, /*ASTCallbacks=*/captureDiags(),
+                /*UpdateDebounce=*/std::chrono::steady_clock::duration::zero(),
+                ASTRetentionPolicy());
+
+  Notification Ready;
+  std::vector<Diag> Diagnostics;
+  updateWithDiags(S, testPath("foo.cpp"), "void test() {}",
+                  WantDiagnostics::Yes, [&](std::vector<Diag> D) {
+                    Diagnostics = std::move(D);
+                    Ready.notify();
+                  });
+  Ready.wait();
+
+  EXPECT_THAT(
+      Diagnostics,
+      ElementsAre(AllOf(
+          Field(&Diag::ID, Eq(diag::err_drv_unknown_argument)),
+          Field(&Diag::Name, Eq("drv_unknown_argument")),
+          Field(&Diag::Message, "unknown argument: '-fsome-unknown-flag'"))));
+}
+
+TEST_F(TUSchedulerTests, CommandLineWarnings) {
+  // We should not see warnings from command-line parsing.
+  CDB.ExtraClangFlags = {"-Wsome-unknown-warning"};
+
+  TUScheduler S(CDB, /*AsyncThreadsCount=*/getDefaultAsyncThreadsCount(),
+                /*StorePreambleInMemory=*/true, /*ASTCallbacks=*/captureDiags(),
+                /*UpdateDebounce=*/std::chrono::steady_clock::duration::zero(),
+                ASTRetentionPolicy());
+
+  Notification Ready;
+  std::vector<Diag> Diagnostics;
+  updateWithDiags(S, testPath("foo.cpp"), "void test() {}",
+                  WantDiagnostics::Yes, [&](std::vector<Diag> D) {
+                    Diagnostics = std::move(D);
+                    Ready.notify();
+                  });
+  Ready.wait();
+
+  EXPECT_THAT(Diagnostics, IsEmpty());
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/unittests/TestTU.cpp b/clang-tools-extra/clangd/unittests/TestTU.cpp
index 0c1727eccad6cf..75393f1415b17f 100644
--- a/clang-tools-extra/clangd/unittests/TestTU.cpp
+++ b/clang-tools-extra/clangd/unittests/TestTU.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "TestTU.h"
+#include "Compiler.h"
+#include "Diagnostics.h"
 #include "TestFS.h"
 #include "index/FileIndex.h"
 #include "index/MemIndex.h"
@@ -59,14 +61,16 @@ ParsedAST TestTU::build() const {
   Inputs.Index = ExternalIndex;
   if (Inputs.Index)
     Inputs.Opts.SuggestMissingIncludes = true;
-  auto CI = buildCompilerInvocation(Inputs);
+  StoreDiags Diags;
+  auto CI = buildCompilerInvocation(Inputs, Diags);
   assert(CI && "Failed to build compilation invocation.");
   auto Preamble =
       buildPreamble(FullFilename, *CI,
                     /*OldPreamble=*/nullptr,
                     /*OldCompileCommand=*/Inputs.CompileCommand, Inputs,
                     /*StoreInMemory=*/true, /*PreambleCallback=*/nullptr);
-  auto AST = buildAST(FullFilename, std::move(CI), Inputs, Preamble);
+  auto AST =
+      buildAST(FullFilename, std::move(CI), Diags.take(), Inputs, Preamble);
   if (!AST.hasValue()) {
     ADD_FAILURE() << "Failed to build code:\n" << Code;
     llvm_unreachable("Failed to build TestTU!");
diff --git a/clang/.gitattributes b/clang/.gitattributes
index 1f6a5a1132903e..b48a3e3911adba 100644
--- a/clang/.gitattributes
+++ b/clang/.gitattributes
@@ -1,4 +1,3 @@
 # Windows line ending tests
 test/Lexer/minimize_source_to_dependency_directives_invalid_error.c text eol=crlf
 test/FixIt/fixit-newline-style.c text eol=crlf
-test/Frontend/system-header-line-directive-ms-lineendings.c text eol=crlf
diff --git a/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_error.c b/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_error.c
deleted file mode 100644
index c4a4cf3d97526e..00000000000000
--- a/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_error.c
+++ /dev/null
@@ -1,16 +0,0 @@
-// Test CF+LF are properly handled along with quoted, multi-line #error
-// RUN: %clang_cc1 -DOTHER -print-dependency-directives-minimized-source %s 2>&1 | FileCheck %s
-
-#ifndef TEST
-#error "message \
-   more message \
-   even more"
-#endif
-
-#ifdef OTHER
-#include <string>
-#endif
-
-// CHECK:      #ifdef OTHER
-// CHECK-NEXT: #include <string>
-// CHECK-NEXT: #endif
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 3a372f5736c640..2bd9dc2d34166d 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -3506,23 +3506,6 @@ bool PPC64LongBranchTargetSection::isNeeded() const {
   return !finalized || !entries.empty();
 }
 
-RISCVSdataSection::RISCVSdataSection()
-    : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, 1, ".sdata") {}
-
-bool RISCVSdataSection::isNeeded() const {
-  if (!ElfSym::riscvGlobalPointer)
-    return false;
-
-  // __global_pointer$ is defined relative to .sdata . If the section does not
-  // exist, create a dummy one.
-  for (BaseCommand *base : getParent()->sectionCommands)
-    if (auto *isd = dyn_cast<InputSectionDescription>(base))
-      for (InputSection *isec : isd->sections)
-        if (isec != this)
-          return false;
-  return true;
-}
-
 static uint8_t getAbiVersion() {
   // MIPS non-PIC executable gets ABI version 1.
   if (config->emachine == EM_MIPS) {
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index 70ec36c4420d80..6846397895066e 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -1100,15 +1100,6 @@ class PartitionIndexSection : public SyntheticSection {
   void writeTo(uint8_t *buf) override;
 };
 
-// Create a dummy .sdata for __global_pointer$ if .sdata does not exist.
-class RISCVSdataSection final : public SyntheticSection {
-public:
-  RISCVSdataSection();
-  size_t getSize() const override { return 0; }
-  bool isNeeded() const override;
-  void writeTo(uint8_t *buf) override {}
-};
-
 InputSection *createInterpSection();
 MergeInputSection *createCommentSection();
 template <class ELFT> void splitSections();
@@ -1173,7 +1164,6 @@ struct InStruct {
   PltSection *plt;
   PltSection *iplt;
   PPC32Got2Section *ppc32Got2;
-  RISCVSdataSection *riscvSdata;
   RelocationBaseSection *relaPlt;
   RelocationBaseSection *relaIplt;
   StringTableSection *shStrTab;
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 1be73d6011f94d..4de22eff38e9ae 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -474,11 +474,6 @@ template <class ELFT> static void createSyntheticSections() {
     add(in.ppc64LongBranchTarget);
   }
 
-  if (config->emachine == EM_RISCV) {
-    in.riscvSdata = make<RISCVSdataSection>();
-    add(in.riscvSdata);
-  }
-
   in.gotPlt = make<GotPltSection>();
   add(in.gotPlt);
   in.igotPlt = make<IgotPltSection>();
@@ -1701,12 +1696,16 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
   // Define __rel[a]_iplt_{start,end} symbols if needed.
   addRelIpltSymbols();
 
-  // RISC-V's gp can address +/- 2 KiB, set it to .sdata + 0x800 if not defined.
-  // This symbol should only be defined in an executable.
-  if (config->emachine == EM_RISCV && !config->shared)
+  // RISC-V's gp can address +/- 2 KiB, set it to .sdata + 0x800. This symbol
+  // should only be defined in an executable. If .sdata does not exist, its
+  // value/section does not matter but it has to be relative, so set its
+  // st_shndx arbitrarily to 1 (Out::elfHeader).
+  if (config->emachine == EM_RISCV && !config->shared) {
+    OutputSection *sec = findSection(".sdata");
     ElfSym::riscvGlobalPointer =
-        addOptionalRegular("__global_pointer$", findSection(".sdata"), 0x800,
-                           STV_DEFAULT, STB_GLOBAL);
+        addOptionalRegular("__global_pointer$", sec ? sec : Out::elfHeader,
+                           0x800, STV_DEFAULT, STB_GLOBAL);
+  }
 
   if (config->emachine == EM_X86_64) {
     // On targets that support TLSDESC, _TLS_MODULE_BASE_ is defined in such a
@@ -1881,7 +1880,6 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
   finalizeSynthetic(in.plt);
   finalizeSynthetic(in.iplt);
   finalizeSynthetic(in.ppc32Got2);
-  finalizeSynthetic(in.riscvSdata);
   finalizeSynthetic(in.partIndex);
 
   // Dynamic section must be the last one in this list and dynamic
@@ -2226,10 +2224,9 @@ template <class ELFT> void Writer<ELFT>::fixSectionAlignments() {
       // and non-executable segments.
       //
       // TODO Enable this technique on all targets.
-      bool enable = config->emachine == EM_386 ||
-                    config->emachine == EM_AARCH64 ||
-                    config->emachine == EM_ARM || config->emachine == EM_PPC ||
-                    config->emachine == EM_PPC64;
+      bool enable =
+          config->emachine != EM_HEXAGON && config->emachine != EM_MIPS &&
+          config->emachine != EM_RISCV && config->emachine != EM_X86_64;
 
       if (!enable || (config->zSeparateCode && prev &&
                       (prev->p_flags & PF_X) != (p->p_flags & PF_X)))
diff --git a/lld/test/ELF/amdgpu-relocs.s b/lld/test/ELF/amdgpu-relocs.s
index 88b5c5ec270fc1..caee617cf85e78 100644
--- a/lld/test/ELF/amdgpu-relocs.s
+++ b/lld/test/ELF/amdgpu-relocs.s
@@ -94,7 +94,7 @@ foo:
 # linker.
 # CHECK: Relocations [
 # CHECK: .rela.dyn {
-# CHECK-NEXT: R_AMDGPU_RELATIVE64 - 0x3008
+# CHECK-NEXT: R_AMDGPU_RELATIVE64 - 0x3928
 # CHECK-NEXT: R_AMDGPU_ABS64 common_var0 0x0
 # CHECK-NEXT: R_AMDGPU_ABS64 common_var1 0x0
 # CHECK-NEXT: R_AMDGPU_ABS64 common_var2 0x0
@@ -114,16 +114,16 @@ foo:
 # CHECK-NEXT: }
 # CHECK-NEXT: ]
 
-# NM: 0000000000003010 B common_var0
-# NM: 0000000000003410 B common_var1
-# NM: 0000000000003810 B common_var2
-# NM: 0000000000003008 d temp2
+# NM: 0000000000003930 B common_var0
+# NM: 0000000000003d30 B common_var1
+# NM: 0000000000004130 B common_var2
+# NM: 0000000000003928 d temp2
 
-# temp2 - foo = 0x3008-0x768 = 0x28a0
+# temp2 - foo = 0x3928-0x768 = 0x31c0
 # HEX:      section '.rodata':
-# HEX-NEXT: 0x00000768 a0280000 00000000
+# HEX-NEXT: 0x00000768 c0310000 00000000
 
 # common_var2+4, common_var1+8, and common_var0+12.
 # HEX:      section 'nonalloc':
-# HEX-NEXT: 0x00000000 00000000 14380000 00000000 18340000
-# HEX-NEXT: 0x00000010 00000000 1c300000
+# HEX-NEXT: 0x00000000 00000000 34410000 00000000 383d0000
+# HEX-NEXT: 0x00000010 00000000 3c390000
diff --git a/lld/test/ELF/basic-sparcv9.s b/lld/test/ELF/basic-sparcv9.s
index 031ce7b1e8fdfc..820dba556f5a12 100644
--- a/lld/test/ELF/basic-sparcv9.s
+++ b/lld/test/ELF/basic-sparcv9.s
@@ -26,7 +26,7 @@ _start:
 # CHECK-NEXT:   Version: 1
 # CHECK-NEXT:   Entry: [[ENTRY:0x[0-9A-F]+]]
 # CHECK-NEXT:   ProgramHeaderOffset: 0x40
-# CHECK-NEXT:   SectionHeaderOffset: 0x100080
+# CHECK-NEXT:   SectionHeaderOffset: 0x1A0
 # CHECK-NEXT:   Flags [ (0x0)
 # CHECK-NEXT:   ]
 # CHECK-NEXT:   HeaderSize: 64
@@ -59,8 +59,8 @@ _start:
 # CHECK-NEXT:       SHF_ALLOC (0x2)
 # CHECK-NEXT:       SHF_EXECINSTR (0x4)
 # CHECK-NEXT:     ]
-# CHECK-NEXT:     Address: 0x200000
-# CHECK-NEXT:     Offset: 0x100000
+# CHECK-NEXT:     Address: 0x200120
+# CHECK-NEXT:     Offset: 0x120
 # CHECK-NEXT:     Size: 12
 # CHECK-NEXT:     Link: 0
 # CHECK-NEXT:     Info: 0
@@ -76,7 +76,7 @@ _start:
 # CHECK-NEXT:       SHF_STRINGS (0x20)
 # CHECK-NEXT:     ]
 # CHECK-NEXT:     Address: 0x0
-# CHECK-NEXT:     Offset: 0x10000C
+# CHECK-NEXT:     Offset: 0x12C
 # CHECK-NEXT:     Size: 8
 # CHECK-NEXT:     Link: 0
 # CHECK-NEXT:     Info: 0
@@ -90,7 +90,7 @@ _start:
 # CHECK-NEXT:     Flags [ (0x0)
 # CHECK-NEXT:     ]
 # CHECK-NEXT:     Address: 0x0
-# CHECK-NEXT:     Offset: 0x100018
+# CHECK-NEXT:     Offset: 0x138
 # CHECK-NEXT:     Size: 48
 # CHECK-NEXT:     Link: 5
 # CHECK-NEXT:     Info: 1
@@ -104,7 +104,7 @@ _start:
 # CHECK-NEXT:     Flags [ (0x0)
 # CHECK-NEXT:     ]
 # CHECK-NEXT:     Address: 0x0
-# CHECK-NEXT:     Offset: 0x100048
+# CHECK-NEXT:     Offset: 0x168
 # CHECK-NEXT:     Size: 42
 # CHECK-NEXT:     Link: 0
 # CHECK-NEXT:     Info: 0
@@ -118,7 +118,7 @@ _start:
 # CHECK-NEXT:     Flags [ (0x0)
 # CHECK-NEXT:     ]
 # CHECK-NEXT:     Address: 0x0
-# CHECK-NEXT:     Offset: 0x100072
+# CHECK-NEXT:     Offset: 0x192
 # CHECK-NEXT:     Size: 8
 # CHECK-NEXT:     Link: 0
 # CHECK-NEXT:     Info: 0
@@ -150,8 +150,8 @@ _start:
 # CHECK-NEXT:   ProgramHeader {
 # CHECK-NEXT:     Type: PT_PHDR (0x6)
 # CHECK-NEXT:     Offset: 0x40
-# CHECK-NEXT:     VirtualAddress: 0x100040
-# CHECK-NEXT:     PhysicalAddress: 0x100040
+# CHECK-NEXT:     VirtualAddress: 0x200040
+# CHECK-NEXT:     PhysicalAddress: 0x200040
 # CHECK-NEXT:     FileSize: 224
 # CHECK-NEXT:     MemSize: 224
 # CHECK-NEXT:     Flags [ (0x4)
@@ -162,8 +162,8 @@ _start:
 # CHECK-NEXT:   ProgramHeader {
 # CHECK-NEXT:     Type: PT_LOAD (0x1)
 # CHECK-NEXT:     Offset: 0x0
-# CHECK-NEXT:     VirtualAddress: 0x100000
-# CHECK-NEXT:     PhysicalAddress: 0x100000
+# CHECK-NEXT:     VirtualAddress: 0x200000
+# CHECK-NEXT:     PhysicalAddress: 0x200000
 # CHECK-NEXT:     FileSize: 288
 # CHECK-NEXT:     MemSize: 288
 # CHECK-NEXT:     Flags [
@@ -173,9 +173,9 @@ _start:
 # CHECK-NEXT:   }
 # CHECK-NEXT:   ProgramHeader {
 # CHECK-NEXT:     Type: PT_LOAD (0x1)
-# CHECK-NEXT:     Offset: 0x100000
-# CHECK-NEXT:     VirtualAddress: 0x200000
-# CHECK-NEXT:     PhysicalAddress: 0x200000
+# CHECK-NEXT:     Offset: 0x120
+# CHECK-NEXT:     VirtualAddress: 0x200120
+# CHECK-NEXT:     PhysicalAddress: 0x200120
 # CHECK-NEXT:     FileSize: 12
 # CHECK-NEXT:     MemSize: 12
 # CHECK-NEXT:     Flags [ (0x5)
diff --git a/lld/test/ELF/riscv-gp-dummy-sdata.s b/lld/test/ELF/riscv-gp-dummy-sdata.s
deleted file mode 100644
index e04b170d5b2b97..00000000000000
--- a/lld/test/ELF/riscv-gp-dummy-sdata.s
+++ /dev/null
@@ -1,25 +0,0 @@
-# REQUIRES: riscv
-# RUN: llvm-mc -filetype=obj -triple=riscv32 %s -o %t.32.o
-# RUN: ld.lld -pie %t.32.o -o %t.32
-# RUN: llvm-readelf -S %t.32 | FileCheck --check-prefix=SEC %s
-# RUN: llvm-readelf -s %t.32 | FileCheck --check-prefix=SYM %s
-
-# RUN: llvm-mc -filetype=obj -triple=riscv64 %s -o %t.64.o
-# RUN: ld.lld -pie %t.64.o -o %t.64
-# RUN: llvm-readelf -S %t.64 | FileCheck --check-prefix=SEC %s
-# RUN: llvm-readelf -s %t.64 | FileCheck --check-prefix=SYM %s
-
-## If there is an undefined reference to __global_pointer$ but .sdata doesn't
-## exist, create a dummy one.
-
-## __global_pointer$ = .sdata+0x800
-# SEC: [ 7] .sdata PROGBITS {{0*}}00003000
-# SYM: {{0*}}00003800 0 NOTYPE GLOBAL DEFAULT 7 __global_pointer$
-
-## If __global_pointer$ is not used, don't create .sdata .
-
-# RUN: llvm-mc -filetype=obj -triple=riscv32 /dev/null -o %t.32.o
-# RUN: ld.lld -pie %t.32.o -o %t.32
-# RUN: llvm-readelf -S %t.32 | FileCheck --implicit-check-not=.sdata /dev/null
-
-lla gp, __global_pointer$
diff --git a/lld/test/ELF/riscv-gp-no-sdata.s b/lld/test/ELF/riscv-gp-no-sdata.s
new file mode 100644
index 00000000000000..ee86438ec4f350
--- /dev/null
+++ b/lld/test/ELF/riscv-gp-no-sdata.s
@@ -0,0 +1,15 @@
+# REQUIRES: riscv
+# RUN: llvm-mc -filetype=obj -triple=riscv32 %s -o %t.32.o
+# RUN: ld.lld -pie %t.32.o -o %t.32
+# RUN: llvm-readelf -s %t.32 | FileCheck --check-prefix=SYM %s
+
+# RUN: llvm-mc -filetype=obj -triple=riscv64 %s -o %t.64.o
+# RUN: ld.lld -pie %t.64.o -o %t.64
+# RUN: llvm-readelf -s %t.64 | FileCheck --check-prefix=SYM %s
+
+## If there is an undefined reference to __global_pointer$ but .sdata doesn't
+## exist, define __global_pointer$ and set its st_shndx arbitrarily to 1.
+
+# SYM: {{0*}}00000800 0 NOTYPE GLOBAL DEFAULT 1 __global_pointer$
+
+lla gp, __global_pointer$
diff --git a/lldb/include/lldb/Interpreter/CommandObject.h b/lldb/include/lldb/Interpreter/CommandObject.h
index 28635f820170f1..e526c1db26b229 100644
--- a/lldb/include/lldb/Interpreter/CommandObject.h
+++ b/lldb/include/lldb/Interpreter/CommandObject.h
@@ -228,25 +228,15 @@ class CommandObject {
   ///
   /// \param[in/out] request
   ///    The completion request that needs to be answered.
-  ///
-  /// FIXME: This is the wrong return value, since we also need to make a
-  /// distinction between
-  /// total number of matches, and the window the user wants returned.
   virtual void HandleCompletion(CompletionRequest &request);
 
-  /// The input array contains a parsed version of the line.  The insertion
-  /// point is given by cursor_index (the index in input of the word containing
-  /// the cursor) and cursor_char_position (the position of the cursor in that
-  /// word.)
+  /// The input array contains a parsed version of the line.
+  ///
   /// We've constructed the map of options and their arguments as well if that
   /// is helpful for the completion.
   ///
   /// \param[in/out] request
   ///    The completion request that needs to be answered.
-  ///
-  /// FIXME: This is the wrong return value, since we also need to make a
-  /// distinction between
-  /// total number of matches, and the window the user wants returned.
   virtual void
   HandleArgumentCompletion(CompletionRequest &request,
                            OptionElementVector &opt_element_vector) {}
diff --git a/lldb/source/Interpreter/Options.cpp b/lldb/source/Interpreter/Options.cpp
index 6c528b119fe1ed..2829a22b7e497c 100644
--- a/lldb/source/Interpreter/Options.cpp
+++ b/lldb/source/Interpreter/Options.cpp
@@ -652,8 +652,7 @@ bool Options::HandleOptionCompletion(CompletionRequest &request,
 
   auto opt_defs = GetDefinitions();
 
-  std::string cur_opt_std_str = request.GetCursorArgumentPrefix().str();
-  const char *cur_opt_str = cur_opt_std_str.c_str();
+  llvm::StringRef cur_opt_str = request.GetCursorArgumentPrefix();
 
   for (size_t i = 0; i < opt_element_vector.size(); i++) {
     int opt_pos = opt_element_vector[i].opt_pos;
@@ -667,7 +666,7 @@ bool Options::HandleOptionCompletion(CompletionRequest &request,
         // FIXME: We should scan the other options provided and only complete
         // options
         // within the option group they belong to.
-        char opt_str[3] = {'-', 'a', '\0'};
+        std::string opt_str = "-a";
 
         for (auto &def : opt_defs) {
           if (!def.short_option)
@@ -685,7 +684,7 @@ bool Options::HandleOptionCompletion(CompletionRequest &request,
 
           full_name.erase(full_name.begin() + 2, full_name.end());
           full_name.append(def.long_option);
-          request.AddCompletion(full_name.c_str());
+          request.AddCompletion(full_name);
         }
         return true;
       } else if (opt_defs_index != OptionArgElement::eUnrecognizedArg) {
@@ -693,17 +692,13 @@ bool Options::HandleOptionCompletion(CompletionRequest &request,
         // anyway (getopt_long_only is happy with shortest unique string, but
         // it's still a nice thing to do.)  Otherwise return The string so the
         // upper level code will know this is a full match and add the " ".
-        if (cur_opt_str && strlen(cur_opt_str) > 2 && cur_opt_str[0] == '-' &&
-            cur_opt_str[1] == '-' &&
-            strcmp(opt_defs[opt_defs_index].long_option, cur_opt_str) != 0) {
-          std::string full_name("--");
-          full_name.append(opt_defs[opt_defs_index].long_option);
-          request.AddCompletion(full_name.c_str());
+        llvm::StringRef long_option = opt_defs[opt_defs_index].long_option;
+        if (cur_opt_str.startswith("--") && cur_opt_str != long_option) {
+          request.AddCompletion("--" + long_option.str());
           return true;
-        } else {
+        } else
           request.AddCompletion(request.GetCursorArgument());
-          return true;
-        }
+        return true;
       } else {
         // FIXME - not handling wrong options yet:
         // Check to see if they are writing a long option & complete it.
@@ -712,16 +707,18 @@ bool Options::HandleOptionCompletion(CompletionRequest &request,
         // that are not unique up to this point.  getopt_long_only does
         // shortest unique match for long options already.
 
-        if (cur_opt_str && strlen(cur_opt_str) > 2 && cur_opt_str[0] == '-' &&
-            cur_opt_str[1] == '-') {
+        if (cur_opt_str.startswith("--")) {
+          // Compare against the text typed after the leading "--": an option
+          // is a candidate when its long name begins with that text.
+          llvm::StringRef typed_prefix = cur_opt_str.drop_front(2);
           for (auto &def : opt_defs) {
             if (!def.long_option)
               continue;
 
-            if (strstr(def.long_option, cur_opt_str + 2) == def.long_option) {
+            if (llvm::StringRef(def.long_option).startswith(typed_prefix)) {
               std::string full_name("--");
               full_name.append(def.long_option);
-              request.AddCompletion(full_name.c_str());
+              request.AddCompletion(full_name);
             }
           }
         }
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index b4dc91bc3f3412..c8f794b4cc3c90 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -140,7 +140,15 @@ unsigned LLVMGetLastEnumAttributeKind(void) {
 
 LLVMAttributeRef LLVMCreateEnumAttribute(LLVMContextRef C, unsigned KindID,
                                          uint64_t Val) {
-  return wrap(Attribute::get(*unwrap(C), (Attribute::AttrKind)KindID, Val));
+  auto &Ctx = *unwrap(C);
+  auto AttrKind = (Attribute::AttrKind)KindID;
+
+  // After r362128, byval attributes need to have a type attribute. Provide a
+  // null one until a proper API is added for this. Val is not used for byval.
+  if (AttrKind == Attribute::AttrKind::ByVal)
+    return wrap(Attribute::getWithByValType(Ctx, nullptr));
+
+  return wrap(Attribute::get(Ctx, AttrKind, Val));
 }
 
 unsigned LLVMGetEnumAttributeKind(LLVMAttributeRef A) {
diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
index 5717a7102b692e..212c5a397b85d9 100644
--- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -1,4 +1,4 @@
-//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
+//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -18,13 +18,10 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Pass.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/PassSupport.h"
@@ -61,6 +58,7 @@ namespace {
     Value*        RHS;
     bool          Exchange = false;
     bool          ReadOnly = true;
+    bool          Paired = false;
     SmallVector<LoadInst*, 2> VecLd;    // Container for loads to widen.
 
     MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
@@ -71,7 +69,7 @@ namespace {
     }
 
     LoadInst *getBaseLoad() const {
-      return cast<LoadInst>(LHS);
+      return VecLd.front();
     }
   };
 
@@ -82,7 +80,7 @@ namespace {
     Value           *Acc = nullptr;
     MulCandList     Muls;
     MulPairList        MulPairs;
-    SmallPtrSet<Instruction*, 4> Adds;
+    SetVector<Instruction*> Adds;
 
   public:
     Reduction() = delete;
@@ -92,10 +90,35 @@ namespace {
     /// Record an Add instruction that is a part of the this reduction.
     void InsertAdd(Instruction *I) { Adds.insert(I); }
 
-    /// Record a MulCandidate, rooted at a Mul instruction, that is a part of
-    /// this reduction.
-    void InsertMul(Instruction *I, Value *LHS, Value *RHS) {
-      Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
+    /// Create MulCandidates, each rooted at a Mul instruction, that are part
+    /// of this reduction.
+    void InsertMuls() {
+      auto GetMulOperand = [](Value *V) -> Instruction* {
+        if (auto *SExt = dyn_cast<SExtInst>(V)) {
+          if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
+            if (I->getOpcode() == Instruction::Mul)
+              return I;
+        } else if (auto *I = dyn_cast<Instruction>(V)) {
+          if (I->getOpcode() == Instruction::Mul)
+            return I;
+        }
+        return nullptr;
+      };
+
+      auto InsertMul = [this](Instruction *I) {
+        Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
+        Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
+        Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
+      };
+
+      for (auto *Add : Adds) {
+        if (Add == Acc)
+          continue;
+        if (auto *Mul = GetMulOperand(Add->getOperand(0)))
+          InsertMul(Mul);
+        if (auto *Mul = GetMulOperand(Add->getOperand(1)))
+          InsertMul(Mul);
+      }
     }
 
     /// Add the incoming accumulator value, returns true if a value had not
@@ -110,7 +133,15 @@ namespace {
 
     /// Set two MulCandidates, rooted at muls, that can be executed as a single
     /// parallel operation.
-    void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1) {
+    void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
+                    bool Exchange = false) {
+      LLVM_DEBUG(dbgs() << "Pairing:\n"
+                 << *Mul0->Root << "\n"
+                 << *Mul1->Root << "\n");
+      Mul0->Paired = true;
+      Mul1->Paired = true;
+      if (Exchange)
+        Mul1->Exchange = true;
       MulPairs.push_back(std::make_pair(Mul0, Mul1));
     }
 
@@ -127,7 +158,7 @@ namespace {
     Value *getAccumulator() { return Acc; }
 
     /// Return the set of adds that comprise the reduction.
-    SmallPtrSetImpl<Instruction*> &getAdds() { return Adds; }
+    SetVector<Instruction*> &getAdds() { return Adds; }
 
     /// Return the MulCandidate, rooted at mul instruction, that comprise the
     /// the reduction.
@@ -141,6 +172,18 @@ namespace {
     void UpdateRoot(Instruction *SMLAD) {
       Root->replaceAllUsesWith(SMLAD);
     }
+
+    void dump() {
+      LLVM_DEBUG(dbgs() << "Reduction:\n";
+        for (auto *Add : Adds)
+          LLVM_DEBUG(dbgs() << *Add << "\n");
+        for (auto &Mul : Muls)
+          LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
+                     << "  " << *Mul->LHS << "\n"
+                     << "  " << *Mul->RHS << "\n");
+        LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
+      );
+    }
   };
 
   class WidenedLoad {
@@ -158,13 +201,11 @@ namespace {
     }
   };
 
-  class ARMParallelDSP : public LoopPass {
+  class ARMParallelDSP : public FunctionPass {
     ScalarEvolution   *SE;
     AliasAnalysis     *AA;
     TargetLibraryInfo *TLI;
     DominatorTree     *DT;
-    LoopInfo          *LI;
-    Loop              *L;
     const DataLayout  *DL;
     Module            *M;
     std::map<LoadInst*, LoadInst*> LoadPairs;
@@ -172,8 +213,8 @@ namespace {
     std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
 
     template<unsigned>
-    bool IsNarrowSequence(Value *V, Value *&Src);
-
+    bool IsNarrowSequence(Value *V);
+    bool Search(Value *V, BasicBlock *BB, Reduction &R);
     bool RecordMemoryOps(BasicBlock *BB);
     void InsertParallelMACs(Reduction &Reduction);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
@@ -185,63 +226,38 @@ namespace {
     /// products to a 32-bit accumulate operand. Optionally, the instruction can
     /// exchange the halfwords of the second operand before performing the
     /// arithmetic.
-    bool MatchSMLAD(Loop *L);
+    bool MatchSMLAD(Function &F);
 
   public:
     static char ID;
 
-    ARMParallelDSP() : LoopPass(ID) { }
-
-    bool doInitialization(Loop *L, LPPassManager &LPM) override {
-      LoadPairs.clear();
-      WideLoads.clear();
-      return true;
-    }
+    ARMParallelDSP() : FunctionPass(ID) { }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      LoopPass::getAnalysisUsage(AU);
+      FunctionPass::getAnalysisUsage(AU);
       AU.addRequired<AssumptionCacheTracker>();
       AU.addRequired<ScalarEvolutionWrapperPass>();
       AU.addRequired<AAResultsWrapperPass>();
       AU.addRequired<TargetLibraryInfoWrapperPass>();
-      AU.addRequired<LoopInfoWrapperPass>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<TargetPassConfig>();
-      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addPreserved<ScalarEvolutionWrapperPass>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
       AU.setPreservesCFG();
     }
 
-    bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
+    bool runOnFunction(Function &F) override {
       if (DisableParallelDSP)
         return false;
-      if (skipLoop(TheLoop))
+      if (skipFunction(F))
         return false;
 
-      L = TheLoop;
       SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
       AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
       TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
       DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
       auto &TPC = getAnalysis<TargetPassConfig>();
 
-      BasicBlock *Header = TheLoop->getHeader();
-      if (!Header)
-        return false;
-
-      // TODO: We assume the loop header and latch to be the same block.
-      // This is not a fundamental restriction, but lifting this would just
-      // require more work to do the transformation and then patch up the CFG.
-      if (Header != TheLoop->getLoopLatch()) {
-        LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
-                             "running pass ARMParallelDSP\n");
-        return false;
-      }
-
-      if (!TheLoop->getLoopPreheader())
-        InsertPreheaderForLoop(L, DT, LI, nullptr, true);
-
-      Function &F = *Header->getParent();
       M = F.getParent();
       DL = &M->getDataLayout();
 
@@ -266,17 +282,10 @@ namespace {
         return false;
       }
 
-      LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
-
       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
 
-      if (!RecordMemoryOps(Header)) {
-        LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
-        return false;
-      }
-
-      bool Changes = MatchSMLAD(L);
+      bool Changes = MatchSMLAD(F);
       return Changes;
     }
   };
@@ -315,18 +324,14 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
 // TODO: we currently only collect i16, and will support i8 later, so that's
 // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
 template<unsigned MaxBitWidth>
-bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) {
+bool ARMParallelDSP::IsNarrowSequence(Value *V) {
   if (auto *SExt = dyn_cast<SExtInst>(V)) {
     if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
       return false;
 
     if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
-      // Check that these load could be paired.
-      if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld))
-        return false;
-
-      Src = Ld;
-      return true;
+      // Check that this load could be paired.
+      return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
     }
   }
   return false;
@@ -337,6 +342,8 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) {
 bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
   SmallVector<LoadInst*, 8> Loads;
   SmallVector<Instruction*, 8> Writes;
+  LoadPairs.clear();
+  WideLoads.clear();
 
   // Collect loads and instruction that may write to memory. For now we only
   // record loads which are simple, sign-extended and have a single user.
@@ -414,7 +421,54 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
   return LoadPairs.size() > 1;
 }
 
-// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
+// Search recursively back through the operands to find a tree of values that
+// form a multiply-accumulate chain. The search records the Add and Mul
+// instructions that form the reduction and allows us to find a single value
+  // to be used as the initial input to the accumulator.
+bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
+  // If we find a non-instruction, try to use it as the initial accumulator
+  // value. This may have already been found during the search in which case
+  // this function will return false, signaling a search fail.
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return R.InsertAcc(V);
+
+  if (I->getParent() != BB)
+    return false;
+
+  switch (I->getOpcode()) {
+  default:
+    break;
+  case Instruction::PHI:
+    // Could be the accumulator value.
+    return R.InsertAcc(V);
+  case Instruction::Add: {
+    // Adds should be adding together two muls, or another add and a mul to
+    // be within the mac chain. One of the operands may also be the
+    // accumulator value at which point we should stop searching.
+    R.InsertAdd(I);
+    Value *LHS = I->getOperand(0);
+    Value *RHS = I->getOperand(1);
+    bool ValidLHS = Search(LHS, BB, R);
+    bool ValidRHS = Search(RHS, BB, R);
+
+    if (ValidLHS && ValidRHS)
+      return true;
+
+    return R.InsertAcc(I);
+  }
+  case Instruction::Mul: {
+    Value *MulOp0 = I->getOperand(0);
+    Value *MulOp1 = I->getOperand(1);
+    return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
+  }
+  case Instruction::SExt:
+    return Search(I->getOperand(0), BB, R);
+  }
+  return false;
+}
+
+// The pass needs to identify integer add/sub reductions of 16-bit vector
 // multiplications.
 // To use SMLAD:
 // 1) we first need to find integer add then look for this pattern:
@@ -445,88 +499,39 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
 // If loop invariants are used instead of loads, these need to be packed
 // before the loop begins.
 //
-bool ARMParallelDSP::MatchSMLAD(Loop *L) {
-  // Search recursively back through the operands to find a tree of values that
-  // form a multiply-accumulate chain. The search records the Add and Mul
-  // instructions that form the reduction and allows us to find a single value
-  // to be used as the initial input to the accumlator.
-  std::function<bool(Value*, Reduction&)> Search = [&]
-    (Value *V, Reduction &R) -> bool {
-
-    // If we find a non-instruction, try to use it as the initial accumulator
-    // value. This may have already been found during the search in which case
-    // this function will return false, signaling a search fail.
-    auto *I = dyn_cast<Instruction>(V);
-    if (!I)
-      return R.InsertAcc(V);
-
-    switch (I->getOpcode()) {
-    default:
-      break;
-    case Instruction::PHI:
-      // Could be the accumulator value.
-      return R.InsertAcc(V);
-    case Instruction::Add: {
-      // Adds should be adding together two muls, or another add and a mul to
-      // be within the mac chain. One of the operands may also be the
-      // accumulator value at which point we should stop searching.
-      bool ValidLHS = Search(I->getOperand(0), R);
-      bool ValidRHS = Search(I->getOperand(1), R);
-      if (!ValidLHS && !ValidLHS)
-        return false;
-      else if (ValidLHS && ValidRHS) {
-        R.InsertAdd(I);
-        return true;
-      } else {
-        R.InsertAdd(I);
-        return R.InsertAcc(I);
-      }
-    }
-    case Instruction::Mul: {
-      Value *MulOp0 = I->getOperand(0);
-      Value *MulOp1 = I->getOperand(1);
-      if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) {
-        Value *LHS = nullptr;
-        Value *RHS = nullptr;
-        if (IsNarrowSequence<16>(MulOp0, LHS) &&
-            IsNarrowSequence<16>(MulOp1, RHS)) {
-          R.InsertMul(I, LHS, RHS);
-          return true;
-        }
-      }
-      return false;
-    }
-    case Instruction::SExt:
-      return Search(I->getOperand(0), R);
-    }
-    return false;
-  };
-
+bool ARMParallelDSP::MatchSMLAD(Function &F) {
   bool Changed = false;
-  SmallPtrSet<Instruction*, 4> AllAdds;
-  BasicBlock *Latch = L->getLoopLatch();
 
-  for (Instruction &I : reverse(*Latch)) {
-    if (I.getOpcode() != Instruction::Add)
+  for (auto &BB : F) {
+    SmallPtrSet<Instruction*, 4> AllAdds;
+    if (!RecordMemoryOps(&BB))
       continue;
 
-    if (AllAdds.count(&I))
-      continue;
+    for (Instruction &I : reverse(BB)) {
+      if (I.getOpcode() != Instruction::Add)
+        continue;
 
-    const auto *Ty = I.getType();
-    if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
-      continue;
+      if (AllAdds.count(&I))
+        continue;
 
-    Reduction R(&I);
-    if (!Search(&I, R))
-      continue;
+      const auto *Ty = I.getType();
+      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+        continue;
 
-    if (!CreateParallelPairs(R))
-      continue;
+      Reduction R(&I);
+      if (!Search(&I, &BB, R))
+        continue;
+
+      R.InsertMuls();
+      LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());
 
-    InsertParallelMACs(R);
-    Changed = true;
-    AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
+      if (!CreateParallelPairs(R))
+        continue;
+
+      InsertParallelMACs(R);
+      Changed = true;
+      AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
+    }
   }
 
   return Changed;
@@ -554,12 +559,6 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
     auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
     auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);
 
-    LLVM_DEBUG(dbgs() << "Loads:\n"
-               << " - " << *Ld0 << "\n"
-               << " - " << *Ld1 << "\n"
-               << " - " << *Ld2 << "\n"
-               << " - " << *Ld3 << "\n");
-
     if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
       if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
         LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
@@ -568,8 +567,7 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
       } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
         LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
         LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
-        PMul1->Exchange = true;
-        R.AddMulPair(PMul0, PMul1);
+        R.AddMulPair(PMul0, PMul1, true);
         return true;
       }
     } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
@@ -577,9 +575,8 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
       LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
       LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
       LLVM_DEBUG(dbgs() << "    and swapping muls\n");
-      PMul0->Exchange = true;
       // Only the second operand can be exchanged, so swap the muls.
-      R.AddMulPair(PMul1, PMul0);
+      R.AddMulPair(PMul1, PMul0, true);
       return true;
     }
     return false;
@@ -587,10 +584,9 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
 
   MulCandList &Muls = R.getMuls();
   const unsigned Elems = Muls.size();
-  SmallPtrSet<const Instruction*, 4> Paired;
   for (unsigned i = 0; i < Elems; ++i) {
     MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
-    if (Paired.count(PMul0->Root))
+    if (PMul0->Paired)
       continue;
 
     for (unsigned j = 0; j < Elems; ++j) {
@@ -598,7 +594,7 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
         continue;
 
       MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
-      if (Paired.count(PMul1->Root))
+      if (PMul1->Paired)
         continue;
 
       const Instruction *Mul0 = PMul0->Root;
@@ -608,11 +604,8 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
 
       assert(PMul0 != PMul1 && "expected different chains");
 
-      if (CanPair(R, PMul0, PMul1)) {
-        Paired.insert(Mul0);
-        Paired.insert(Mul1);
+      if (CanPair(R, PMul0, PMul1))
         break;
-      }
     }
   }
   return !R.getMulPairs().empty();
@@ -646,18 +639,33 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
 
   Instruction *InsertAfter = R.getRoot();
   Value *Acc = R.getAccumulator();
+
+  // Muls that were discovered but not paired must still contribute to the sum:
+  // chain each one into the accumulator with an ordinary add.
+  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+                              ++BasicBlock::iterator(InsertAfter));
+  MulCandList &MulCands = R.getMuls();
+  for (auto &MulCand : MulCands) {
+    if (MulCand->Paired)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *MulCand->Root
+               << "\n");
+    if (!Acc) {
+      Acc = MulCand->Root;
+      continue;
+    }
+    Acc = Builder.CreateAdd(MulCand->Root, Acc);
+    InsertAfter = cast<Instruction>(Acc);
+  }
+
   if (!Acc)
     Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
 
   IntegerType *Ty = IntegerType::get(M->getContext(), 32);
-  LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n"
-             << "Acc: " << *Acc << "\n");
   for (auto &Pair : R.getMulPairs()) {
     MulCandidate *LHSMul = Pair.first;
     MulCandidate *RHSMul = Pair.second;
-    LLVM_DEBUG(dbgs() << "Muls:\n"
-               << "- " << *LHSMul->Root << "\n"
-               << "- " << *RHSMul->Root << "\n");
     LoadInst *BaseLHS = LHSMul->getBaseLoad();
     LoadInst *BaseRHS = RHSMul->getBaseLoad();
     LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
@@ -724,14 +732,25 @@ LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
   // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
   // TODO: Support big-endian as well.
   Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
-  BaseSExt->setOperand(0, Bottom);
+  Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
+  BaseSExt->replaceAllUsesWith(NewBaseSExt);
 
   IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
   Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
   Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
   Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
-  OffsetSExt->setOperand(0, Trunc);
-
+  Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
+  OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
+
+  LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
+             << *Base << "\n" << *Offset << "\n"
+             << "Created Wide Load:\n"
+             << *WideLoad << "\n"
+             << *Bottom << "\n"
+             << *NewBaseSExt << "\n"
+             << *Top << "\n"
+             << *Trunc << "\n"
+             << *NewOffsetSExt << "\n");
   WideLoads.emplace(std::make_pair(Base,
                                    std::make_unique<WidenedLoad>(Loads, WideLoad)));
   return WideLoad;
@@ -744,6 +763,6 @@ Pass *llvm::createARMParallelDSPPass() {
 char ARMParallelDSP::ID = 0;
 
 INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
-                "Transform loops to use DSP intrinsics", false, false)
+                "Transform functions to use DSP intrinsics", false, false)
 INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
-                "Transform loops to use DSP intrinsics", false, false)
+                "Transform functions to use DSP intrinsics", false, false)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 7ea842baa5e1d3..21e75d55a8c032 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1176,18 +1176,17 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
     return false;
   }
 
-  // TODO: handle reductions when tail is folded by masking.
-  if (!Reductions.empty()) {
-    reportVectorizationFailure(
-        "Loop has reductions, cannot fold tail by masking",
-        "Cannot fold tail by masking in the presence of reductions.",
-        "ReductionFoldingTailByMasking", ORE, TheLoop);
-    return false;
-  }
+  SmallPtrSet<const Value *, 8> ReductionLiveOuts;
 
-  // TODO: handle outside users when tail is folded by masking.
+  for (auto &Reduction : *getReductionVars())
+    ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
+
+  // TODO: handle non-reduction outside users when tail is folded by masking.
   for (auto *AE : AllowedExit) {
-    // Check that all users of allowed exit values are inside the loop.
+    // Check that all users of allowed exit values are inside the loop or
+    // are the live-out of a reduction.
+    if (ReductionLiveOuts.count(AE))
+      continue;
     for (User *U : AE->users()) {
       Instruction *UI = cast<Instruction>(U);
       if (TheLoop->contains(UI))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 870ac70057107c..478174f8251c1c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3678,6 +3678,26 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
 
   setDebugLocFromInst(Builder, LoopExitInst);
 
+  // If tail is folded by masking, the vector value to leave the loop should be
+  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
+  // instead of the former.
+  if (Cost->foldTailByMasking()) {
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Value *VecLoopExitInst =
+          VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+      Value *Sel = nullptr;
+      for (User *U : VecLoopExitInst->users()) {
+        if (isa<SelectInst>(U)) {
+          assert(!Sel && "Reduction exit feeding two selects");
+          Sel = U;
+        } else
+          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
+      }
+      assert(Sel && "Reduction exit feeds no select");
+      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
+    }
+  }
+
   // If the vector reduction can be performed in a smaller type, we truncate
   // then extend the loop exit value to enable InstCombine to evaluate the
   // entire expression in the smaller type.
@@ -6939,8 +6959,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
 
   // If the tail is to be folded by masking, the primary induction variable
   // needs to be represented in VPlan for it to model early-exit masking.
-  if (CM.foldTailByMasking())
+  // Also, both the Phi and the live-out instruction of each reduction are
+  // required in order to introduce a select between them in VPlan.
+  if (CM.foldTailByMasking()) {
     NeedDef.insert(Legal->getPrimaryInduction());
+    for (auto &Reduction : *Legal->getReductionVars()) {
+      NeedDef.insert(Reduction.first);
+      NeedDef.insert(Reduction.second.getLoopExitInstr());
+    }
+  }
 
   // Collect instructions from the original loop that will become trivially dead
   // in the vectorized loop. We don't need to vectorize these instructions. For
@@ -7067,6 +7094,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
   delete PreEntry;
 
+  // Finally, if tail is folded by masking, introduce selects between the phi
+  // and the live-out instruction of each reduction, at the end of the latch.
+  if (CM.foldTailByMasking()) {
+    Builder.setInsertPoint(VPBB);
+    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
+    for (auto &Reduction : *Legal->getReductionVars()) {
+      VPValue *Phi = Plan->getVPValue(Reduction.first);
+      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
+      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
+    }
+  }
+
   std::string PlanName;
   raw_string_ostream RSO(PlanName);
   unsigned VF = Range.Start;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 517d759d7bfce4..14adb478cd8636 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -309,6 +309,14 @@ void VPInstruction::generateInstruction(VPTransformState &State,
     State.set(this, V, Part);
     break;
   }
+  case Instruction::Select: {
+    Value *Cond = State.get(getOperand(0), Part);
+    Value *Op1 = State.get(getOperand(1), Part);
+    Value *Op2 = State.get(getOperand(2), Part);
+    Value *V = Builder.CreateSelect(Cond, Op1, Op2);
+    State.set(this, V, Part);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index ec96f055a056d1..c528f5d0ceee92 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -37,8 +37,7 @@
 ; CHECK-NEXT:      Scalar Evolution Analysis
 ; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:      Function Alias Analysis Results
-; CHECK-NEXT:      Loop Pass Manager
-; CHECK-NEXT:        Transform loops to use DSP intrinsics
+; CHECK-NEXT:      Transform functions to use DSP intrinsics
 ; CHECK-NEXT:      Interleaved Access Pass
 ; CHECK-NEXT:      ARM IR optimizations
 ; CHECK-NEXT:      Dominator Tree Construction
diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll
new file mode 100644
index 00000000000000..d9dbd960974248
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll
@@ -0,0 +1,79 @@
+; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: single_block
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)
+define i32 @single_block(i16* %a, i16* %b, i32 %acc) { ; a[0]*b[0] + a[1]*b[1] + acc, all in one basic block
+entry:
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0 ; a[0] * b[0]
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1 ; a[1] * b[1]
+  %add = add i32 %mul.0, %mul.1 ; sum of the two products
+  %res = add i32 %add, %acc ; incoming accumulator is added in the same block
+  ret i32 %res
+}
+
+; CHECK-LABEL: multi_block
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK:  call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
+define i32 @multi_block(i16* %a, i16* %b, i32 %acc) { ; same arithmetic as single_block, but %acc is added in a second block
+entry:
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0 ; a[0] * b[0]
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1 ; a[1] * b[1]
+  %add = add i32 %mul.0, %mul.1 ; both products and their sum live in entry
+  br label %bb.1
+
+bb.1:
+  %res = add i32 %add, %acc ; the accumulator add sits in a different block from the muls
+  ret i32 %res
+}
+
+; CHECK-LABEL: multi_block_1
+; CHECK-NOT: call i32 @llvm.arm.smlad
+define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) { ; the two products are computed in different basic blocks
+entry:
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0 ; a[0] * b[0], computed in entry
+  br label %bb.1
+
+bb.1:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1 ; a[1] * b[1], computed in bb.1
+  %add = add i32 %mul.0, %mul.1 ; adds a product from entry to one from bb.1
+  %res = add i32 %add, %acc
+  ret i32 %res
+}
+
diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll
new file mode 100644
index 00000000000000..c072df49cdf2dc
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll
@@ -0,0 +1,329 @@
+; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: exchange_1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
+define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) { ; cross products: a[0]*b[1] + a[1]*b[0] + acc
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.1 ; a[0] * b[1]
+  %mul.1 = mul i32 %sext.a.1, %sext.b.0 ; a[1] * b[0]
+  %add = add i32 %mul.0, %mul.1 ; mul.0 is the first add operand
+  %res = add i32 %add, %acc
+  ret i32 %res
+}
+
+; CHECK-LABEL: exchange_2
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
+define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) { ; like exchange_1, with the mul operands commuted (b first)
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.b.1, %sext.a.0 ; b[1] * a[0]
+  %mul.1 = mul i32 %sext.b.0, %sext.a.1 ; b[0] * a[1]
+  %add = add i32 %mul.0, %mul.1 ; mul.0 is the first add operand
+  %res = add i32 %add, %acc
+  ret i32 %res
+}
+
+; CHECK-LABEL: exchange_3
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
+define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) { ; like exchange_1, with the add operands swapped
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.1 ; a[0] * b[1]
+  %mul.1 = mul i32 %sext.a.1, %sext.b.0 ; a[1] * b[0]
+  %add = add i32 %mul.1, %mul.0 ; mul.1 is the first add operand here
+  %res = add i32 %add, %acc
+  ret i32 %res
+}
+
+; CHECK-LABEL: exchange_4
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
+define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) { ; like exchange_2, with the add operands swapped
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.b.1, %sext.a.0 ; b[1] * a[0]
+  %mul.1 = mul i32 %sext.b.0, %sext.a.1 ; b[0] * a[1]
+  %add = add i32 %mul.1, %mul.0 ; mul.1 is the first add operand here
+  %res = add i32 %add, %acc
+  ret i32 %res
+}
+
+; CHECK-LABEL: exchange_multi_use_1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
+; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
+; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
+; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]])
+define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) { ; two product pairs that both reuse the b[0]/b[1] loads
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.1 ; a[0] * b[1]
+  %mul.1 = mul i32 %sext.a.1, %sext.b.0 ; a[1] * b[0]
+  %add = add i32 %mul.0, %mul.1 ; first pair: crossed indices
+  %addr.a.2 = getelementptr i16, i16* %a, i32 2
+  %addr.a.3 = getelementptr i16, i16* %a, i32 3
+  %ld.a.2 = load i16, i16* %addr.a.2
+  %ld.a.3 = load i16, i16* %addr.a.3
+  %sext.a.2 = sext i16 %ld.a.2 to i32
+  %sext.a.3 = sext i16 %ld.a.3 to i32
+  %mul.2 = mul i32 %sext.a.3, %sext.b.1 ; a[3] * b[1], second use of %sext.b.1
+  %mul.3 = mul i32 %sext.a.2, %sext.b.0 ; a[2] * b[0], second use of %sext.b.0
+  %add.1 = add i32 %mul.2, %mul.3 ; second pair
+  %add.2 = add i32 %add, %add.1 ; combine both pairs
+  %res = add i32 %add.2, %acc
+  ret i32 %res
+}
+
+; CHECK-LABEL: exchange_multi_use_2
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
+; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
+; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]])
+define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) { ; one straight pair and one crossed pair sharing the b loads
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0 ; a[0] * b[0]
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1 ; a[1] * b[1]
+  %add = add i32 %mul.0, %mul.1 ; first pair: matching indices
+  %addr.a.2 = getelementptr i16, i16* %a, i32 2
+  %addr.a.3 = getelementptr i16, i16* %a, i32 3
+  %ld.a.2 = load i16, i16* %addr.a.2
+  %ld.a.3 = load i16, i16* %addr.a.3
+  %sext.a.2 = sext i16 %ld.a.2 to i32
+  %sext.a.3 = sext i16 %ld.a.3 to i32
+  %mul.2 = mul i32 %sext.b.0, %sext.a.3 ; b[0] * a[3], second use of %sext.b.0
+  %mul.3 = mul i32 %sext.b.1, %sext.a.2 ; b[1] * a[2], second use of %sext.b.1
+  %add.1 = add i32 %mul.2, %mul.3 ; second pair: b low element meets a high element
+  %add.2 = add i32 %add, %add.1 ; combine both pairs
+  %res = add i32 %add.2, %acc
+  ret i32 %res
+}
+
+; TODO: Why aren't two intrinsics generated?
+; CHECK-LABEL: exchange_multi_use_3
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
+; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
+; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
+; CHECK-NOT: call i32 @llvm.arm.smlad
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0
+define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) { ; two pairs sharing the b loads, joined by a sub instead of an add
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %addr.a.2 = getelementptr i16, i16* %a, i32 2
+  %addr.a.3 = getelementptr i16, i16* %a, i32 3
+  %ld.a.2 = load i16, i16* %addr.a.2
+  %ld.a.3 = load i16, i16* %addr.a.3
+  %sext.a.2 = sext i16 %ld.a.2 to i32
+  %sext.a.3 = sext i16 %ld.a.3 to i32
+  %mul.2 = mul i32 %sext.b.0, %sext.a.3 ; b[0] * a[3]
+  %mul.3 = mul i32 %sext.b.1, %sext.a.2 ; b[1] * a[2]
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0 ; a[0] * b[0]
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1 ; a[1] * b[1]
+  %add = add i32 %mul.0, %mul.1 ; matching-index pair, defined first
+  %add.1 = add i32 %mul.2, %mul.3 ; crossed-index pair, defined second
+  %sub = sub i32 %add, %add.1 ; the pairs are subtracted, not added
+  %res = add i32 %acc, %sub
+  ret i32 %res
+}
+
+; TODO: Why isn't smladx generated too?
+; CHECK-LABEL: exchange_multi_use_4
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0
+; CHECK-NOT: call i32 @llvm.arm.smlad
+define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) { ; as exchange_multi_use_3, but %add.1 is defined before %add
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %addr.a.2 = getelementptr i16, i16* %a, i32 2
+  %addr.a.3 = getelementptr i16, i16* %a, i32 3
+  %ld.a.2 = load i16, i16* %addr.a.2
+  %ld.a.3 = load i16, i16* %addr.a.3
+  %sext.a.2 = sext i16 %ld.a.2 to i32
+  %sext.a.3 = sext i16 %ld.a.3 to i32
+  %mul.2 = mul i32 %sext.b.0, %sext.a.3 ; b[0] * a[3]
+  %mul.3 = mul i32 %sext.b.1, %sext.a.2 ; b[1] * a[2]
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0 ; a[0] * b[0]
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1 ; a[1] * b[1]
+  %add.1 = add i32 %mul.2, %mul.3 ; crossed-index pair, defined first
+  %add = add i32 %mul.0, %mul.1 ; matching-index pair, defined second
+  %sub = sub i32 %add, %add.1 ; the pairs are subtracted, not added
+  %res = add i32 %acc, %sub
+  ret i32 %res
+}
+
+; CHECK-LABEL: exchange_swap
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
+define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) { ; cross products where the first mul uses the high a element
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.1, %sext.b.0 ; a[1] * b[0]
+  %mul.1 = mul i32 %sext.a.0, %sext.b.1 ; a[0] * b[1]
+  %add = add i32 %mul.0, %mul.1 ; mul.0 is the first add operand
+  %res = add i32 %add, %acc
+  ret i32 %res
+}
+
+; CHECK-LABEL: exchange_swap_2
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
+define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) { ; like exchange_swap, with the add operands swapped
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.1, %sext.b.0 ; a[1] * b[0]
+  %mul.1 = mul i32 %sext.a.0, %sext.b.1 ; a[0] * b[1]
+  %add = add i32 %mul.1, %mul.0 ; mul.1 is the first add operand here
+  %res = add i32 %add, %acc
+  ret i32 %res
+}
+
+; CHECK-LABEL: exchange_swap_3
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
+define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) { ; like exchange_swap_2, with the mul operands commuted (b first)
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.b.0, %sext.a.1 ; b[0] * a[1]
+  %mul.1 = mul i32 %sext.b.1, %sext.a.0 ; b[1] * a[0]
+  %add = add i32 %mul.1, %mul.0 ; mul.1 is the first add operand here
+  %res = add i32 %add, %acc
+  ret i32 %res
+}
diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll
new file mode 100644
index 00000000000000..a071ec3e748f2b
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll
@@ -0,0 +1,172 @@
+; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: overlap_1
+; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
+; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32*
+; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]]
+; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32*
+; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 %acc)
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC]])
+; CHECK: ret i32 [[RES]]
+define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) { ; three products where the middle one, %mul.1, feeds two adds
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0 ; a[0] * b[0]
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1 ; a[1] * b[1]
+  %addr.a.2 = getelementptr i16, i16* %a, i32 2
+  %addr.b.2 = getelementptr i16, i16* %b, i32 2
+  %ld.a.2 = load i16, i16* %addr.a.2
+  %ld.b.2 = load i16, i16* %addr.b.2
+  %sext.a.2 = sext i16 %ld.a.2 to i32
+  %sext.b.2 = sext i16 %ld.b.2 to i32
+  %mul.2 = mul i32 %sext.a.2, %sext.b.2 ; a[2] * b[2]
+  %add = add i32 %mul.0, %mul.1 ; elements 0 and 1
+  %add.1 = add i32 %mul.1, %mul.2 ; elements 1 and 2; %mul.1 is used a second time
+  %add.2 = add i32 %add.1, %add ; %add.1 is the first operand
+  %res = add i32 %add.2, %acc
+  ret i32 %res
+}
+
+; CHECK-LABEL: overlap_2
+; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
+; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[ACC1:%[^ ]+]] = add i32 %mul.1, %acc
+; CHECK: [[ACC2:%[^ ]+]] = add i32 %mul.2, [[ACC1]]
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC2]])
+; CHECK: ret i32 [[RES]]
+define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) { ; like overlap_1, with %mul.2 commuted and the %add.2 operands swapped
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0 ; a[0] * b[0]
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1 ; a[1] * b[1]
+  %addr.a.2 = getelementptr i16, i16* %a, i32 2
+  %addr.b.2 = getelementptr i16, i16* %b, i32 2
+  %ld.a.2 = load i16, i16* %addr.a.2
+  %ld.b.2 = load i16, i16* %addr.b.2
+  %sext.a.2 = sext i16 %ld.a.2 to i32
+  %sext.b.2 = sext i16 %ld.b.2 to i32
+  %mul.2 = mul i32 %sext.b.2, %sext.a.2 ; b[2] * a[2]
+  %add = add i32 %mul.0, %mul.1 ; elements 0 and 1
+  %add.1 = add i32 %mul.1, %mul.2 ; elements 1 and 2; %mul.1 is used a second time
+  %add.2 = add i32 %add, %add.1 ; %add is the first operand here
+  %res = add i32 %add.2, %acc
+  ret i32 %res
+}
+
+; CHECK-LABEL: overlap_3
+; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
+; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
+; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
+; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
+; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
+; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
+; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
+define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) { ; two pairs whose b operands overlap: b[0],b[1] and b[1],b[2]
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0 ; a[0] * b[0]
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1 ; a[1] * b[1]
+  %addr.a.2 = getelementptr i16, i16* %a, i32 2
+  %addr.b.2 = getelementptr i16, i16* %b, i32 2
+  %addr.a.3 = getelementptr i16, i16* %a, i32 3
+  %ld.a.2 = load i16, i16* %addr.a.2
+  %ld.b.2 = load i16, i16* %addr.b.2
+  %ld.a.3 = load i16, i16* %addr.a.3
+  %sext.a.2 = sext i16 %ld.a.2 to i32
+  %sext.b.2 = sext i16 %ld.b.2 to i32
+  %sext.a.3 = sext i16 %ld.a.3 to i32
+  %mul.2 = mul i32 %sext.a.2, %sext.b.1 ; a[2] * b[1], second use of %sext.b.1
+  %mul.3 = mul i32 %sext.a.3, %sext.b.2 ; a[3] * b[2]
+  %add = add i32 %mul.0, %mul.1 ; first pair
+  %add.1 = add i32 %mul.2, %mul.3 ; second pair
+  %add.2 = add i32 %add.1, %add
+  %res = add i32 %add.2, %acc
+  ret i32 %res
+}
+
+; CHECK-LABEL: overlap_4
+; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
+; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
+; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
+; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
+; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
+; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
+; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
+define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) { ; like overlap_3, but the second pair crosses indices: b[2]*a[2], b[1]*a[3]
+entry:
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0 ; a[0] * b[0]
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1 ; a[1] * b[1]
+  %addr.a.2 = getelementptr i16, i16* %a, i32 2
+  %addr.b.2 = getelementptr i16, i16* %b, i32 2
+  %addr.a.3 = getelementptr i16, i16* %a, i32 3
+  %ld.a.2 = load i16, i16* %addr.a.2
+  %ld.b.2 = load i16, i16* %addr.b.2
+  %ld.a.3 = load i16, i16* %addr.a.3
+  %sext.a.2 = sext i16 %ld.a.2 to i32
+  %sext.b.2 = sext i16 %ld.b.2 to i32
+  %sext.a.3 = sext i16 %ld.a.3 to i32
+  %mul.2 = mul i32 %sext.b.2, %sext.a.2 ; b[2] * a[2]
+  %mul.3 = mul i32 %sext.b.1, %sext.a.3 ; b[1] * a[3], second use of %sext.b.1
+  %add = add i32 %mul.0, %mul.1 ; first pair
+  %add.1 = add i32 %mul.2, %mul.3 ; second pair
+  %add.2 = add i32 %add.1, %add
+  %res = add i32 %add.2, %acc
+  ret i32 %res
+}
diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/pr42729.ll b/llvm/test/CodeGen/ARM/ParallelDSP/pr42729.ll
index e422eadd20c8ce..9f032cd24857c9 100644
--- a/llvm/test/CodeGen/ARM/ParallelDSP/pr42729.ll
+++ b/llvm/test/CodeGen/ARM/ParallelDSP/pr42729.ll
@@ -9,7 +9,7 @@
 ; CHECK: [[GEP16:%[^ ]+]] = getelementptr i16, i16* [[CAST_GEP8]], i32 6
 ; CHECK: [[CAST_GEP16:%[^ ]+]] = bitcast i16* [[GEP16]] to i32*
 ; CHECK: [[LOAD_UNDEF:%[^ ]+]] = load i32, i32* [[CAST_GEP16]], align 2
-; CHECK: call i32 @llvm.arm.smladx(i32 [[LOAD_A]], i32 [[LOAD_UNDEF]], i32 undef)
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LOAD_UNDEF]], i32 [[LOAD_A]], i32 undef)
 define void @undef_no_return(i16* %a) {
 entry:
   %incdec.ptr21 = getelementptr inbounds i16, i16* %a, i32 3
@@ -48,7 +48,7 @@ for.body:
 ; CHECK: [[GEP16:%[^ ]+]] = getelementptr i16, i16* [[CAST_GEP8]], i32 %iv
 ; CHECK: [[CAST_GEP16:%[^ ]+]] = bitcast i16* [[GEP16]] to i32*
 ; CHECK: [[LOAD_B:%[^ ]+]] = load i32, i32* [[CAST_GEP16]], align 2
-; CHECK: [[ACC_NEXT]] = call i32 @llvm.arm.smladx(i32 [[LOAD_A]], i32 [[LOAD_B]], i32 [[ACC]])
+; CHECK: [[ACC_NEXT]] = call i32 @llvm.arm.smladx(i32 [[LOAD_B]], i32 [[LOAD_A]], i32 [[ACC]])
 define i32 @return(i16* %a, i8* %b, i32 %N) {
 entry:
   %incdec.ptr21 = getelementptr inbounds i16, i16* %a, i32 3
diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll
new file mode 100644
index 00000000000000..a2f4745c68aef7
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll
@@ -0,0 +1,145 @@
+; RUN: opt -mtriple=thumbv7-unknown-linux-gnueabihf -arm-parallel-dsp -dce %s -S -o - | FileCheck %s
+
+; CHECK-LABEL: first_mul_invalid
+; CHECK: [[ADDR_IN_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -1
+; CHECK: [[LD_IN_MINUS_1:%[^ ]+]] = load i16, i16* [[ADDR_IN_MINUS_1]], align 2
+; CHECK: [[IN_MINUS_1:%[^ ]+]] = sext i16 [[LD_IN_MINUS_1]] to i32
+; CHECK: [[ADDR_B_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 1
+; CHECK: [[LD_B_PLUS_1:%[^ ]+]] = load i16, i16* [[ADDR_B_PLUS_1]], align 2
+; CHECK: [[B_PLUS_1:%[^ ]+]] = sext i16 [[LD_B_PLUS_1]] to i32
+; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[B_PLUS_1]], [[IN_MINUS_1]]
+; CHECK: [[ADD0:%[^ ]+]] = add i32 [[MUL0]], %call
+; CHECK: [[ADDR_IN_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -3
+; CHECK: [[CAST_ADDR_IN_MINUS_3:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_3]] to i32*
+; CHECK: [[IN_MINUS_3:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_3]], align 2
+; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2
+; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*
+; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2
+; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5
+; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*
+; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2
+; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4
+; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*
+; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ADD0]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]])
+; CHECK: ret i32 [[RES]]
+define i32 @first_mul_invalid(i16* nocapture readonly %in, i16* nocapture readonly %b) {  ; returns bar(in[0], b[0]) + sum for k = 1..5 of b[k] * in[-k], each i16 element sign-extended to i32
+entry:
+  %0 = load i16, i16* %in, align 2
+  %conv = sext i16 %0 to i32
+  %1 = load i16, i16* %b, align 2
+  %conv2 = sext i16 %1 to i32
+  %call = tail call i32 @bar(i32 %conv, i32 %conv2)  ; bar(in[0], b[0]) seeds the accumulation chain
+  %arrayidx3 = getelementptr inbounds i16, i16* %in, i32 -1
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv4 = sext i16 %2 to i32
+  %arrayidx5 = getelementptr inbounds i16, i16* %b, i32 1
+  %3 = load i16, i16* %arrayidx5, align 2
+  %conv6 = sext i16 %3 to i32
+  %mul = mul nsw i32 %conv6, %conv4  ; b[1] * in[-1]; the expected output above keeps this one as a scalar mul (MUL0)
+  %add = add i32 %mul, %call  ; expected to survive as ADD0, the accumulator operand of the first smladx
+  %arrayidx7 = getelementptr inbounds i16, i16* %in, i32 -2
+  %4 = load i16, i16* %arrayidx7, align 2
+  %conv8 = sext i16 %4 to i32
+  %arrayidx9 = getelementptr inbounds i16, i16* %b, i32 2
+  %5 = load i16, i16* %arrayidx9, align 2
+  %conv10 = sext i16 %5 to i32
+  %mul11 = mul nsw i32 %conv10, %conv8  ; b[2] * in[-2]
+  %add12 = add i32 %add, %mul11
+  %arrayidx13 = getelementptr inbounds i16, i16* %in, i32 -3
+  %6 = load i16, i16* %arrayidx13, align 2
+  %conv14 = sext i16 %6 to i32
+  %arrayidx15 = getelementptr inbounds i16, i16* %b, i32 3
+  %7 = load i16, i16* %arrayidx15, align 2
+  %conv16 = sext i16 %7 to i32
+  %mul17 = mul nsw i32 %conv16, %conv14  ; b[3] * in[-3]; expected output pairs it with %mul11 via i32 loads from in-3 and b+2
+  %add18 = add i32 %add12, %mul17
+  %arrayidx19 = getelementptr inbounds i16, i16* %in, i32 -4
+  %8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = sext i16 %8 to i32
+  %arrayidx21 = getelementptr inbounds i16, i16* %b, i32 4
+  %9 = load i16, i16* %arrayidx21, align 2
+  %conv22 = sext i16 %9 to i32
+  %mul23 = mul nsw i32 %conv22, %conv20  ; b[4] * in[-4]
+  %add24 = add i32 %add18, %mul23
+  %arrayidx25 = getelementptr inbounds i16, i16* %in, i32 -5
+  %10 = load i16, i16* %arrayidx25, align 2
+  %conv26 = sext i16 %10 to i32
+  %arrayidx27 = getelementptr inbounds i16, i16* %b, i32 5
+  %11 = load i16, i16* %arrayidx27, align 2
+  %conv28 = sext i16 %11 to i32
+  %mul29 = mul nsw i32 %conv28, %conv26  ; b[5] * in[-5]; expected output pairs it with %mul23 via i32 loads from in-5 and b+4
+  %add30 = add i32 %add24, %mul29
+  ret i32 %add30
+}
+
+; CHECK-LABEL: with_no_acc_input
+; CHECK: [[ADDR_IN_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -1
+; CHECK: [[LD_IN_MINUS_1:%[^ ]+]] = load i16, i16* [[ADDR_IN_MINUS_1]], align 2
+; CHECK: [[IN_MINUS_1:%[^ ]+]] = sext i16 [[LD_IN_MINUS_1]] to i32
+; CHECK: [[ADDR_B_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 1
+; CHECK: [[LD_B_PLUS_1:%[^ ]+]] = load i16, i16* [[ADDR_B_PLUS_1]], align 2
+; CHECK: [[B_PLUS_1:%[^ ]+]] = sext i16 [[LD_B_PLUS_1]] to i32
+; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[B_PLUS_1]], [[IN_MINUS_1]]
+; CHECK: [[ADDR_IN_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -3
+; CHECK: [[CAST_ADDR_IN_MINUS_3:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_3]] to i32*
+; CHECK: [[IN_MINUS_3:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_3]], align 2
+; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2
+; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*
+; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2
+; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5
+; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*
+; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2
+; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4
+; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*
+; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[MUL0]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]])
+; CHECK: ret i32 [[RES]]
+define i32 @with_no_acc_input(i16* nocapture readonly %in, i16* nocapture readonly %b) {  ; returns the sum for k = 1..5 of b[k] * in[-k], each i16 element sign-extended to i32; no separate accumulator value enters the chain
+entry:
+  %arrayidx3 = getelementptr inbounds i16, i16* %in, i32 -1
+  %ld.2 = load i16, i16* %arrayidx3, align 2
+  %conv4 = sext i16 %ld.2 to i32
+  %arrayidx5 = getelementptr inbounds i16, i16* %b, i32 1
+  %ld.3 = load i16, i16* %arrayidx5, align 2
+  %conv6 = sext i16 %ld.3 to i32
+  %mul = mul nsw i32 %conv6, %conv4  ; b[1] * in[-1]; expected to stay scalar (MUL0) and become the accumulator operand of the first smladx
+  %arrayidx7 = getelementptr inbounds i16, i16* %in, i32 -2
+  %ld.4 = load i16, i16* %arrayidx7, align 2
+  %conv8 = sext i16 %ld.4 to i32
+  %arrayidx9 = getelementptr inbounds i16, i16* %b, i32 2
+  %ld.5 = load i16, i16* %arrayidx9, align 2
+  %conv10 = sext i16 %ld.5 to i32
+  %mul11 = mul nsw i32 %conv10, %conv8  ; b[2] * in[-2]
+  %add12 = add i32 %mul, %mul11  ; first add of the chain combines two products directly
+  %arrayidx13 = getelementptr inbounds i16, i16* %in, i32 -3
+  %ld.6 = load i16, i16* %arrayidx13, align 2
+  %conv14 = sext i16 %ld.6 to i32
+  %arrayidx15 = getelementptr inbounds i16, i16* %b, i32 3
+  %ld.7 = load i16, i16* %arrayidx15, align 2
+  %conv16 = sext i16 %ld.7 to i32
+  %mul17 = mul nsw i32 %conv16, %conv14  ; b[3] * in[-3]; expected output pairs it with %mul11 via i32 loads from in-3 and b+2
+  %add18 = add i32 %add12, %mul17
+  %arrayidx19 = getelementptr inbounds i16, i16* %in, i32 -4
+  %ld.8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = sext i16 %ld.8 to i32
+  %arrayidx21 = getelementptr inbounds i16, i16* %b, i32 4
+  %ld.9 = load i16, i16* %arrayidx21, align 2
+  %conv22 = sext i16 %ld.9 to i32
+  %mul23 = mul nsw i32 %conv22, %conv20  ; b[4] * in[-4]
+  %add24 = add i32 %add18, %mul23
+  %arrayidx25 = getelementptr inbounds i16, i16* %in, i32 -5
+  %ld.10 = load i16, i16* %arrayidx25, align 2
+  %conv26 = sext i16 %ld.10 to i32
+  %arrayidx27 = getelementptr inbounds i16, i16* %b, i32 5
+  %ld.11 = load i16, i16* %arrayidx27, align 2
+  %conv28 = sext i16 %ld.11 to i32
+  %mul29 = mul nsw i32 %conv28, %conv26  ; b[5] * in[-5]; expected output pairs it with %mul23 via i32 loads from in-5 and b+4
+  %add30 = add i32 %add24, %mul29
+  ret i32 %add30
+}
+
+declare dso_local i32 @bar(i32, i32) local_unnamed_addr
+
diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll
index b17106e70ed02e..22744be02b0b77 100644
--- a/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll
+++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll
@@ -12,8 +12,8 @@
 ; CHECK:  [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
 ; CHECK:  [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
 ; CHECK:  [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
-; CHECK:  [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)
-; CHECK:  [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[V12]])
+; CHECK:  [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 %mac1{{\.}}054)
+; CHECK:  [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 [[V12]])
 ;
 ; And we don't want to see a 3rd smlad:
 ; CHECK-NOT: call i32 @llvm.arm.smlad
diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlad12.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlad12.ll
index d4e09ca3fbb114..637fc3d37046b0 100644
--- a/llvm/test/CodeGen/ARM/ParallelDSP/smlad12.ll
+++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlad12.ll
@@ -2,7 +2,7 @@
 ;
 ; The loop header is not the loop latch.
 ;
-; CHECK-NOT:  call i32 @llvm.arm.smlad
+; CHECK:  call i32 @llvm.arm.smlad
 ;
 define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
 entry:
diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
index 971c85f1b665bf..07cc1b41ed26c2 100644
--- a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
+++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
@@ -195,8 +195,8 @@ for.cond.cleanup:
 ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
 ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
 
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
-; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN1]], i32 [[IN2_2]], i64 [[ACC1]])
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
+; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]])
 
 ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
index eb0b499f512eff..d9db8a243a3a43 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -78,6 +78,62 @@ for.body:
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
 }
 
+; Check that fold tail under optsize passes the reduction live-out value
+; through a select.
+; int reduction_i32(int *A, int *B, int N) {
+;   int sum = 0;
+;   for (int i = 0; i < N; ++i)
+;     sum += (A[i] + B[i]);
+;   return sum;
+; }
+;
+define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B, i32 %N) #0 {
+; CHECK-LABEL: @reduction_i32(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[ACCUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ [[ACCUM:%.*]], %vector.body ]
+; CHECK:         [[ICMPULE:%.*]] = icmp ule <8 x i64>
+; CHECK:         [[LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
+; CHECK:         [[LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <8 x i32> [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT:    [[ACCUM]] = add nuw nsw <8 x i32> [[ADD]], [[ACCUM_PHI]]
+; CHECK:         [[LIVEOUT:%.*]] = select <8 x i1> [[ICMPULE]], <8 x i32> [[ACCUM]], <8 x i32> [[ACCUM_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[LIVEOUT]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[LIVEOUT]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX5:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[RDX_SHUF6:%.*]] = shufflevector <8 x i32> [[BIN_RDX5]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX7:%.*]] = add <8 x i32> [[BIN_RDX5]], [[RDX_SHUF6]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[BIN_RDX7]], i32 0
+; CHECK-NEXT:    br i1 true, label %for.cond.cleanup, label %scalar.ph
+; CHECK:       scalar.ph:
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi i32 [ {{.*}}, %for.body ], [ [[TMP17]], %middle.block ]
+; CHECK-NEXT:    ret i32 [[SUM_1_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]  ; 64-bit induction variable i, starting at 0
+  %sum.0 = phi i32 [ %sum.1, %for.body ], [ 0, %entry ]  ; running sum carried around the loop, starting at 0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidxA = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidxA, align 4  ; A[i]
+  %arrayidxB = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidxB, align 4  ; B[i]
+  %add = add nsw i32 %1, %0
+  %sum.1 = add nuw nsw i32 %add, %sum.0  ; sum += A[i] + B[i]; this value is live out of the loop
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N  ; leave the loop once the truncated next index equals %N
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %sum.1  ; the reduction result is the only value used after the loop
+}
+
 ; CHECK:      !0 = distinct !{!0, !1}
 ; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK-NEXT: !2 = distinct !{!2, !3, !1}