From 0c9366b6c609c04e0281a637dc50af0c77f47004 Mon Sep 17 00:00:00 2001
From: Hartmut Kaiser <hartmut.kaiser@gmail.com>
Date: Tue, 21 Jul 2015 10:39:36 -0500
Subject: [PATCH 1/4] Streamlining transpose examples, adding transpose_await

---
 examples/transpose/CMakeLists.txt          |   6 +
 examples/transpose/transpose.cpp           |  55 +--
 examples/transpose/transpose_await.cpp     | 460 +++++++++++++++++++++
 examples/transpose/transpose_smp.cpp       |   6 +-
 examples/transpose/transpose_smp_block.cpp |   3 +-
 5 files changed, 500 insertions(+), 30 deletions(-)
 create mode 100644 examples/transpose/transpose_await.cpp
diff --git a/examples/transpose/CMakeLists.txt b/examples/transpose/CMakeLists.txt
index 91a8daf56fed..21787d2b7fc6 100644
--- a/examples/transpose/CMakeLists.txt
+++ b/examples/transpose/CMakeLists.txt
@@ -13,6 +13,12 @@ set(example_programs
   transpose_serial_vector
 )
 
+if(HPX_WITH_AWAIT)
+  set(example_programs
+    ${example_programs}
+    transpose_await)
+endif()
+
 foreach(example_program ${example_programs})
 
   set(sources ${example_program}.cpp)
diff --git a/examples/transpose/transpose.cpp b/examples/transpose/transpose.cpp
index 7b80640c5721..3d50a79f1cb7 100644
--- a/examples/transpose/transpose.cpp
+++ b/examples/transpose/transpose.cpp
@@ -5,8 +5,6 @@
 
 #include <hpx/hpx_init.hpp>
 #include <hpx/hpx.hpp>
-#include <hpx/runtime/serialization/serialize.hpp>
-#include <hpx/lcos/local/detail/invoke_when_ready.hpp>
 
 #include <hpx/include/parallel_algorithm.hpp>
 #include <hpx/include/parallel_numeric.hpp>
@@ -172,8 +170,7 @@ typedef block_component::get_sub_block_action get_sub_block_action;
 HPX_REGISTER_ACTION(get_sub_block_action);
 
 void transpose(hpx::future<sub_block> A, hpx::future<sub_block> B,
-    hpx::future<boost::uint64_t> block_order,
-    hpx::future<boost::uint64_t> tile_size);
+    boost::uint64_t block_order, boost::uint64_t tile_size);
 double test_results(boost::uint64_t order, boost::uint64_t block_order,
     std::vector<block> & trans, boost::uint64_t blocks_start,
     boost::uint64_t blocks_end);
@@ -214,7 +211,7 @@ int hpx_main(boost::program_options::variables_map& vm)
         boost::uint64_t blocks_end = (id + 1) * num_local_blocks;
 
         // Actually allocate the block components in AGAS
-        for(boost::uint64_t b = 0; b < num_blocks; ++b)
+        for(boost::uint64_t b = 0; b != num_blocks; ++b)
         {
             // Allocate block
             if(b >= blocks_start && b < blocks_end)
@@ -247,7 +244,6 @@ int hpx_main(boost::program_options::variables_map& vm)
         }
         using hpx::parallel::for_each;
         using hpx::parallel::par;
-        using hpx::parallel::task;
 
         // Fill the original matrix, set transpose to known garbage value.
         auto range = boost::irange(blocks_start, blocks_end);
@@ -259,9 +255,9 @@ int hpx_main(boost::program_options::variables_map& vm)
                 boost::shared_ptr<block_component> B_ptr =
                     hpx::get_ptr<block_component>(B[b].get_gid()).get();
 
-                for(boost::uint64_t i = 0; i < order; ++i)
+                for(boost::uint64_t i = 0; i != order; ++i)
                 {
-                    for(boost::uint64_t j = 0; j < block_order; ++j)
+                    for(boost::uint64_t j = 0; j != block_order; ++j)
                     {
                         double col_val = COL_SHIFT * (b*block_order + j);
                         A_ptr->data_[i * block_order + j] = col_val + ROW_SHIFT * i;
@@ -282,15 +278,17 @@ int hpx_main(boost::program_options::variables_map& vm)
 
             auto range = boost::irange(blocks_start, blocks_end);
 
-            std::vector<hpx::shared_future<void> > block_futures;
+            std::vector<hpx::future<void> > block_futures;
             block_futures.resize(num_local_blocks);
 
             for_each(par, boost::begin(range), boost::end(range),
                 [&](boost::uint64_t b)
                 {
                     std::vector<hpx::future<void> > phase_futures;
-                    //phase_futures.resize(num_local_blocks);
-                    auto phase_range = boost::irange(static_cast<boost::uint64_t>(0), num_blocks);
+                    phase_futures.reserve(num_blocks);
+
+                    auto phase_range = boost::irange(
+                        static_cast<boost::uint64_t>(0), num_blocks);
                     for(boost::uint64_t phase: phase_range)
                     {
                         const boost::uint64_t block_size = block_order * block_order;
@@ -298,19 +296,20 @@ int hpx_main(boost::program_options::variables_map& vm)
                         const boost::uint64_t from_phase = b;
                         const boost::uint64_t A_offset = from_phase * block_size;
                         const boost::uint64_t B_offset = phase * block_size;
+
                         phase_futures.push_back(
                             hpx::lcos::local::dataflow(
                                 &transpose
                               , A[from_block].get_sub_block(A_offset, block_size)
                               , B[b].get_sub_block(B_offset, block_size)
-                              , hpx::make_ready_future(block_order)
-                              , hpx::make_ready_future(tile_size)
+                              , block_order
+                              , tile_size
                             )
                         );
                     }
 
                     block_futures[b - blocks_start] =
-                        hpx::when_all(phase_futures).share();
+                        hpx::when_all(phase_futures);
                 }
             );
 
@@ -389,22 +388,23 @@ int main(int argc, char* argv[])
 }
 
 void transpose(hpx::future<sub_block> Af, hpx::future<sub_block> Bf,
-    hpx::future<boost::uint64_t> block_order_fut,
-    hpx::future<boost::uint64_t> tile_size_fut)
+    boost::uint64_t block_order, boost::uint64_t tile_size)
 {
     const sub_block A(Af.get());
     sub_block B(Bf.get());
-    boost::uint64_t block_order(block_order_fut.get());
-    boost::uint64_t tile_size(tile_size_fut.get());
+
     if(tile_size < block_order)
     {
         for(boost::uint64_t i = 0; i < block_order; i += tile_size)
         {
             for(boost::uint64_t j = 0; j < block_order; j += tile_size)
             {
-                for(boost::uint64_t it = i; it < (std::min)(block_order, i + tile_size); ++it)
+                boost::uint64_t max_i = (std::min)(block_order, i + tile_size);
+                boost::uint64_t max_j = (std::min)(block_order, j + tile_size);
+
+                for(boost::uint64_t it = i; it != max_i; ++it)
                 {
-                    for(boost::uint64_t jt = j; jt < (std::min)(block_order, j + tile_size); ++jt)
+                    for(boost::uint64_t jt = j; jt != max_j; ++jt)
                     {
                         B[it + block_order * jt] = A[jt + block_order * it];
                     }
@@ -414,9 +414,9 @@ void transpose(hpx::future<sub_block> Af, hpx::future<sub_block> Bf,
     }
     else
     {
-        for(boost::uint64_t i = 0; i < block_order; ++i)
+        for(boost::uint64_t i = 0; i != block_order; ++i)
         {
-            for(boost::uint64_t j = 0; j < block_order; ++j)
+            for(boost::uint64_t j = 0; j != block_order; ++j)
             {
                 B[i + block_order * j] = A[j + block_order * i];
             }
@@ -428,7 +428,7 @@ double test_results(boost::uint64_t order, boost::uint64_t block_order,
     std::vector<block> & trans, boost::uint64_t blocks_start,
     boost::uint64_t blocks_end)
 {
-    using hpx::parallel::for_each;
+    using hpx::parallel::transform_reduce;
     using hpx::parallel::par;
 
     // Fill the original matrix, set transpose to known garbage value.
@@ -437,7 +437,8 @@ double test_results(boost::uint64_t order, boost::uint64_t block_order,
         transform_reduce(par, boost::begin(range), boost::end(range),
             [&](boost::uint64_t b) -> double
             {
-                sub_block trans_block = trans[b].get_sub_block(0, order * block_order).get();
+                sub_block trans_block =
+                    trans[b].get_sub_block(0, order * block_order).get();
                 double errsq = 0.0;
                 for(boost::uint64_t i = 0; i < order; ++i)
                 {
@@ -445,7 +446,7 @@ double test_results(boost::uint64_t order, boost::uint64_t block_order,
                     for(boost::uint64_t j = 0; j < block_order; ++j)
                     {
                         double diff = trans_block[i * block_order + j] -
-                          (col_val + ROW_SHIFT * (b * block_order + j));
+                            (col_val + ROW_SHIFT * (b * block_order + j));
                         errsq += diff * diff;
                     }
                 }
diff --git a/examples/transpose/transpose_await.cpp b/examples/transpose/transpose_await.cpp
new file mode 100644
index 000000000000..43fe4fd46067
--- /dev/null
+++ b/examples/transpose/transpose_await.cpp
@@ -0,0 +1,460 @@
+//  Copyright (c) 2014 Thomas Heller
+//
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <hpx/hpx_init.hpp>
+#include <hpx/hpx.hpp>
+
+#include <hpx/include/parallel_algorithm.hpp>
+#include <hpx/include/parallel_numeric.hpp>
+
+#include <boost/range/irange.hpp>
+
+#include <algorithm>
+#include <vector>
+
+#define COL_SHIFT 1000.00           // Constant to shift column index
+#define ROW_SHIFT 0.001             // Constant to shift row index
+
+bool verbose = false;
+
+char const* A_block_basename = "/transpose/block/A";
+char const* B_block_basename = "/transpose/block/B";
+
+struct sub_block
+{
+    enum mode {
+        reference       // data_ points at storage this object does not free
+      , owning          // data_ was allocated in load(); freed by ~sub_block()
+    };
+
+    sub_block()
+      : size_(0)
+      , data_(0)
+      , mode_(reference)
+    {}
+
+    sub_block(double * data, boost::uint64_t size)
+      : size_(size)
+      , data_(data)
+      , mode_(reference)
+    {}
+
+    ~sub_block()
+    {
+        if(data_ && mode_ == owning)
+        {
+            delete[] data_;
+        }
+    }
+
+    sub_block(sub_block && other)
+      : size_(other.size_)
+      , data_(other.data_)
+      , mode_(other.mode_)
+    {
+        if(mode_ == owning) { other.data_ = 0; other.size_ = 0; }
+    }
+
+    sub_block & operator=(sub_block && other)
+    {
+        if(this == &other) return *this;    // self-move: keep our own buffer
+        if(data_ && mode_ == owning) delete[] data_;  // don't leak old buffer
+        size_ = other.size_; data_ = other.data_; mode_ = other.mode_;
+        if(mode_ == owning) { other.data_ = 0; other.size_ = 0; }
+
+        return *this;
+    }
+
+    double operator[](std::size_t i) const
+    {
+        HPX_ASSERT(data_);
+        return data_[i];
+    }
+
+    double & operator[](std::size_t i)
+    {
+        HPX_ASSERT(data_);
+        HPX_ASSERT(mode_ == reference);
+        return data_[i];
+    }
+
+    void load(hpx::serialization::input_archive & ar, unsigned version)
+    {
+        ar & size_;
+        if(size_ > 0)
+        {
+            data_ = new double[size_];
+            hpx::serialization::array<double> arr(data_, size_);
+            ar >> arr;
+            mode_ = owning;
+        }
+    }
+
+    void save(hpx::serialization::output_archive & ar, unsigned version) const
+    {
+        ar & size_;
+        if(size_ > 0)
+        {
+            hpx::serialization::array<double> arr(data_, size_);
+            ar << arr;
+        }
+    }
+
+    HPX_SERIALIZATION_SPLIT_MEMBER()
+
+    boost::uint64_t size_;
+    double * data_;
+    mode mode_;
+
+    HPX_MOVABLE_BUT_NOT_COPYABLE(sub_block);
+};
+
+struct block_component     // HPX component storing one block in data_
+  : hpx::components::simple_component_base<block_component>
+{
+    block_component() {}
+
+    block_component(boost::uint64_t size)
+      : data_(size)        // vector of `size` value-initialized doubles
+    {}
+
+    sub_block get_sub_block(boost::uint64_t offset, boost::uint64_t size)
+    {
+        HPX_ASSERT(!data_.empty());
+        return sub_block(&data_[offset], size);  // view into data_, no range check
+    }
+
+    HPX_DEFINE_COMPONENT_ACTION(block_component, get_sub_block);
+
+    std::vector<double> data_;
+};
+
+struct block               // client-side handle to a block_component
+  : hpx::components::client_base<block, block_component>
+{
+    typedef hpx::components::client_base<block, block_component> base_type;
+    block() {}
+
+    block(boost::uint64_t id, const char * base_name)   // look up existing block
+      : base_type(hpx::find_id_from_basename(base_name, id))
+    {
+        get_gid();  // NOTE(review): presumably waits for the lookup - confirm
+    }
+
+    block(boost::uint64_t id, boost::uint64_t size, const char * base_name)
+      : base_type(hpx::new_<block_component>(hpx::find_here(), size))
+    {
+        hpx::register_id_with_basename(base_name, get_gid(), id);  // publish
+    }
+
+    hpx::future<sub_block>
+        get_sub_block(boost::uint64_t offset, boost::uint64_t size) const
+    {
+        block_component::get_sub_block_action act;
+        return hpx::async(act, get_gid(), offset, size);   // asynchronous call
+    }
+};
+
+// The macros below are necessary to generate the code required for exposing
+// our block_component type remotely.
+//
+// HPX_REGISTER_COMPONENT() exposes the component creation
+// through hpx::new_<>().
+typedef hpx::components::simple_component<block_component> block_component_type;
+HPX_REGISTER_COMPONENT(block_component_type, block_component);
+
+// HPX_REGISTER_ACTION() exposes the component member function for remote
+// invocation.
+typedef block_component::get_sub_block_action get_sub_block_action;
+HPX_REGISTER_ACTION(get_sub_block_action);
+
+void transpose(sub_block const A, sub_block B,
+    boost::uint64_t block_order, boost::uint64_t tile_size);
+
+double test_results(boost::uint64_t order, boost::uint64_t block_order,
+    std::vector<block> & trans, boost::uint64_t blocks_start,
+    boost::uint64_t blocks_end);
+
+////////////////////////////////////////////////////////////////////////////////
+hpx::future<void> transpose_phase(
+    std::vector<block> const& A, std::vector<block>& B,
+    boost::uint64_t block_order, boost::uint64_t b,
+    boost::uint64_t num_blocks, boost::uint64_t num_local_blocks,
+    boost::uint64_t block_size, boost::uint64_t tile_size)
+{
+    const boost::uint64_t from_phase = b;
+    const boost::uint64_t A_offset = from_phase * block_size;  // same for all A
+
+    auto phase_range = boost::irange(
+        static_cast<boost::uint64_t>(0), num_blocks);
+    for(boost::uint64_t phase: phase_range)     // one sub-block pair per phase
+    {
+        const boost::uint64_t from_block = phase;
+        const boost::uint64_t B_offset = phase * block_size;
+
+        hpx::future<sub_block> from =           // source: sub-block b of A[phase]
+            A[from_block].get_sub_block(A_offset, block_size);
+        hpx::future<sub_block> to =             // target: sub-block phase of B[b]
+            B[b].get_sub_block(B_offset, block_size);
+
+        transpose(__await from, __await to, block_order, tile_size);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+int hpx_main(boost::program_options::variables_map& vm)
+{
+    {
+        hpx::id_type here = hpx::find_here();
+        bool root = here == hpx::find_root_locality();
+
+        boost::uint64_t num_localities = hpx::get_num_localities().get();
+
+        boost::uint64_t order = vm["matrix_size"].as<boost::uint64_t>();
+        boost::uint64_t iterations = vm["iterations"].as<boost::uint64_t>();
+        boost::uint64_t num_local_blocks = vm["num_blocks"].as<boost::uint64_t>();
+        boost::uint64_t tile_size = order;
+
+        if(vm.count("tile_size"))
+            tile_size = vm["tile_size"].as<boost::uint64_t>();
+
+        verbose = vm.count("verbose") ? true : false;
+
+        boost::uint64_t bytes =
+            static_cast<boost::uint64_t>(2.0 * sizeof(double) * order * order);
+
+        boost::uint64_t num_blocks = num_localities * num_local_blocks;
+
+        boost::uint64_t block_order = order / num_blocks;
+        boost::uint64_t col_block_size = order * block_order;
+
+        boost::uint64_t id = hpx::get_locality_id();
+
+        std::vector<block> A(num_blocks);
+        std::vector<block> B(num_blocks);
+
+        boost::uint64_t blocks_start = id * num_local_blocks;
+        boost::uint64_t blocks_end = (id + 1) * num_local_blocks;
+
+        // Actually allocate the block components in AGAS
+        for(boost::uint64_t b = 0; b != num_blocks; ++b)
+        {
+            // Allocate the blocks owned by this locality: [blocks_start, blocks_end)
+            if(b >= blocks_start && b < blocks_end)
+            {
+                A[b] = block(b, col_block_size, A_block_basename);
+                B[b] = block(b, col_block_size, B_block_basename);
+            }
+            // Retrieve the block by its symbolic name
+            else
+            {
+                A[b] = block(b, A_block_basename);
+                B[b] = block(b, B_block_basename);
+            }
+        }
+
+        if(root)
+        {
+            std::cout
+                << "Serial Matrix transpose: B = A^T\n"
+                << "Matrix order          = " << order << "\n"
+                << "Matrix local columns  = " << block_order << "\n"
+                << "Number of blocks      = " << num_blocks << "\n"
+                << "Number of localities  = " << num_localities << "\n";
+            if(tile_size < order)
+                std::cout << "Tile size             = " << tile_size << "\n";
+            else
+                std::cout << "Untiled\n";
+            std::cout
+                << "Number of iterations  = " << iterations << "\n";
+        }
+        using hpx::parallel::for_each;
+        using hpx::parallel::par;
+
+        // Fill the original matrix, set transpose to known garbage value.
+        auto range = boost::irange(blocks_start, blocks_end);
+        for_each(par, boost::begin(range), boost::end(range),
+            [&](boost::uint64_t b)
+            {
+                boost::shared_ptr<block_component> A_ptr =
+                    hpx::get_ptr<block_component>(A[b].get_gid()).get();
+                boost::shared_ptr<block_component> B_ptr =
+                    hpx::get_ptr<block_component>(B[b].get_gid()).get();
+
+                for(boost::uint64_t i = 0; i != order; ++i)
+                {
+                    for(boost::uint64_t j = 0; j != block_order; ++j)
+                    {
+                        double col_val = COL_SHIFT * (b*block_order + j);
+                        A_ptr->data_[i * block_order + j] = col_val + ROW_SHIFT * i;
+                        B_ptr->data_[i * block_order + j] = -1.0;
+                    }
+                }
+            }
+        );
+
+        double errsq = 0.0;
+        double avgtime = 0.0;
+        double maxtime = 0.0;
+        double mintime = 366.0 * 24.0*3600.0; // set the minimum time to a large value;
+                                              // one leap year should be enough
+        for(boost::uint64_t iter = 0; iter < iterations; ++iter)
+        {
+            hpx::util::high_resolution_timer t;
+
+            auto range = boost::irange(blocks_start, blocks_end);
+
+            const boost::uint64_t block_size = block_order * block_order;
+            for_each(par, boost::begin(range), boost::end(range),
+                [&](boost::uint64_t b)
+                {
+                    transpose_phase(A, B, block_order, b,
+                        num_blocks, num_local_blocks, block_size, tile_size
+                    );
+                });
+
+            double elapsed = t.elapsed();
+
+            if(iter > 0 || iterations == 1) // Skip the first iteration
+            {
+                avgtime = avgtime + elapsed;
+                maxtime = (std::max)(maxtime, elapsed);
+                mintime = (std::min)(mintime, elapsed);
+            }
+
+            if(root)
+                errsq += test_results(order, block_order, B, blocks_start, blocks_end);
+        } // end of iter loop
+
+        // Analyze and output results
+
+        double epsilon = 1.e-8;
+        if(root)
+        {
+            if(errsq < epsilon)
+            {
+                std::cout << "Solution validates\n";
+                avgtime = avgtime/static_cast<double>(
+                    (std::max)(iterations-1, static_cast<boost::uint64_t>(1)));
+                std::cout
+                  << "Rate (MB/s): " << 1.e-6 * bytes/mintime << ", "
+                  << "Avg time (s): " << avgtime << ", "
+                  << "Min time (s): " << mintime << ", "
+                  << "Max time (s): " << maxtime << "\n";
+
+                if(verbose)
+                    std::cout << "Squared errors: " << errsq << "\n";
+            }
+            else
+            {
+                std::cout
+                  << "ERROR: Aggregate squared error " << errsq
+                  << " exceeds threshold " << epsilon << "\n";
+                hpx::terminate();
+            }
+        }
+    }
+
+    return hpx::finalize();
+}
+
+int main(int argc, char* argv[])
+{
+    using namespace boost::program_options;
+
+    options_description desc_commandline;
+    desc_commandline.add_options()
+        ("matrix_size", value<boost::uint64_t>()->default_value(1024),
+         "Matrix Size")
+        ("iterations", value<boost::uint64_t>()->default_value(10),
+         "# iterations")
+        ("tile_size", value<boost::uint64_t>(),
+         "Tile size (in elements) used to tile each block transpose for "
+         "improved cache and TLB performance; defaults to the matrix size")
+        ("num_blocks", value<boost::uint64_t>()->default_value(1),
+         "Number of blocks per locality into which the matrix columns "
+         "are divided")
+        ( "verbose", "Verbose output")
+    ;
+
+    // Initialize and run HPX, this example requires to run hpx_main on all
+    // localities
+    std::vector<std::string> cfg;
+    cfg.push_back("hpx.run_hpx_main!=1");
+
+    return hpx::init(desc_commandline, argc, argv, cfg);
+}
+
+void transpose(sub_block const A, sub_block B,
+    boost::uint64_t block_order, boost::uint64_t tile_size)
+{
+    if(tile_size < block_order)     // tiled variant, else plain double loop
+    {
+        for(boost::uint64_t i = 0; i < block_order; i += tile_size)
+        {   // '<', not '!=': the stride may step past block_order
+            for(boost::uint64_t j = 0; j < block_order; j += tile_size)
+            {
+                boost::uint64_t max_i = (std::min)(block_order, i + tile_size);
+                boost::uint64_t max_j = (std::min)(block_order, j + tile_size);
+
+                for(boost::uint64_t it = i; it != max_i; ++it)
+                {
+                    for(boost::uint64_t jt = j; jt != max_j; ++jt)
+                    {
+                        B[it + block_order * jt] = A[jt + block_order * it];
+                    }
+                }
+            }
+        }
+    }
+    else
+    {
+        for(boost::uint64_t i = 0; i != block_order; ++i)
+        {
+            for(boost::uint64_t j = 0; j != block_order; ++j)
+            {
+                B[i + block_order * j] = A[j + block_order * i];
+            }
+        }
+    }
+}
+
+double test_results(boost::uint64_t order, boost::uint64_t block_order,
+    std::vector<block> & trans, boost::uint64_t blocks_start,
+    boost::uint64_t blocks_end)
+{
+    using hpx::parallel::transform_reduce;
+    using hpx::parallel::par;
+
+    // Sum the squared error of blocks [blocks_start, blocks_end) of trans.
+    auto range = boost::irange(blocks_start, blocks_end);
+    double errsq =
+        transform_reduce(par, boost::begin(range), boost::end(range),
+            [&](boost::uint64_t b) -> double
+            {
+                sub_block trans_block =
+                    trans[b].get_sub_block(0, order * block_order).get();
+                double errsq = 0.0;
+                for(boost::uint64_t i = 0; i < order; ++i)
+                {
+                    double col_val = COL_SHIFT * i;
+                    for(boost::uint64_t j = 0; j < block_order; ++j)
+                    {
+                        double diff = trans_block[i * block_order + j] -
+                            (col_val + ROW_SHIFT * (b * block_order + j));
+                        errsq += diff * diff;
+                    }
+                }
+                return errsq;
+            },
+            0.0,
+            [](double lhs, double rhs) { return lhs + rhs; }
+        );
+
+    if(verbose)
+        std::cout << " Squared sum of differences: " << errsq << "\n";
+
+    return errsq;
+}
diff --git a/examples/transpose/transpose_smp.cpp b/examples/transpose/transpose_smp.cpp
index 3344a124cd80..367c3fec9071 100644
--- a/examples/transpose/transpose_smp.cpp
+++ b/examples/transpose/transpose_smp.cpp
@@ -86,9 +86,10 @@ int hpx_main(boost::program_options::variables_map& vm)
                     for(boost::uint64_t j = 0; j < order; j += tile_size)
                     {
                         boost::uint64_t i_max = (std::min)(order, i + tile_size);
+                        boost::uint64_t j_max = (std::min)(order, j + tile_size);
+
                         for(boost::uint64_t it = i; it < i_max; ++it)
                         {
-                            boost::uint64_t j_max = (std::min)(order, j + tile_size);
                             for(boost::uint64_t jt = j; jt < j_max; ++jt)
                             {
                                 B[it + order * jt] = A[jt + order * it];
@@ -189,7 +190,8 @@ double test_results(boost::uint64_t order, std::vector<double> const & trans)
                 double errsq = 0.0;
                 for(boost::uint64_t j = 0; j < order; ++j)
                 {
-                    double diff = trans[i * order + j] - (COL_SHIFT*i + ROW_SHIFT * j);
+                    double diff = trans[i * order + j] -
+                        (COL_SHIFT*i + ROW_SHIFT * j);
                     errsq += diff * diff;
                 }
                 return errsq;
diff --git a/examples/transpose/transpose_smp_block.cpp b/examples/transpose/transpose_smp_block.cpp
index 10d3a0d4bce2..bcd6511454c1 100644
--- a/examples/transpose/transpose_smp_block.cpp
+++ b/examples/transpose/transpose_smp_block.cpp
@@ -191,9 +191,10 @@ void transpose(sub_block A, sub_block B, boost::uint64_t block_order,
             for(boost::uint64_t j = 0; j < block_order; j += tile_size)
             {
                 boost::uint64_t i_max = (std::min)(block_order, i + tile_size);
+                boost::uint64_t j_max = (std::min)(block_order, j + tile_size);
+
                 for(boost::uint64_t it = i; it < i_max; ++it)
                 {
-                    boost::uint64_t j_max = (std::min)(block_order, j + tile_size);
                     for(boost::uint64_t jt = j; jt < j_max; ++jt)
                     {
                         B[it + block_order * jt] = A[jt + block_order * it];

From ea083d1f70a0bd14f82d1e92905162e9315e387e Mon Sep 17 00:00:00 2001
From: Hartmut Kaiser <hartmut.kaiser@gmail.com>
Date: Tue, 21 Jul 2015 12:13:22 -0500
Subject: [PATCH 2/4] Fixing transpose_await

---
 examples/transpose/transpose_await.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/transpose/transpose_await.cpp b/examples/transpose/transpose_await.cpp
index 43fe4fd46067..8bd0e75112af 100644
--- a/examples/transpose/transpose_await.cpp
+++ b/examples/transpose/transpose_await.cpp
@@ -177,8 +177,10 @@ double test_results(boost::uint64_t order, boost::uint64_t block_order,
     std::vector<block> & trans, boost::uint64_t blocks_start,
     boost::uint64_t blocks_end);
 
-////////////////////////////////////////////////////////////////////////////////
-hpx::future<void> transpose_phase(
+///////////////////////////////////////////////////////////////////////////////
+// The returned value type has to be the same as the return type used for
+// __await below
+hpx::future<sub_block> transpose_phase(
     std::vector<block> const& A, std::vector<block>& B,
     boost::uint64_t block_order, boost::uint64_t b,
     boost::uint64_t num_blocks, boost::uint64_t num_local_blocks,
@@ -201,6 +203,8 @@ hpx::future<void> transpose_phase(
 
         transpose(__await from, __await to, block_order, tile_size);
     }
+
+    return sub_block();
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -312,7 +316,7 @@ int hpx_main(boost::program_options::variables_map& vm)
                 {
                     transpose_phase(A, B, block_order, b,
                         num_blocks, num_local_blocks, block_size, tile_size
-                    );
+                    ).get();
                 });
 
             double elapsed = t.elapsed();

From d133a61d52cf3fb7e5b2ce53a3f84a32a0d34564 Mon Sep 17 00:00:00 2001
From: Hartmut Kaiser <hartmut.kaiser@gmail.com>
Date: Fri, 24 Jul 2015 12:53:01 -0500
Subject: [PATCH 3/4] get_shared_state() has been moved to namespace
 traits::detail

---
 hpx/lcos/local/promise.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hpx/lcos/local/promise.hpp b/hpx/lcos/local/promise.hpp
index 52a25123f7d7..d8a8d910a2f4 100644
--- a/hpx/lcos/local/promise.hpp
+++ b/hpx/lcos/local/promise.hpp
@@ -505,7 +505,7 @@ namespace hpx { namespace lcos
         std::experimental::coroutine_handle<Promise> rh)
     {
         // f.then([=](future<T> result) mutable
-        lcos::detail::get_shared_state(f)->set_on_completed(rh);
+        traits::detail::get_shared_state(f)->set_on_completed(rh);
     }
 
     template <typename T>

From 831ed59e695d312c1532c9d747cf8d0398ba61bd Mon Sep 17 00:00:00 2001
From: Thomas Heller <thom.heller@gmail.com>
Date: Fri, 24 Jul 2015 21:21:13 +0200
Subject: [PATCH 4/4] Fixing copy&paste error

---
 examples/transpose/transpose_await.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/transpose/transpose_await.cpp b/examples/transpose/transpose_await.cpp
index 8bd0e75112af..424844e0a016 100644
--- a/examples/transpose/transpose_await.cpp
+++ b/examples/transpose/transpose_await.cpp
@@ -262,7 +262,7 @@ int hpx_main(boost::program_options::variables_map& vm)
         if(root)
         {
             std::cout
-                << "Serial Matrix transpose: B = A^T\n"
+                << "Distributed HPX Matrix transpose (await): B = A^T\n"
                 << "Matrix order          = " << order << "\n"
                 << "Matrix local columns  = " << block_order << "\n"
                 << "Number of blocks      = " << num_blocks << "\n"