Fixing CUDA problems #2911

Merged
16 commits merged on Oct 1, 2017
5 changes: 5 additions & 0 deletions CMakeLists.txt
@@ -1095,6 +1095,11 @@ if(HPX_WITH_CUDA)
if(NOT MSVC)
hpx_library_dir(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
#set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG};-D_DEBUG;-O0;-g;-G)
#set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ${CUDA_NVCC_FLAGS_RELWITHDEBINFO};-DNDEBUG;-O3;-g)
#set(CUDA_NVCC_FLAGS_MINSIZEREL ${CUDA_NVCC_FLAGS_MINSIZEREL};-DNDEBUG;-O1)
#set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE};-DNDEBUG;-O3)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-w)
else()
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
hpx_library_dir(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
4 changes: 3 additions & 1 deletion cmake/templates/HPXConfig.cmake.in
@@ -76,9 +76,11 @@ set(HPX_WITH_CUDA_CLANG @HPX_WITH_CUDA_CLANG@)
set(HPX_CUDA_CLANG_FLAGS @HPX_CUDA_CLANG_FLAGS@)

# Set variables used by nvcc
if(HPX_WITH_CUDA AND NOT HPX_WITH_CUDA_CLANG)
if(HPX_WITH_CUDA)
find_package(CUDA REQUIRED)
set(HPX_WITH_CUDA_CLANG @HPX_WITH_CUDA_CLANG@)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS @CUDA_NVCC_FLAGS@)
set(HPX_CUDA_CLANG_FLAGS @HPX_CUDA_CLANG_FLAGS@)
cuda_include_directories("@HPX_CMAKE_CONF_INCLUDE_DIRS@")
endif()
26 changes: 16 additions & 10 deletions hpx/compute/cuda/allocator.hpp
@@ -9,7 +9,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/compute/cuda/detail/launch.hpp>
#include <hpx/compute/cuda/detail/scoped_active_target.hpp>
#include <hpx/compute/cuda/target.hpp>
@@ -38,7 +38,7 @@ namespace hpx { namespace compute { namespace cuda
typedef T value_type;
typedef target_ptr<T> pointer;
typedef target_ptr<T const> const_pointer;
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
typedef T& reference;
typedef T const& const_reference;
#else
@@ -80,7 +80,7 @@ namespace hpx { namespace compute { namespace cuda
// operator&
pointer address(reference x) const noexcept
{
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
return &x;
#else
return pointer(x.device_ptr(), target_);
@@ -89,7 +89,7 @@

const_pointer address(const_reference x) const noexcept
{
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
return &x;
#else
return pointer(x.device_ptr(), target_);
@@ -104,7 +104,7 @@ namespace hpx { namespace compute { namespace cuda
pointer allocate(size_type n,
std::allocator<void>::const_pointer hint = nullptr)
{
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
pointer result;
#else
value_type *p = nullptr;
@@ -130,7 +130,7 @@ namespace hpx { namespace compute { namespace cuda
// originally produced p; otherwise, the behavior is undefined.
void deallocate(pointer p, size_type n)
{
#if !defined(__CUDA_ARCH__)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
detail::scoped_active_target active(target_);

cudaError_t error = cudaFree(p.device_ptr());
@@ -168,8 +168,9 @@ namespace hpx { namespace compute { namespace cuda
// Constructs count objects of type T in allocated uninitialized
// storage pointed to by p, using placement-new
template <typename ... Args>
void bulk_construct(pointer p, std::size_t count, Args &&... args)
HPX_HOST_DEVICE void bulk_construct(pointer p, std::size_t count, Args &&... args)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
int threads_per_block = (std::min)(1024, int(count));
int num_blocks =
int((count + threads_per_block - 1) / threads_per_block);
Expand All @@ -186,13 +187,15 @@ namespace hpx { namespace compute { namespace cuda
},
p.device_ptr(), count, std::forward<Args>(args)...);
target_.synchronize();
#endif
}

// Constructs an object of type T in allocated uninitialized storage
// pointed to by p, using placement-new
template <typename ... Args>
void construct(pointer p, Args &&... args)
HPX_HOST_DEVICE void construct(pointer p, Args &&... args)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
detail::launch(
target_, 1, 1,
[] HPX_DEVICE (T* p, Args const&... args)
Expand All @@ -201,11 +204,13 @@ namespace hpx { namespace compute { namespace cuda
},
p.device_ptr(), std::forward<Args>(args)...);
target_.synchronize();
#endif
}

// Calls the destructor of count objects pointed to by p
void bulk_destroy(pointer p, std::size_t count)
HPX_HOST_DEVICE void bulk_destroy(pointer p, std::size_t count)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
int threads_per_block = (std::min)(1024, int(count));
int num_blocks =
int((count + threads_per_block) / threads_per_block) - 1;
Expand All @@ -222,10 +227,11 @@ namespace hpx { namespace compute { namespace cuda
},
p.device_ptr(), count);
target_.synchronize();
#endif
}

// Calls the destructor of the object pointed to by p
void destroy(pointer p)
HPX_HOST_DEVICE void destroy(pointer p)
{
bulk_destroy(p, 1);
}
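
Note: the allocator changes above all follow one pattern. The members gain an HPX_HOST_DEVICE annotation so that any compiler can see their declarations, while the CUDA-specific bodies are guarded by HPX's compute macros instead of the nvcc-only __CUDA_ARCH__ check. A minimal stand-alone sketch of that pattern, assuming only the HPX macros that appear in the diff (the fill_sketch function itself is hypothetical):

#include <hpx/config.hpp>
#include <cstddef>

// Sketch of the host/device guard used throughout allocator.hpp above.
// HPX_COMPUTE_DEVICE_CODE is defined while the device pass compiles this
// translation unit, HPX_COMPUTE_HOST_CODE while a CUDA-aware host pass does;
// a plain host-only compiler defines neither and sees an empty body.
template <typename T>
HPX_HOST_DEVICE void fill_sketch(T* p, std::size_t count, T const& value)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
    for (std::size_t i = 0; i != count; ++i)
        p[i] = value;    // the real allocator launches a kernel here instead
#else
    (void) p; (void) count; (void) value;    // no CUDA toolchain: do nothing
#endif
}
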
2 changes: 1 addition & 1 deletion hpx/compute/cuda/concurrent_executor.hpp
@@ -8,7 +8,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/traits/executor_traits.hpp>

#include <hpx/compute/cuda/concurrent_executor_parameters.hpp>
2 changes: 1 addition & 1 deletion hpx/compute/cuda/concurrent_executor_parameters.hpp
@@ -8,7 +8,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)// && defined(__CUDACC__)
#include <hpx/traits/is_executor_parameters.hpp>

#include <cstddef>
15 changes: 14 additions & 1 deletion hpx/compute/cuda/default_executor.hpp
@@ -8,7 +8,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/lcos/future.hpp>
#include <hpx/traits/executor_traits.hpp>
#include <hpx/traits/is_executor.hpp>
@@ -17,6 +17,7 @@
#include <hpx/util/decay.hpp>
#include <hpx/util/range.hpp>
#include <hpx/util/tuple.hpp>
#include <hpx/throw_exception.hpp>

#include <hpx/parallel/executors/execution.hpp>

@@ -45,6 +46,7 @@ namespace hpx { namespace compute { namespace cuda
static void call(cuda::target const& target, F && f,
Shape const& shape, Ts &&... ts)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
std::size_t count = util::size(shape);

int threads_per_block =
@@ -73,6 +75,11 @@ namespace hpx { namespace compute { namespace cuda
},
std::forward<F>(f), shape_container.data(), count,
std::forward<Ts>(ts)...);
#else
HPX_THROW_EXCEPTION(hpx::not_implemented,
"hpx::compute::cuda::detail::bulk_launch_helper",
"Trying to launch a CUDA kernel, but did not compile in CUDA mode");
#endif
}
};

@@ -89,6 +96,7 @@ namespace hpx { namespace compute { namespace cuda
static void call(cuda::target const& target, F && f,
Shape const& shape, Ts &&... ts)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
typedef typename hpx::traits::range_traits<Shape>::value_type
value_type;

@@ -119,6 +127,11 @@ namespace hpx { namespace compute { namespace cuda
std::forward<F>(f), std::forward<Ts>(ts)...
);
}
#else
HPX_THROW_EXCEPTION(hpx::not_implemented,
"hpx::compute::cuda::detail::bulk_launch_helper",
"Trying to launch a CUDA kernel, but did not compile in CUDA mode");
#endif
}
};
}
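
Note: with these guards a translation unit that is not compiled by a CUDA compiler can still instantiate the executor; attempting to launch a kernel from it then fails at run time with a descriptive error instead of failing to compile. A small sketch of that fallback shape, using the HPX_THROW_EXCEPTION call and hpx::not_implemented error code shown in the diff (the function name is hypothetical):

#include <hpx/config.hpp>
#include <hpx/throw_exception.hpp>

void launch_or_fail_sketch()
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
    // compiled with CUDA support: set up grid/block sizes and call
    // detail::launch(...) as in bulk_launch_helper above
#else
    HPX_THROW_EXCEPTION(hpx::not_implemented,
        "launch_or_fail_sketch",
        "Trying to launch a CUDA kernel, but did not compile in CUDA mode");
#endif
}
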
2 changes: 1 addition & 1 deletion hpx/compute/cuda/default_executor_parameters.hpp
@@ -9,7 +9,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/traits/is_executor_parameters.hpp>

#include <cstddef>
7 changes: 5 additions & 2 deletions hpx/compute/cuda/detail/launch.hpp
@@ -63,7 +63,7 @@ namespace hpx { namespace compute { namespace cuda { namespace detail

closure& operator=(closure const&) = delete;

HPX_DEVICE void operator()()
HPX_DEVICE HPX_FORCEINLINE void operator()()
{
// FIXME: is it possible to move the arguments?
hpx::util::invoke_fused_r<void>(f_, args_);
@@ -97,7 +97,7 @@ namespace hpx { namespace compute { namespace cuda { namespace detail
static_assert(sizeof(Closure) < 256,
"We currently require the closure to be less than 256 bytes");

#if !defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_HOST_CODE)
detail::scoped_active_target active(tgt);

launch_function<<<gridDim, blockDim, 0, active.stream()>>>(
@@ -115,6 +115,9 @@
void *param_buffer = cudaGetParameterBuffer(
std::alignment_of<Closure>::value, sizeof(Closure));
std::memcpy(param_buffer, &c, sizeof(Closure));
// cudaLaunchKernel(reinterpret_cast<void*>(launcher),
// dim3(gridDim), dim3(blockDim), param_buffer, 0,
// tgt.native_handle().get_stream());
cudaLaunchDevice(reinterpret_cast<void*>(launcher), param_buffer,
dim3(gridDim), dim3(blockDim), 0, tgt.native_handle().get_stream());
#endif
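
Note: launch.hpp now selects between two launch paths with HPX_COMPUTE_HOST_CODE. Host code uses the ordinary triple-chevron launch on the target's stream, while device code relaunches through CUDA dynamic parallelism, marshalling the closure into a parameter buffer for cudaLaunchDevice. Below is a compact sketch of the device-side path, assuming a trivially copyable Closure and a __global__ trampoline launch_function<Closure> like the one in this file; it needs relocatable device code, which matches the CUDA_SEPARABLE_COMPILATION ON setting exported above:

#include <cstring>
#include <type_traits>

// Sketch of a device-side kernel launch via CUDA dynamic parallelism,
// mirroring the #else branch of detail::launch above.
template <typename Closure>
__device__ void relaunch_sketch(Closure const& c, dim3 grid, dim3 block,
    cudaStream_t stream)
{
    void (*launcher)(Closure) = &launch_function<Closure>;

    // Copy the closure byte-wise into the device runtime's parameter buffer.
    void* buf = cudaGetParameterBuffer(
        std::alignment_of<Closure>::value, sizeof(Closure));
    std::memcpy(buf, &c, sizeof(Closure));

    // Hand the trampoline and its argument buffer to the device runtime.
    cudaLaunchDevice(reinterpret_cast<void*>(launcher), buf,
        grid, block, 0 /*sharedMem*/, stream);
}
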
31 changes: 24 additions & 7 deletions hpx/compute/cuda/target.hpp
@@ -92,34 +92,49 @@ namespace hpx { namespace compute { namespace cuda

// Constructs default target
HPX_HOST_DEVICE target()
: handle_(), locality_(hpx::find_here())
: handle_()
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(hpx::find_here())
#endif
{}

// Constructs target from a given device ID
explicit HPX_HOST_DEVICE target(int device)
: handle_(device), locality_(hpx::find_here())
: handle_(device)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(hpx::find_here())
#endif
{}

HPX_HOST_DEVICE target(hpx::id_type const& locality, int device)
: handle_(device), locality_(locality)
: handle_(device)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(locality)
#endif
{}

HPX_HOST_DEVICE target(target const& rhs) noexcept
: handle_(rhs.handle_),
locality_(rhs.locality_)
: handle_(rhs.handle_)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(rhs.locality_)
#endif
{}

HPX_HOST_DEVICE target(target && rhs) noexcept
: handle_(std::move(rhs.handle_)),
locality_(std::move(rhs.locality_))
: handle_(std::move(rhs.handle_))
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(std::move(rhs.locality_))
#endif
{}

HPX_HOST_DEVICE target& operator=(target const& rhs) noexcept
{
if (&rhs != this)
{
handle_ = rhs.handle_;
#if !defined(HPX_COMPUTE_DEVICE_CODE)
locality_ = rhs.locality_;
#endif
}
return *this;
}
@@ -129,7 +144,9 @@ namespace hpx { namespace compute { namespace cuda
if (&rhs != this)
{
handle_ = std::move(rhs.handle_);
#if !defined(HPX_COMPUTE_DEVICE_CODE)
locality_ = std::move(rhs.locality_);
#endif
}
return *this;
}
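
Note: the target.hpp changes above all guard the locality_ member. hpx::id_type and hpx::find_here() are host-only facilities, so the HPX_HOST_DEVICE constructors and assignment operators may only touch that member when compiling host code. A minimal sketch of the same idea, with a hypothetical std::string member standing in for hpx::id_type:

#include <hpx/config.hpp>
#include <string>

// Sketch of an HPX_HOST_DEVICE type with a host-only data member, mirroring
// compute::cuda::target above: the member and every access to it are removed
// from the device compilation pass.
struct device_friendly_sketch
{
    HPX_HOST_DEVICE device_friendly_sketch()
      : device_(0)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
      , host_only_("constructed with host-only facilities")
#endif
    {}

    int device_;                 // usable from host and device code
#if !defined(HPX_COMPUTE_DEVICE_CODE)
    std::string host_only_;      // host-only type, hidden from the device pass
#endif
};
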