Skip to content

Commit

Permalink
Merge pull request #2911 from STEllAR-GROUP/cuda_clang
Browse files Browse the repository at this point in the history
Fixing CUDA problems
  • Loading branch information
hkaiser committed Oct 1, 2017
2 parents a8cd28f + 612e3f8 commit c9f2c09
Show file tree
Hide file tree
Showing 17 changed files with 261 additions and 156 deletions.
5 changes: 5 additions & 0 deletions CMakeLists.txt
Expand Up @@ -1099,6 +1099,11 @@ if(HPX_WITH_CUDA)
if(NOT MSVC)
hpx_library_dir(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
#set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG};-D_DEBUG;-O0;-g;-G)
#set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ${CUDA_NVCC_FLAGS_RELWITHDEBINFO};-DNDEBUG;-O3;-g)
#set(CUDA_NVCC_FLAGS_MINSIZEREL ${CUDA_NVCC_FLAGS_MINSIZEREL};-DNDEBUG;-O1)
#set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE};-DNDEBUG;-O3)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-w)
else()
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
hpx_library_dir(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
Expand Down
4 changes: 3 additions & 1 deletion cmake/templates/HPXConfig.cmake.in
Expand Up @@ -76,9 +76,11 @@ set(HPX_WITH_CUDA_CLANG @HPX_WITH_CUDA_CLANG@)
set(HPX_CUDA_CLANG_FLAGS @HPX_CUDA_CLANG_FLAGS@)

# Set variables used by nvcc
if(HPX_WITH_CUDA AND NOT HPX_WITH_CUDA_CLANG)
if(HPX_WITH_CUDA)
find_package(CUDA REQUIRED)
set(HPX_WITH_CUDA_CLANG @HPX_WITH_CUDA_CLANG@)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS @CUDA_NVCC_FLAGS@)
set(HPX_CUDA_CLANG_FLAGS @HPX_CUDA_CLANG_FLAGS@)
cuda_include_directories("@HPX_CMAKE_CONF_INCLUDE_DIRS@")
endif()
26 changes: 16 additions & 10 deletions hpx/compute/cuda/allocator.hpp
Expand Up @@ -9,7 +9,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/compute/cuda/detail/launch.hpp>
#include <hpx/compute/cuda/detail/scoped_active_target.hpp>
#include <hpx/compute/cuda/target.hpp>
Expand Down Expand Up @@ -38,7 +38,7 @@ namespace hpx { namespace compute { namespace cuda
typedef T value_type;
typedef target_ptr<T> pointer;
typedef target_ptr<T const> const_pointer;
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
typedef T& reference;
typedef T const& const_reference;
#else
Expand Down Expand Up @@ -80,7 +80,7 @@ namespace hpx { namespace compute { namespace cuda
// operator&
pointer address(reference x) const noexcept
{
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
return &x;
#else
return pointer(x.device_ptr(), target_);
Expand All @@ -89,7 +89,7 @@ namespace hpx { namespace compute { namespace cuda

const_pointer address(const_reference x) const noexcept
{
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
return &x;
#else
return pointer(x.device_ptr(), target_);
Expand All @@ -104,7 +104,7 @@ namespace hpx { namespace compute { namespace cuda
pointer allocate(size_type n,
std::allocator<void>::const_pointer hint = nullptr)
{
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
pointer result;
#else
value_type *p = nullptr;
Expand All @@ -130,7 +130,7 @@ namespace hpx { namespace compute { namespace cuda
// originally produced p; otherwise, the behavior is undefined.
void deallocate(pointer p, size_type n)
{
#if !defined(__CUDA_ARCH__)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
detail::scoped_active_target active(target_);

cudaError_t error = cudaFree(p.device_ptr());
Expand Down Expand Up @@ -168,8 +168,9 @@ namespace hpx { namespace compute { namespace cuda
// Constructs count objects of type T in allocated uninitialized
// storage pointed to by p, using placement-new
template <typename ... Args>
void bulk_construct(pointer p, std::size_t count, Args &&... args)
HPX_HOST_DEVICE void bulk_construct(pointer p, std::size_t count, Args &&... args)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
int threads_per_block = (std::min)(1024, int(count));
int num_blocks =
int((count + threads_per_block - 1) / threads_per_block);
Expand All @@ -186,13 +187,15 @@ namespace hpx { namespace compute { namespace cuda
},
p.device_ptr(), count, std::forward<Args>(args)...);
target_.synchronize();
#endif
}

// Constructs an object of type T in allocated uninitialized storage
// pointed to by p, using placement-new
template <typename ... Args>
void construct(pointer p, Args &&... args)
HPX_HOST_DEVICE void construct(pointer p, Args &&... args)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
detail::launch(
target_, 1, 1,
[] HPX_DEVICE (T* p, Args const&... args)
Expand All @@ -201,11 +204,13 @@ namespace hpx { namespace compute { namespace cuda
},
p.device_ptr(), std::forward<Args>(args)...);
target_.synchronize();
#endif
}

// Calls the destructor of count objects pointed to by p
void bulk_destroy(pointer p, std::size_t count)
HPX_HOST_DEVICE void bulk_destroy(pointer p, std::size_t count)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
int threads_per_block = (std::min)(1024, int(count));
int num_blocks =
int((count + threads_per_block) / threads_per_block) - 1;
Expand All @@ -222,10 +227,11 @@ namespace hpx { namespace compute { namespace cuda
},
p.device_ptr(), count);
target_.synchronize();
#endif
}

// Calls the destructor of the object pointed to by p
void destroy(pointer p)
HPX_HOST_DEVICE void destroy(pointer p)
{
bulk_destroy(p, 1);
}
Expand Down
2 changes: 1 addition & 1 deletion hpx/compute/cuda/concurrent_executor.hpp
Expand Up @@ -8,7 +8,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/traits/executor_traits.hpp>

#include <hpx/compute/cuda/concurrent_executor_parameters.hpp>
Expand Down
2 changes: 1 addition & 1 deletion hpx/compute/cuda/concurrent_executor_parameters.hpp
Expand Up @@ -8,7 +8,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)// && defined(__CUDACC__)
#include <hpx/traits/is_executor_parameters.hpp>

#include <cstddef>
Expand Down
15 changes: 14 additions & 1 deletion hpx/compute/cuda/default_executor.hpp
Expand Up @@ -8,7 +8,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/lcos/future.hpp>
#include <hpx/traits/executor_traits.hpp>
#include <hpx/traits/is_executor.hpp>
Expand All @@ -17,6 +17,7 @@
#include <hpx/util/decay.hpp>
#include <hpx/util/range.hpp>
#include <hpx/util/tuple.hpp>
#include <hpx/throw_exception.hpp>

#include <hpx/parallel/executors/execution.hpp>

Expand Down Expand Up @@ -45,6 +46,7 @@ namespace hpx { namespace compute { namespace cuda
static void call(cuda::target const& target, F && f,
Shape const& shape, Ts &&... ts)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
std::size_t count = util::size(shape);

int threads_per_block =
Expand Down Expand Up @@ -73,6 +75,11 @@ namespace hpx { namespace compute { namespace cuda
},
std::forward<F>(f), shape_container.data(), count,
std::forward<Ts>(ts)...);
#else
HPX_THROW_EXCEPTION(hpx::not_implemented,
"hpx::compute::cuda::detail::bulk_launch_helper",
"Trying to launch a CUDA kernel, but did not compile in CUDA mode");
#endif
}
};

Expand All @@ -89,6 +96,7 @@ namespace hpx { namespace compute { namespace cuda
static void call(cuda::target const& target, F && f,
Shape const& shape, Ts &&... ts)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
typedef typename hpx::traits::range_traits<Shape>::value_type
value_type;

Expand Down Expand Up @@ -119,6 +127,11 @@ namespace hpx { namespace compute { namespace cuda
std::forward<F>(f), std::forward<Ts>(ts)...
);
}
#else
HPX_THROW_EXCEPTION(hpx::not_implemented,
"hpx::compute::cuda::detail::bulk_launch_helper",
"Trying to launch a CUDA kernel, but did not compile in CUDA mode");
#endif
}
};
}
Expand Down
2 changes: 1 addition & 1 deletion hpx/compute/cuda/default_executor_parameters.hpp
Expand Up @@ -9,7 +9,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/traits/is_executor_parameters.hpp>

#include <cstddef>
Expand Down
7 changes: 5 additions & 2 deletions hpx/compute/cuda/detail/launch.hpp
Expand Up @@ -63,7 +63,7 @@ namespace hpx { namespace compute { namespace cuda { namespace detail

closure& operator=(closure const&) = delete;

HPX_DEVICE void operator()()
HPX_DEVICE HPX_FORCEINLINE void operator()()
{
// FIXME: is it possible to move the arguments?
hpx::util::invoke_fused_r<void>(f_, args_);
Expand Down Expand Up @@ -97,7 +97,7 @@ namespace hpx { namespace compute { namespace cuda { namespace detail
static_assert(sizeof(Closure) < 256,
"We currently require the closure to be less than 256 bytes");

#if !defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_HOST_CODE)
detail::scoped_active_target active(tgt);

launch_function<<<gridDim, blockDim, 0, active.stream()>>>(
Expand All @@ -115,6 +115,9 @@ namespace hpx { namespace compute { namespace cuda { namespace detail
void *param_buffer = cudaGetParameterBuffer(
std::alignment_of<Closure>::value, sizeof(Closure));
std::memcpy(param_buffer, &c, sizeof(Closure));
// cudaLaunchKernel(reinterpret_cast<void*>(launcher),
// dim3(gridDim), dim3(blockDim), param_buffer, 0,
// tgt.native_handle().get_stream());
cudaLaunchDevice(reinterpret_cast<void*>(launcher), param_buffer,
dim3(gridDim), dim3(blockDim), 0, tgt.native_handle().get_stream());
#endif
Expand Down
31 changes: 24 additions & 7 deletions hpx/compute/cuda/target.hpp
Expand Up @@ -92,34 +92,49 @@ namespace hpx { namespace compute { namespace cuda

// Constructs default target
HPX_HOST_DEVICE target()
: handle_(), locality_(hpx::find_here())
: handle_()
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(hpx::find_here())
#endif
{}

// Constructs target from a given device ID
explicit HPX_HOST_DEVICE target(int device)
: handle_(device), locality_(hpx::find_here())
: handle_(device)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(hpx::find_here())
#endif
{}

HPX_HOST_DEVICE target(hpx::id_type const& locality, int device)
: handle_(device), locality_(locality)
: handle_(device)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(locality)
#endif
{}

HPX_HOST_DEVICE target(target const& rhs) noexcept
: handle_(rhs.handle_),
locality_(rhs.locality_)
: handle_(rhs.handle_)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(rhs.locality_)
#endif
{}

HPX_HOST_DEVICE target(target && rhs) noexcept
: handle_(std::move(rhs.handle_)),
locality_(std::move(rhs.locality_))
: handle_(std::move(rhs.handle_))
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(std::move(rhs.locality_))
#endif
{}

HPX_HOST_DEVICE target& operator=(target const& rhs) noexcept
{
if (&rhs != this)
{
handle_ = rhs.handle_;
#if !defined(HPX_COMPUTE_DEVICE_CODE)
locality_ = rhs.locality_;
#endif
}
return *this;
}
Expand All @@ -129,7 +144,9 @@ namespace hpx { namespace compute { namespace cuda
if (&rhs != this)
{
handle_ = std::move(rhs.handle_);
#if !defined(HPX_COMPUTE_DEVICE_CODE)
locality_ = std::move(rhs.locality_);
#endif
}
return *this;
}
Expand Down

0 comments on commit c9f2c09

Please sign in to comment.