Fixing CUDA problems #2911

Merged
16 commits merged on Oct 1, 2017
5 changes: 5 additions & 0 deletions CMakeLists.txt
@@ -1095,6 +1095,11 @@ if(HPX_WITH_CUDA)
if(NOT MSVC)
hpx_library_dir(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
#set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG};-D_DEBUG;-O0;-g;-G)
#set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ${CUDA_NVCC_FLAGS_RELWITHDEBINFO};-DNDEBUG;-O3;-g)
#set(CUDA_NVCC_FLAGS_MINSIZEREL ${CUDA_NVCC_FLAGS_MINSIZEREL};-DNDEBUG;-O1)
#set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE};-DNDEBUG;-O3)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-w)
else()
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
hpx_library_dir(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
4 changes: 3 additions & 1 deletion cmake/templates/HPXConfig.cmake.in
@@ -76,9 +76,11 @@ set(HPX_WITH_CUDA_CLANG @HPX_WITH_CUDA_CLANG@)
set(HPX_CUDA_CLANG_FLAGS @HPX_CUDA_CLANG_FLAGS@)

# Set variables used by nvcc
if(HPX_WITH_CUDA AND NOT HPX_WITH_CUDA_CLANG)
if(HPX_WITH_CUDA)
find_package(CUDA REQUIRED)
set(HPX_WITH_CUDA_CLANG @HPX_WITH_CUDA_CLANG@)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS @CUDA_NVCC_FLAGS@)
set(HPX_CUDA_CLANG_FLAGS @HPX_CUDA_CLANG_FLAGS@)
cuda_include_directories("@HPX_CMAKE_CONF_INCLUDE_DIRS@")
endif()
26 changes: 16 additions & 10 deletions hpx/compute/cuda/allocator.hpp
@@ -9,7 +9,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/compute/cuda/detail/launch.hpp>
#include <hpx/compute/cuda/detail/scoped_active_target.hpp>
#include <hpx/compute/cuda/target.hpp>
@@ -38,7 +38,7 @@ namespace hpx { namespace compute { namespace cuda
typedef T value_type;
typedef target_ptr<T> pointer;
typedef target_ptr<T const> const_pointer;
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
typedef T& reference;
typedef T const& const_reference;
#else
@@ -80,7 +80,7 @@ namespace hpx { namespace compute { namespace cuda
// operator&
pointer address(reference x) const noexcept
{
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
return &x;
#else
return pointer(x.device_ptr(), target_);
@@ -89,7 +89,7 @@

const_pointer address(const_reference x) const noexcept
{
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
return &x;
#else
return pointer(x.device_ptr(), target_);
@@ -104,7 +104,7 @@ namespace hpx { namespace compute { namespace cuda
pointer allocate(size_type n,
std::allocator<void>::const_pointer hint = nullptr)
{
#if defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_DEVICE_CODE)
pointer result;
#else
value_type *p = nullptr;
@@ -130,7 +130,7 @@ namespace hpx { namespace compute { namespace cuda
// originally produced p; otherwise, the behavior is undefined.
void deallocate(pointer p, size_type n)
{
#if !defined(__CUDA_ARCH__)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
detail::scoped_active_target active(target_);

cudaError_t error = cudaFree(p.device_ptr());
@@ -168,8 +168,9 @@ namespace hpx { namespace compute { namespace cuda
// Constructs count objects of type T in allocated uninitialized
// storage pointed to by p, using placement-new
template <typename ... Args>
void bulk_construct(pointer p, std::size_t count, Args &&... args)
HPX_HOST_DEVICE void bulk_construct(pointer p, std::size_t count, Args &&... args)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
int threads_per_block = (std::min)(1024, int(count));
int num_blocks =
int((count + threads_per_block - 1) / threads_per_block);
Expand All @@ -186,13 +187,15 @@ namespace hpx { namespace compute { namespace cuda
},
p.device_ptr(), count, std::forward<Args>(args)...);
target_.synchronize();
#endif
}

// Constructs an object of type T in allocated uninitialized storage
// pointed to by p, using placement-new
template <typename ... Args>
void construct(pointer p, Args &&... args)
HPX_HOST_DEVICE void construct(pointer p, Args &&... args)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
detail::launch(
target_, 1, 1,
[] HPX_DEVICE (T* p, Args const&... args)
Expand All @@ -201,11 +204,13 @@ namespace hpx { namespace compute { namespace cuda
},
p.device_ptr(), std::forward<Args>(args)...);
target_.synchronize();
#endif
}

// Calls the destructor of count objects pointed to by p
void bulk_destroy(pointer p, std::size_t count)
HPX_HOST_DEVICE void bulk_destroy(pointer p, std::size_t count)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
int threads_per_block = (std::min)(1024, int(count));
int num_blocks =
int((count + threads_per_block) / threads_per_block) - 1;
Expand All @@ -222,10 +227,11 @@ namespace hpx { namespace compute { namespace cuda
},
p.device_ptr(), count);
target_.synchronize();
#endif
}

// Calls the destructor of the object pointed to by p
void destroy(pointer p)
HPX_HOST_DEVICE void destroy(pointer p)
{
bulk_destroy(p, 1);
}
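
Note: the allocator changes above all follow one pattern. The members gain an HPX_HOST_DEVICE annotation so that any compiler can see their declarations, while the CUDA-specific bodies are guarded by HPX's compute macros instead of the nvcc-only __CUDA_ARCH__ check. A minimal stand-alone sketch of that pattern, assuming only the HPX macros that appear in the diff (the fill_sketch function itself is hypothetical):

#include <hpx/config.hpp>
#include <cstddef>

// Sketch of the host/device guard used throughout allocator.hpp above.
// HPX_COMPUTE_DEVICE_CODE is defined while the device pass compiles this
// translation unit, HPX_COMPUTE_HOST_CODE while a CUDA-aware host pass does;
// a plain host-only compiler defines neither and sees an empty body.
template <typename T>
HPX_HOST_DEVICE void fill_sketch(T* p, std::size_t count, T const& value)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
    for (std::size_t i = 0; i != count; ++i)
        p[i] = value;    // the real allocator launches a kernel here instead
#else
    (void) p; (void) count; (void) value;    // no CUDA toolchain: do nothing
#endif
}
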
2 changes: 1 addition & 1 deletion hpx/compute/cuda/concurrent_executor.hpp
@@ -8,7 +8,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/traits/executor_traits.hpp>

#include <hpx/compute/cuda/concurrent_executor_parameters.hpp>
2 changes: 1 addition & 1 deletion hpx/compute/cuda/concurrent_executor_parameters.hpp
@@ -8,7 +8,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)// && defined(__CUDACC__)
#include <hpx/traits/is_executor_parameters.hpp>

#include <cstddef>
15 changes: 14 additions & 1 deletion hpx/compute/cuda/default_executor.hpp
@@ -8,7 +8,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/lcos/future.hpp>
#include <hpx/traits/executor_traits.hpp>
#include <hpx/traits/is_executor.hpp>
@@ -17,6 +17,7 @@
#include <hpx/util/decay.hpp>
#include <hpx/util/range.hpp>
#include <hpx/util/tuple.hpp>
#include <hpx/throw_exception.hpp>

#include <hpx/parallel/executors/execution.hpp>

@@ -45,6 +46,7 @@ namespace hpx { namespace compute { namespace cuda
static void call(cuda::target const& target, F && f,
Shape const& shape, Ts &&... ts)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
std::size_t count = util::size(shape);

int threads_per_block =
@@ -73,6 +75,11 @@ namespace hpx { namespace compute { namespace cuda
},
std::forward<F>(f), shape_container.data(), count,
std::forward<Ts>(ts)...);
#else
HPX_THROW_EXCEPTION(hpx::not_implemented,
"hpx::compute::cuda::detail::bulk_launch_helper",
"Trying to launch a CUDA kernel, but did not compile in CUDA mode");
#endif
}
};

@@ -89,6 +96,7 @@ namespace hpx { namespace compute { namespace cuda
static void call(cuda::target const& target, F && f,
Shape const& shape, Ts &&... ts)
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
typedef typename hpx::traits::range_traits<Shape>::value_type
value_type;

@@ -119,6 +127,11 @@ namespace hpx { namespace compute { namespace cuda
std::forward<F>(f), std::forward<Ts>(ts)...
);
}
#else
HPX_THROW_EXCEPTION(hpx::not_implemented,
"hpx::compute::cuda::detail::bulk_launch_helper",
"Trying to launch a CUDA kernel, but did not compile in CUDA mode");
#endif
}
};
}
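
Note: with these guards a translation unit that is not compiled by a CUDA compiler can still instantiate the executor; attempting to launch a kernel from it then fails at run time with a descriptive error instead of failing to compile. A small sketch of that fallback shape, using the HPX_THROW_EXCEPTION call and hpx::not_implemented error code shown in the diff (the function name is hypothetical):

#include <hpx/config.hpp>
#include <hpx/throw_exception.hpp>

void launch_or_fail_sketch()
{
#if defined(HPX_COMPUTE_DEVICE_CODE) || defined(HPX_COMPUTE_HOST_CODE)
    // compiled with CUDA support: set up grid/block sizes and call
    // detail::launch(...) as in bulk_launch_helper above
#else
    HPX_THROW_EXCEPTION(hpx::not_implemented,
        "launch_or_fail_sketch",
        "Trying to launch a CUDA kernel, but did not compile in CUDA mode");
#endif
}
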
2 changes: 1 addition & 1 deletion hpx/compute/cuda/default_executor_parameters.hpp
@@ -9,7 +9,7 @@

#include <hpx/config.hpp>

#if defined(HPX_HAVE_CUDA) && defined(__CUDACC__)
#if defined(HPX_HAVE_CUDA)
#include <hpx/traits/is_executor_parameters.hpp>

#include <cstddef>
7 changes: 5 additions & 2 deletions hpx/compute/cuda/detail/launch.hpp
@@ -63,7 +63,7 @@ namespace hpx { namespace compute { namespace cuda { namespace detail

closure& operator=(closure const&) = delete;

HPX_DEVICE void operator()()
HPX_DEVICE HPX_FORCEINLINE void operator()()
{
// FIXME: is it possible to move the arguments?
hpx::util::invoke_fused_r<void>(f_, args_);
@@ -97,7 +97,7 @@ namespace hpx { namespace compute { namespace cuda { namespace detail
static_assert(sizeof(Closure) < 256,
"We currently require the closure to be less than 256 bytes");

#if !defined(__CUDA_ARCH__)
#if defined(HPX_COMPUTE_HOST_CODE)
detail::scoped_active_target active(tgt);

launch_function<<<gridDim, blockDim, 0, active.stream()>>>(
@@ -115,6 +115,9 @@
void *param_buffer = cudaGetParameterBuffer(
std::alignment_of<Closure>::value, sizeof(Closure));
std::memcpy(param_buffer, &c, sizeof(Closure));
// cudaLaunchKernel(reinterpret_cast<void*>(launcher),
// dim3(gridDim), dim3(blockDim), param_buffer, 0,
// tgt.native_handle().get_stream());
cudaLaunchDevice(reinterpret_cast<void*>(launcher), param_buffer,
dim3(gridDim), dim3(blockDim), 0, tgt.native_handle().get_stream());
#endif
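
Note: launch.hpp now selects between two launch paths with HPX_COMPUTE_HOST_CODE. Host code uses the ordinary triple-chevron launch on the target's stream, while device code relaunches through CUDA dynamic parallelism, marshalling the closure into a parameter buffer for cudaLaunchDevice. Below is a compact sketch of the device-side path, assuming a trivially copyable Closure and a __global__ trampoline launch_function<Closure> like the one in this file; it needs relocatable device code, which matches the CUDA_SEPARABLE_COMPILATION ON setting exported above:

#include <cstring>
#include <type_traits>

// Sketch of a device-side kernel launch via CUDA dynamic parallelism,
// mirroring the #else branch of detail::launch above.
template <typename Closure>
__device__ void relaunch_sketch(Closure const& c, dim3 grid, dim3 block,
    cudaStream_t stream)
{
    void (*launcher)(Closure) = &launch_function<Closure>;

    // Copy the closure byte-wise into the device runtime's parameter buffer.
    void* buf = cudaGetParameterBuffer(
        std::alignment_of<Closure>::value, sizeof(Closure));
    std::memcpy(buf, &c, sizeof(Closure));

    // Hand the trampoline and its argument buffer to the device runtime.
    cudaLaunchDevice(reinterpret_cast<void*>(launcher), buf,
        grid, block, 0 /*sharedMem*/, stream);
}
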
31 changes: 24 additions & 7 deletions hpx/compute/cuda/target.hpp
@@ -92,34 +92,49 @@ namespace hpx { namespace compute { namespace cuda

// Constructs default target
HPX_HOST_DEVICE target()
: handle_(), locality_(hpx::find_here())
: handle_()
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(hpx::find_here())
#endif
{}

// Constructs target from a given device ID
explicit HPX_HOST_DEVICE target(int device)
: handle_(device), locality_(hpx::find_here())
: handle_(device)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(hpx::find_here())
#endif
{}

HPX_HOST_DEVICE target(hpx::id_type const& locality, int device)
: handle_(device), locality_(locality)
: handle_(device)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(locality)
#endif
{}

HPX_HOST_DEVICE target(target const& rhs) noexcept
: handle_(rhs.handle_),
locality_(rhs.locality_)
: handle_(rhs.handle_)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(rhs.locality_)
#endif
{}

HPX_HOST_DEVICE target(target && rhs) noexcept
: handle_(std::move(rhs.handle_)),
locality_(std::move(rhs.locality_))
: handle_(std::move(rhs.handle_))
#if !defined(HPX_COMPUTE_DEVICE_CODE)
, locality_(std::move(rhs.locality_))
#endif
{}

HPX_HOST_DEVICE target& operator=(target const& rhs) noexcept
{
if (&rhs != this)
{
handle_ = rhs.handle_;
#if !defined(HPX_COMPUTE_DEVICE_CODE)
locality_ = rhs.locality_;
#endif
}
return *this;
}
@@ -129,7 +144,9 @@ namespace hpx { namespace compute { namespace cuda
if (&rhs != this)
{
handle_ = std::move(rhs.handle_);
#if !defined(HPX_COMPUTE_DEVICE_CODE)
locality_ = std::move(rhs.locality_);
#endif
}
return *this;
}
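
Note: the target.hpp changes above all guard the locality_ member. hpx::id_type and hpx::find_here() are host-only facilities, so the HPX_HOST_DEVICE constructors and assignment operators may only touch that member when compiling host code. A minimal sketch of the same idea, with a hypothetical std::string member standing in for hpx::id_type:

#include <hpx/config.hpp>
#include <string>

// Sketch of an HPX_HOST_DEVICE type with a host-only data member, mirroring
// compute::cuda::target above: the member and every access to it are removed
// from the device compilation pass.
struct device_friendly_sketch
{
    HPX_HOST_DEVICE device_friendly_sketch()
      : device_(0)
#if !defined(HPX_COMPUTE_DEVICE_CODE)
      , host_only_("constructed with host-only facilities")
#endif
    {}

    int device_;                 // usable from host and device code
#if !defined(HPX_COMPUTE_DEVICE_CODE)
    std::string host_only_;      // host-only type, hidden from the device pass
#endif
};
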