Futurize the cublas demo and use cudaMalloc/cudaMemcpy instead of allocator
biddisco committed Jan 21, 2017
1 parent dd47ce7 commit 1225b7f
Showing 4 changed files with 158 additions and 90 deletions.
2 changes: 1 addition & 1 deletion examples/compute/cuda/CMakeLists.txt
@@ -14,7 +14,7 @@ if(HPX_WITH_CUDA)
partitioned_vector
)
set(cublas_matmul_FLAGS DEPENDENCIES cublas)
set_source_files_properties(cublas_matmul.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
# set_source_files_properties(cublas_matmul.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
set(data_copy_CUDA On)
set(hello_compute_CUDA On)
set(partitioned_vector_CUDA ON)
232 changes: 150 additions & 82 deletions examples/compute/cuda/cublas_matmul.cpp
@@ -16,6 +16,16 @@
// NB. The hpx::threads param only controls how many parallel tasks to use for the CPU
// comparison/checks and makes no difference to the GPU execution.
//
// Note: The hpx::compute::cuda::allocator makes use of device code; if it is
// used, this example must be compiled with nvcc instead of c++, which requires
// the following cmake setting
//     set_source_files_properties(cublas_matmul.cpp
//         PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
// Currently, nvcc does not handle lambda functions properly and it is simpler
// to use cudaMalloc/cudaMemcpy etc., so we do not #define HPX_CUBLAS_DEMO_WITH_ALLOCATOR

#define BOOST_NO_CXX11_ALLOCATOR
//
#include <hpx/hpx.hpp>
#include <hpx/hpx_init.hpp>
#include <hpx/include/parallel_copy.hpp>
@@ -24,15 +34,21 @@
#include <hpx/include/parallel_executors.hpp>
#include <hpx/include/parallel_executor_parameters.hpp>
//
#include <hpx/compute/cuda/target.hpp>
#include <hpx/compute/cuda/allocator.hpp>
#include <hpx/include/compute.hpp>

#include <hpx/compute/cuda/target.hpp>
#ifdef HPX_CUBLAS_DEMO_WITH_ALLOCATOR
# include <hpx/compute/cuda/allocator.hpp>
#endif
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
//
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <sstream>
#include <utility>
#include <vector>

const char *_cudaGetErrorEnum(cublasStatus_t error);

@@ -45,8 +61,11 @@ struct cublas_helper
{
public:
using future_type = hpx::future<void>;

#ifdef HPX_CUBLAS_DEMO_WITH_ALLOCATOR
using allocator_type = typename hpx::compute::cuda::allocator<T>;
using vector_type = typename hpx::compute::vector<T, allocator_type>;
#endif

// construct a cublas stream
cublas_helper(std::size_t device=0) : target_(device) {
@@ -65,7 +84,7 @@ struct cublas_helper
// that you would use for a cublas call except the cublas handle which is omitted
// as the wrapper will supply that for you
template <typename Func, typename ...Args>
void cublas_wrapper(Func && cublas_function, Args&&... args)
void operator()(Func && cublas_function, Args&&... args)
{
// make sure this operation takes place on our stream
return_ = cublasSetStream(handle_, stream_);
@@ -88,7 +107,15 @@
static void cublas_error(cublasStatus_t err) {
if (err != CUBLAS_STATUS_SUCCESS) {
std::stringstream temp;
temp << "cublasDestroy returned error code " << _cudaGetErrorEnum(err);
temp << "cublas function returned error code " << _cudaGetErrorEnum(err);
throw std::runtime_error(temp.str());
}
}

static void cuda_error(cudaError_t err) {
if (err != cudaSuccess) {
std::stringstream temp;
temp << "cuda function returned error code " << cudaGetErrorString(err);
throw std::runtime_error(temp.str());
}
}
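
The body of get_future() used below is not shown in this diff (it lives in hpx/compute/cuda/target.hpp). As a rough sketch only, not the actual HPX implementation: one way to futurize the completion of work queued on a CUDA stream is to register a host callback that fulfils a promise. All names here are illustrative.

// minimal sketch: turn "everything queued on this stream so far has finished"
// into a std::future<void> via a stream callback; error checking omitted
#include <cuda_runtime.h>
#include <future>
#include <memory>
#include <stdexcept>

std::future<void> futurize_stream(cudaStream_t stream)
{
    // the promise must outlive this function; the callback deletes it
    std::promise<void>* promise = new std::promise<void>();
    std::future<void> result = promise->get_future();
    cudaStreamAddCallback(stream,
        [](cudaStream_t, cudaError_t status, void* user_data) {
            // stream callbacks run on a CUDA driver thread and must not
            // make CUDA API calls themselves
            std::unique_ptr<std::promise<void>> p(
                static_cast<std::promise<void>*>(user_data));
            if (status == cudaSuccess)
                p->set_value();
            else
                p->set_exception(std::make_exception_ptr(
                    std::runtime_error(cudaGetErrorString(status))));
        },
        promise, 0);
    return result;
}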
@@ -140,7 +167,7 @@ inline bool
compare_L2_err(const float *reference, const float *data,
const unsigned int len, const float epsilon)
{
assert(epsilon >= 0);
HPX_ASSERT(epsilon >= 0);

float error = 0;
float ref = 0;
@@ -166,35 +193,32 @@ compare_L2_err(const float *reference, const float *data,
}

// -------------------------------------------------------------------------
// not all of these are supported by all cuda/cublas versions
// comment them out if they cause compiler errors
const char *_cudaGetErrorEnum(cublasStatus_t error)
{
switch (error)
{
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";

case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";

case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";

case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";

case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";

case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";

case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";

case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}

return "<unknown>";
}

@@ -222,108 +246,152 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it
hpx::parallel::for_each(par, h_B.begin(), h_B.end(), randfunc);

// create a cublas helper object we'll use to futurize the cuda events
cublas_helper<T> cublas(device);

using cublas_future = typename cublas_helper<T>::future_type;

#ifdef HPX_CUBLAS_DEMO_WITH_ALLOCATOR
// for convenience
using device_allocator = typename cublas_helper<T>::allocator_type;
using device_vector = typename cublas_helper<T>::vector_type;
cublas_helper<T> cublas(device);
// The policy used in the parallel algorithms
auto policy = hpx::parallel::execution::par;

// Create a cuda allocator
device_allocator alloc(cublas.target());

// Allocate device memory
device_vector d_A(size_A, alloc);
device_vector d_B(size_B, alloc);
device_vector d_C(size_C, alloc);

// The policy used in the parallel algorithms, just used default for now
auto policy = hpx::parallel::execution::par;
device_vector d_vA(size_A, alloc);
device_vector d_vB(size_B, alloc);
device_vector d_vC(size_C, alloc);

// copy host memory to device
hpx::parallel::copy(policy, h_A.begin(), h_A.end(), d_A.begin());
hpx::parallel::copy(policy, h_B.begin(), h_B.end(), d_B.begin());
hpx::parallel::copy(policy, h_A.begin(), h_A.end(), d_vA.begin());
hpx::parallel::copy(policy, h_B.begin(), h_B.end(), d_vB.begin());

// just to make the rest of the code the same for both cases
T *d_A=d_vA.device_data();
T *d_B=d_vB.device_data();
T *d_C=d_vC.device_data();

#else
T *d_A, *d_B, *d_C;
cublas_helper<T>::cuda_error(
cudaMalloc((void **) &d_A, size_A*sizeof(T)));

cublas_helper<T>::cuda_error(
cudaMalloc((void **) &d_B, size_B*sizeof(T)));

cublas_helper<T>::cuda_error(
cudaMalloc((void **) &d_C, size_C*sizeof(T)));

cublas_helper<T>::cuda_error(
cudaMemcpy(d_A, h_A.data(), size_A*sizeof(T), cudaMemcpyHostToDevice));

cublas_helper<T>::cuda_error(
cudaMemcpy(d_B, h_B.data(), size_B*sizeof(T), cudaMemcpyHostToDevice));

#endif

// create and start timer
std::cout << "Computing result using CUBLAS...\n";
// CUBLAS version 2.0
const T alpha = 1.0f;
const T beta = 0.0f;
//

// Perform warmup operation with cublas
// note that cublas uses column-major ordering, so we transpose the operand order
//
hpx::util::high_resolution_timer t1;
//
cublas.cublas_wrapper(
std::cout << "calling CUBLAS...\n";
cublas(
&cublasSgemm,
CUBLAS_OP_N, CUBLAS_OP_N,
matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha,
d_B.device_data(), matrix_size.uiWB,
d_A.device_data(), matrix_size.uiWA,
d_B, matrix_size.uiWB,
d_A, matrix_size.uiWA,
&beta,
d_C.device_data(), matrix_size.uiWA);
d_C, matrix_size.uiWA);

// wait until the operation completes
cublas.get_future().get();
// .then(
// [&t1](hpx::future<void> &&f) {

double us1 = t1.elapsed_microseconds();
std::cout << "warmup: elapsed_microseconds " << us1 << std::endl;
// }
// );

// create a second stream for the main calculation
// cublas_helper<T> cublas2(device);
// once the future has been retrieved, the next call to
// get_future will create a new event and a new future so
// we can reuse the same cublas wrapper object and stream if we want

hpx::util::high_resolution_timer t2;
for (int j = 0; j < iterations; j++)
{
cublas.cublas_wrapper(
for (std::size_t j=0; j<iterations; j++) {
cublas(
&cublasSgemm,
CUBLAS_OP_N, CUBLAS_OP_N,
matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha,
d_B.device_data(), matrix_size.uiWB,
d_A.device_data(), matrix_size.uiWA,
d_B, matrix_size.uiWB,
d_A, matrix_size.uiWA,
&beta,
d_C.device_data(), matrix_size.uiWA);
d_C, matrix_size.uiWA);
}

cublas.get_future().get();
double us2 = t2.elapsed_microseconds();
std::cout << "actual: elapsed_microseconds " << us2
<< " iterations " << iterations << std::endl;

// Compute and print the performance
double usecPerMatrixMul = us2 / iterations;
double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA *
(double)matrix_size.uiHA * (double)matrix_size.uiWB;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9) / (usecPerMatrixMul / 1e6);
printf(
"Performance = %.2f GFlop/s, Time = %.3f msec/iteration, Size = %.0f Ops\n",
gigaFlops,
1e-3*usecPerMatrixMul,
flopsPerMatrixMul);

// copy result from device to host
hpx::parallel::copy(policy, d_C.begin(), d_C.end(), h_CUBLAS.begin());

// compute reference solution on the CPU
std::cout << "\nComputing result using host CPU...\n";
// allocate storage for the CPU result
std::vector<T> reference(size_C);

hpx::util::high_resolution_timer t3;
matrixMulCPU<T>(reference.data(), h_A.data(), h_B.data(),
matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
double us3 = t3.elapsed_microseconds();
std::cout << "CPU elapsed_microseconds (1 iteration) " << us3 << std::endl;

// check result (CUBLAS)
bool resCUBLAS = compare_L2_err(reference.data(), h_CUBLAS.data(), size_C, 1.0e-6f);
if (resCUBLAS != true) {
throw std::runtime_error("matrix CPU/GPU comparison error");
}
// if the result was incorrect, we throw an exception, so here it's ok
std::cout << "\nComparing CUBLAS Matrix Multiply with CPU results: OK \n";
// attach a continuation to the cublas future
auto new_future = cublas.get_future().then([&](cublas_future &&f)
{
double us2 = t2.elapsed_microseconds();
std::cout << "actual: elapsed_microseconds " << us2
<< " iterations " << iterations << std::endl;

// Compute and print the performance
double usecPerMatrixMul = us2 / iterations;
double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA *
(double)matrix_size.uiHA * (double)matrix_size.uiWB;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9) / (usecPerMatrixMul / 1e6);
printf(
"Performance = %.2f GFlop/s, Time = %.3f msec/iter, Size = %.0f Ops\n",
gigaFlops,
1e-3*usecPerMatrixMul,
flopsPerMatrixMul);
}
);

// wait for the timing to complete, and then do a CPU comparison
auto finished = new_future.then([&](cublas_future &&f) {
// compute reference solution on the CPU
std::cout << "\nComputing result using host CPU...\n";
#ifdef HPX_CUBLAS_DEMO_WITH_ALLOCATOR
// copy result from device to host
hpx::parallel::copy(policy, d_C.begin(), d_C.end(), h_CUBLAS.begin());
#else
cublas_helper<T>::cuda_error(
cudaMemcpy(h_CUBLAS.data(), d_C, size_C*sizeof(T), cudaMemcpyDeviceToHost));
#endif

// compute reference solution on the CPU
// allocate storage for the CPU result
std::vector<T> reference(size_C);

hpx::util::high_resolution_timer t3;
matrixMulCPU<T>(reference.data(), h_A.data(), h_B.data(),
matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
double us3 = t3.elapsed_microseconds();
std::cout << "CPU elapsed_microseconds (1 iteration) " << us3 << std::endl;

// check result (CUBLAS)
bool resCUBLAS = compare_L2_err(reference.data(), h_CUBLAS.data(), size_C, 1e-6);
if (resCUBLAS != true) {
throw std::runtime_error("matrix CPU/GPU comparison error");
}
// if the result was incorrect, we throw an exception, so here it's ok
std::cout << "\nComparing CUBLAS Matrix Multiply with CPU results: OK \n";
});

finished.get();
#ifndef HPX_CUBLAS_DEMO_WITH_ALLOCATOR
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
#endif
}

// -------------------------------------------------------------------------
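The warmup and timed calls above exploit cuBLAS's column-major storage: a row-major C = A x B is computed as the column-major product of B and A with the operand order swapped, as the "transpose the operand order" comment notes. A standalone sketch of the same trick follows; the function name is illustrative and error checking is omitted.

// sketch: row-major C = A * B via column-major cublasSgemm
// A is M x K, B is K x N, C is M x N, all row-major device pointers
#include <cublas_v2.h>

void row_major_sgemm(cublasHandle_t handle, int M, int N, int K,
    const float* A, const float* B, float* C)
{
    const float alpha = 1.0f;
    const float beta  = 0.0f;
    // a row-major matrix viewed column-major is its transpose, and
    // C^T = B^T * A^T, so pass B first and A second
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
        N, M, K,            // dimensions of C^T, which is N x M
        &alpha,
        B, N,               // B^T is N x K, leading dimension N
        A, K,               // A^T is K x M, leading dimension K
        &beta,
        C, N);              // C^T is N x M, leading dimension N
}
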
4 changes: 2 additions & 2 deletions hpx/compute/cuda/target.hpp
@@ -82,8 +82,8 @@ namespace hpx { namespace compute { namespace cuda
void init_processing_units();
friend struct target;

mutable mutex_type mtx_;
int device_;
mutable mutex_type mtx_;
int device_;
std::size_t processing_units_;
std::size_t processor_family_;
std::string processor_name_;
