virtual functions on GPUs

I played around a bit with how one could use virtual functions and function pointers on GPUs. nvcc supports both, but the constructor must run on the GPU so that the vtable's function pointers end up pointing at device code rather than host code.

I could not find a really "nice" way to make this work, but did not try very hard either.
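For illustration (my own sketch, not one of the flavors below): the naive approach of constructing the object on the host and copying it over does not work, because the host constructor writes a host-side vtable pointer into the object.

// Hypothetical anti-example: this compiles, but calling foo->func() in a
// kernel afterwards follows a host function pointer; undefined behavior.
vtest host_obj;                 // vtable pointer set up for *host* code
vtest* d_foo;
cudaMalloc(&d_foo, sizeof(vtest));
cudaMemcpy(d_foo, &host_obj, sizeof(vtest), cudaMemcpyHostToDevice);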

Here are three flavors:

just plain virtual functions

// compile using:
// nvcc virt.cu

#include <stdio.h>
#include <stdlib.h>
#include <new>
#include <cuda.h>

class vtest
{
  public:
    virtual __device__ void func(float* a) {
      *a = 42.;
    }
};

__global__
void make_vtest(vtest* p)
{
  if(threadIdx.x == 0) {
    new (p) vtest; 
  }
}

__global__
void use_vtest(vtest* foo, int n, float* y) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if(i < n)
    foo->func(&y[i]);
}

int main(void)
{
  const size_t N = 10;
  float y[N];

  vtest* d_foo;
  cudaMalloc(&d_foo, sizeof(vtest)); 
  float* d_y;
  cudaMalloc(&d_y, sizeof(y));

  for(int i = 0 ; i < N ; i++) {
    y[i] = 1.;
  }
  cudaMemcpy(d_y, y, sizeof(y), cudaMemcpyHostToDevice);

  make_vtest<<<1, 1>>>(d_foo);
  cudaDeviceSynchronize();
  use_vtest<<<1, 256>>>(d_foo, N, d_y);

  cudaMemcpy(y, d_y, sizeof(y), cudaMemcpyDeviceToHost);

  for (int i = 0; i < N; i++) {
    printf("y[%d]: %g\n", i, y[i]);
  }

  cudaFree(d_y);
  // NB: cudaFree() releases the storage but does not run the destructor;
  // a device-side cleanup kernel (sketched below) would be needed for that
  cudaFree(d_foo);
}
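Since cudaFree() only releases the raw storage, properly tearing the object down would need the destructor to run on the device as well, mirroring the placement new. A minimal sketch of what that could look like (my addition; vtest's implicit destructor is trivial here, so this mostly matters for classes that actually do work in their destructor):

// Hypothetical cleanup kernel: the explicit destructor call pairs with the
// placement new in make_vtest.
__global__
void destroy_vtest(vtest* p)
{
  if(threadIdx.x == 0) {
    p->~vtest();
  }
}

// usage in main(), before cudaFree(d_foo):
//   destroy_vtest<<<1, 1>>>(d_foo);
//   cudaDeviceSynchronize();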

using C++ lambda functions

// compile using:
// nvcc -std=c++11 --expt-extended-lambda virtlambda.cu

#include <stdio.h>
#include <new>
#include <cuda.h>

class vtest
{
  public:
    virtual __device__ void func(float* a) const {
      *a = 42.;
    }
};

template <typename Function>
__global__ void call(Function f, float *y) {
  f(y);
}

__global__
void make_vtest(vtest* p)
{
  if(threadIdx.x == 0) {
    new (p) vtest; 
  }
}

vtest* make_vtest_gpu() {
  vtest* d_vtest;
  cudaMalloc(&d_vtest, sizeof(vtest)); 
  make_vtest<<<1, 1>>>(d_vtest);
  cudaDeviceSynchronize();
  return d_vtest;
}

int main(void)
{
  vtest *d_obj = make_vtest_gpu();
  // extended __device__ lambda; it captures the device pointer by value
  auto fun = [=] __device__ (float *a){
    d_obj->func(a);
  };

  const size_t N = 1;
  float y[N];

  float* d_y;
  cudaMalloc(&d_y, sizeof(y));

  for(int i = 0 ; i < N ; i++) {
    y[i] = 1.;
  }
  cudaMemcpy(d_y, y, sizeof(y), cudaMemcpyHostToDevice);

  call<<<1, 1>>>(fun, d_y);

  cudaMemcpy(y, d_y, sizeof(y), cudaMemcpyDeviceToHost);

  for (int i = 0; i < N; i++) {
    printf("y[%d]: %g\n", i, y[i]);
  }

  cudaFree(d_y);
  // d_obj was placement-new'd into cudaMalloc'd storage; its (trivial)
  // destructor does nothing here, so freeing the storage is enough
  cudaFree(d_obj);
}
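The make_vtest_gpu() helper is specific to vtest; if one wanted to construct arbitrary types on the device, the same pattern generalizes with templates. A sketch under the assumption that the constructor arguments are trivially copyable (my addition, untested in this form):

// Hypothetical generic version of make_vtest/make_vtest_gpu: placement-new
// any device-constructible type T into pre-allocated device memory
// (placement new needs <new>).
template <typename T, typename... Args>
__global__ void construct_on_device(T* p, Args... args)
{
  if(threadIdx.x == 0) {
    new (p) T(args...);
  }
}

template <typename T, typename... Args>
T* make_on_device(Args... args)
{
  T* d_p;
  cudaMalloc(&d_p, sizeof(T));
  construct_on_device<T><<<1, 1>>>(d_p, args...);
  cudaDeviceSynchronize();
  return d_p;
}

// usage:
//   vtest* d_obj = make_on_device<vtest>();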

calling new on the GPU to allocate memory

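// compile using:
// nvcc virtnew.cu   (file name is arbitrary; no extra flags needed)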
#include <stdio.h>
#include <stdlib.h>
#include <new>
#include <cuda.h>

class vtest
{
  public:
    virtual __device__ void func(float *a) {
        *a = 42;
    }
};

__global__
void make_vtest(vtest** p)
{
  if(threadIdx.x == 0) {
    *p = new vtest();
  }
}

__global__
void use_vtest(vtest** foo, int n, float* y) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if(i < n)
    (*foo)->func(&y[i]);
}

int main(void)
{
  // in AMReX one would use the arena allocator to get shared memory for the
  // pointer storage
  vtest** d_foo = 0;
  cudaMalloc(&d_foo, sizeof(vtest*));
  make_vtest<<<1, 1>>>(d_foo);

  // some storage to test things, AMReX would do this for us
  const size_t N = 10;
  float y[N];
  float* d_y;
  cudaMalloc(&d_y, sizeof(y));

  for(int i = 0 ; i < N ; i++) {
    y[i] = 1.0;
  }
  cudaMemcpy(d_y, y, sizeof(y), cudaMemcpyHostToDevice);

  cudaDeviceSynchronize(); // just to be sure...

  // use our GPU constructed object with virtual member functions
  use_vtest<<<1, 256>>>(d_foo, N, d_y);

  cudaMemcpy(y, d_y, sizeof(y), cudaMemcpyDeviceToHost);
  for (int i = 0; i < N; i++) {
    printf("y[%d]: %g\n", i, y[i]);
  }

  cudaFree(d_y);
  // *d_foo itself lives on the device heap; see the delete sketch below
  cudaFree(d_foo);
}
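One caveat with this flavor: memory obtained from device-side new lives on the device heap, which cudaFree() cannot release, so the delete has to happen on the GPU too. A minimal teardown sketch (my addition):

// Hypothetical teardown kernel: device-side new must be paired with
// device-side delete; the allocation lives on the device heap, which
// cudaFree() knows nothing about.
__global__
void delete_vtest(vtest** p)
{
  if(threadIdx.x == 0) {
    delete *p;
    *p = nullptr;
  }
}

// usage in main(), before cudaFree(d_foo):
//   delete_vtest<<<1, 1>>>(d_foo);
//   cudaDeviceSynchronize();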