Let's import the libraries.

In [None]:
#!pip install pycuda # install cuda
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

Define the CUDA kernel.

In [None]:
# Compile the CUDA source for a single pass of a pairwise sum reduction.
# Each launch adds xs[index + stride] into xs[index] for the first `stride`
# elements of the vector.
modd = SourceModule("""
__global__ void sequential_reduction(double* xs, int stride, int size)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  // Only the first `stride` threads accumulate. Without the `index < stride`
  // guard, a thread with index >= stride may overwrite xs[index] while
  // another thread is reading that same element as its xs[index + stride]
  // operand, which is a data race that can double-count values.
  if(index < stride && index + stride < size)
  {
    xs[index] += xs[index + stride];
  }
}""")

In [None]:
# Set up tests.

import math
import time
import numpy as np

vector_size = 2**22
# The stride-halving reduction below only visits every element when the
# vector length is a power of two, so fail loudly instead of summing wrongly.
assert vector_size > 0 and vector_size & (vector_size - 1) == 0, \
    "vector_size must be a power of two"
# Explicit 64-bit float so the host buffer matches the kernel's `double*`.
value_type = np.float64

numThreadsPerBlock = 1024
# Integer ceil-division: enough blocks to give every element one thread.
numBlocks = (vector_size + numThreadsPerBlock - 1) // numThreadsPerBlock

# Exact log2 for a power of two (no float rounding): one pass per halving.
num_iterations = vector_size.bit_length() - 1

# Create the input vector.
a = np.random.randn(vector_size)
a = a.astype(value_type)
# Host-side reference to the input, used later for the CPU check.
a_cpu = a

# Allocate the memory on the GPU and copy the vector.
a_mem_size = a.nbytes
a_gpu = cuda.mem_alloc(a_mem_size)
cuda.memcpy_htod(a_gpu, a)

# Host buffer that receives the device data after the reduction.
result = np.zeros_like(a)

Now we call the kernel once per reduction pass, halving the stride each time.

In [None]:
# Look the kernel up once; the same handle is reused for every launch.
sum_kernel = modd.get_function("sequential_reduction")

# One launch per pass, with strides vector_size/2, vector_size/4, ..., 1.
for i in range(1, num_iterations+1):
  # A right shift keeps the stride an exact int rather than a float quotient.
  stride = vector_size >> i
  sum_kernel(a_gpu, np.int32(stride), np.int32(vector_size), block=(numThreadsPerBlock,1,1), grid=(numBlocks,1,1))

Then we can copy back the data and verify the results.

In [None]:
# Copy the device buffer back to the host; element 0 is the value checked below.
cuda.memcpy_dtoh(result, a_gpu)

# Reference total computed by NumPy on the host copy of the input.
a_cpu = a_cpu.sum()

# True when the GPU and CPU totals agree within rtol = atol = 1e-3.
np.allclose(a_cpu, result[0], rtol=0.001, atol=0.001)