In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray

import numpy as np


# Problem size (number of vector elements) and threads per block.
N = 100
BLOCKSIZE = 256


# One-dimensional launch geometry: BLOCKSIZE threads per block, and just
# enough blocks to give every element its own thread. Ceiling division avoids
# launching an extra, entirely idle block when N is an exact multiple of
# BLOCKSIZE (which `N // BLOCKSIZE + 1` would do). The max() keeps the grid
# non-empty even for N == 0.
blockDim = (BLOCKSIZE, 1, 1)
gridDim = (max(1, (N + BLOCKSIZE - 1) // BLOCKSIZE), 1, 1)


# Host-side operands: two length-N vectors of standard-normal samples cast to
# float32 (drawn in the order a, then b), plus a zero-filled float32 vector of
# the same shape for the result.
a, b = (np.random.randn(N).astype(np.float32) for _ in range(2))
c = np.zeros_like(a)

# Copy each host array to the GPU.
a_gpu, b_gpu, c_gpu = (gpuarray.to_gpu(host_array) for host_array in (a, b, c))


# CUDA C source for the element-wise addition kernel. Each thread derives a
# global index from its block and thread coordinates and, if that index is
# below N, writes a[idx] + b[idx] into c[idx]; threads past the end do nothing.
kernel_source = """
  __global__ void add(float *a,float *b, float *c, int N)
  {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx < N)
      c[idx] = a[idx] + b[idx];
  }
  """

# Compile the source and look up the "add" kernel as a Python callable.
mod = SourceModule(kernel_source)
prog = mod.get_function("add")

# Launch the kernel over the configured grid. The kernel declares its length
# parameter as a signed `int N`, so pass a matching 32-bit signed scalar
# (np.int32) instead of np.uint32.
prog(a_gpu, b_gpu, c_gpu, np.int32(N), block=blockDim, grid=gridDim)

# Copy the result back to the host once, then verify it against the NumPy
# reference sum before displaying it.
c_result = c_gpu.get()
np.testing.assert_allclose(c_result, a + b, rtol=1e-6)

print(c_result)

[-0.3413977  -0.8673736  -1.2212161   0.23201346 -0.5023408  -1.8088138
 -0.42164916  0.98606944  1.9341152  -0.88601536  2.763887   -0.35668355
  0.4176522   2.3436553  -1.8039987  -2.0589917   0.11854816 -1.07535
 -0.28461075  1.9238496   0.8710293  -0.8657713  -1.610158    1.1943884
 -2.1967857  -0.2109611  -1.2932683   1.900486   -0.27927312  2.358869
 -1.2632034  -2.1440766  -2.8227894  -1.1362706  -0.64406896 -2.1547725
 -2.1537743   1.0122833  -0.35201728 -0.6632788   0.9504212   0.2854757
 -0.36270833 -0.26427156  0.4764403  -0.14816986  0.52196574  1.2513584
 -1.9866517   1.4798868   0.54385275  1.9297974   1.7525442   0.23280084
 -1.3959584  -1.9868478  -1.0130639   1.530908   -1.6170661   0.56127596
  0.3032242   0.29318982 -2.6221852  -1.2955086   0.2475099   0.04702878
  0.925992   -0.8719675   2.1406407   0.2980359  -0.58425075  0.16605598
  0.32950097 -0.4070064   0.02167684 -1.7539539   0.5561815   1.2282352
 -0.5605154   0.07263827  2.9388661  -0.01647425 -0.03634399 -