In [None]:
# Demo: element-wise addition of two random matrices on the GPU with PyCUDA.
# All imports for this cell are gathered here so its dependencies are visible
# at a glance.
import numpy.linalg as la

import pycuda.gpuarray as gpuarray
import pycuda.autoinit  # imported only for its side effect (PyCUDA's automatic initialisation)
from pycuda.curandom import rand as curand
from pycuda.elementwise import ElementwiseKernel

# Define the elementwise `add()` kernel: for every index i, c[i] = a[i] + b[i].
# The three strings are the C argument list, the per-element operation and the
# kernel name.
add = ElementwiseKernel(
        "float *a, float *b, float *c",
        "c[i] = a[i] + b[i]",
        "add")

# Create a couple of random matrices with a given shape directly on the GPU.
shape = 128, 1024
a_gpu = curand(shape)
b_gpu = curand(shape)

# `print(x)` with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(a_gpu)
print(b_gpu)

# Compute the sum on the GPU, writing into a preallocated output array.
c_gpu = gpuarray.empty_like(a_gpu)
add(a_gpu, b_gpu, c_gpu)
print(c_gpu)

# Check the result against gpuarray's own `+` operator. The residual is
# computed once and reused for both the printout and the assertion.
residual_gpu = c_gpu - (a_gpu + b_gpu)
print(residual_gpu)
assert la.norm(residual_gpu.get()) < 1e-5, \
    "custom add kernel disagrees with gpuarray addition"

In [5]:
# Demo: square every element of a small matrix on the GPU, then sum each row
# with scikit-cuda.
import numpy as np

import pycuda.gpuarray as gpuarray
import pycuda.autoinit  # imported only for its side effect (PyCUDA's automatic initialisation)
import skcuda.misc as misc
from pycuda.elementwise import ElementwiseKernel

# Define the elementwise `square()` kernel: for every index i, b[i] = a[i] * a[i].
square = ElementwiseKernel(
        "float *a, float *b",
        "b[i] = a[i]*a[i]",
        "square")

# One-time scikit-cuda setup, called before any other skcuda.misc routine.
misc.init()

# Build a small, fixed 4x2 float32 matrix and copy it to the GPU. A hand-written
# input keeps the printed results easy to verify by eye.
dt1 = np.dtype(np.float32)
a_gpu = gpuarray.to_gpu(
    np.array([[1., 2.], [2., 1.], [1., 0.], [0., 1.]], dtype=dt1))
# `print(x)` with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(a_gpu)

# Compute the element-wise square on the GPU into a preallocated output array.
b_gpu = gpuarray.empty_like(a_gpu)
square(a_gpu, b_gpu)
print(b_gpu)

# Zero-filled 4x1 placeholder, printed only to show the "before" state; the
# name is rebound to a fresh array by misc.sum() below, this buffer is not
# written into.
gpu_sum = gpuarray.to_gpu(np.array([[0.], [0.], [0.], [0.]], dtype=dt1))
print(gpu_sum)

# Sum the squared values along axis 1 (one total per row); keepdims=True keeps
# the result as a 4x1 column instead of a flat vector.
gpu_sum = misc.sum(b_gpu, axis=1, keepdims=True)

print(gpu_sum)



[[ 1.  2.]
 [ 2.  1.]
 [ 1.  0.]
 [ 0.  1.]]
[[ 1.  4.]
 [ 4.  1.]
 [ 1.  0.]
 [ 0.  1.]]
[[ 0.]
 [ 0.]
 [ 0.]
 [ 0.]]
[[ 5.]
 [ 5.]
 [ 1.]
 [ 1.]]
