This is an addition of two vectors in python.

In [1]:
import numpy as np
from timeit import default_timer as timer

def VectorAdd(a, b, c):
  for i in range(a.size):
    c[i] = a[i] + b[i]

def main():
  N = 32000000 # Number of elements per Array

  A = np.ones(N, dtype=np.float32)
  B = np.ones(N, dtype=np.float32)
  C = np.zeros(N, dtype=np.float32)

  start = timer()
  VectorAdd(A, B, C)
  vectoradd_time = timer() - start
  
  print("C[:5] = " + str(C[:5]))
  print("C[-5:] = " + str(C[-5:]))

  print("Vector Add took %f seconds" % vectoradd_time)

if __name__ == '__main__':
  main()

C[:5] = [ 2.  2.  2.  2.  2.]
C[-5:] = [ 2.  2.  2.  2.  2.]
Vector Add took 6.255466 seconds


This is an addition of two vectors in numba.

In [1]:
import numpy as np
from timeit import default_timer as timer
from numba import vectorize

import os

@vectorize(["float32(float32, float32)"], target='cuda')
def VectorAdd(a, b):
  return a + b

def main():
  
  os.environ['NUMBAPRO_NVVM']="/usr/lib/x86_64-linux-gnu/libnvvm.so"
  os.environ['NUMBAPRO_LIBDEVICE']="/usr/lib/nvidia-cuda-toolkit/libdevice"
    
  N = 32000000 # Number of elements per Array

  A = np.ones(N, dtype=np.float32)
  B = np.ones(N, dtype=np.float32)
  C = np.zeros(N, dtype=np.float32)

  start = timer()
  C = VectorAdd(A, B)
  vectoradd_time = timer() - start
  
  print("C[:5] = " + str(C[:5]))
  print("C[-5:] = " + str(C[-5:]))

  print("Vector Add took %f seconds" % vectoradd_time)

if __name__ == '__main__':
  main()

C[:5] = [ 2.  2.  2.  2.  2.]
C[-5:] = [ 2.  2.  2.  2.  2.]
Vector Add took 0.258566 seconds


This is an addition of two vectors in pycuda.

In [3]:
import numpy as np
from timeit import default_timer as timer
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

def main():
  N = 32000000 # Number of elements per Array

  A = gpuarray.to_gpu(np.ones(N, dtype=np.float32))
  B = gpuarray.to_gpu(np.ones(N, dtype=np.float32))

  start = timer()
  C = (A + B).get()
  vectoradd_time = timer() - start
    
  print("C[:5] = " + str(C[:5]))
  print("C[-5:] = " + str(C[-5:]))

  print("Vector Add took %f seconds" % vectoradd_time)
    
if __name__ == '__main__':
  main()

C[:5] = [ 2.  2.  2.  2.  2.]
C[-5:] = [ 2.  2.  2.  2.  2.]
Vector Add took 0.194794 seconds


This one is from tutorial

In [52]:
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy

a_gpu = gpuarray.to_gpu(numpy.random.randn(4,4).astype(numpy.float32))
a_doubled = (2 * a_gpu).get()
print(a_doubled)
print(a_gpu)

[ 2.  2.  2.  2.  2.  2.  2.  2.  2.  2.]
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


This one failed, how to determine blocksize?

In [55]:
import numpy as np
from timeit import default_timer as timer
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
    
def main():
  N = 32000000 # Number of elements per Array

  A = np.ones(N, dtype=np.float32)
  B = np.ones(N, dtype=np.float32)
  C = np.zeros(N, dtype=np.float32)
    
  A_gpu = cuda.mem_alloc(A.nbytes)
  B_gpu = cuda.mem_alloc(B.nbytes)
  C_gpu = cuda.mem_alloc(C.nbytes)
    
  cuda.memcpy_htod(A_gpu, A)
  cuda.memcpy_htod(B_gpu, B)
  cuda.memcpy_htod(C_gpu, C)

  mod = SourceModule("""__global__ void VectorAdd(float *a, float *b, float *c, int n) 
  {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n);
      c[i] = a[i] + b[i];
  }
  """)

  start = timer()
  func = mod.get_function("VectorAdd")
  func(A_gpu, B_gpu, C_gpu, N, block=(4,1,1), grid=(1,1))
  s = np.empty_like(C)
  vectoradd_time = timer() - start
  
  cuda.memcpy_dtoh(s, C_gpu)

  print("C[:5] = " + str(s[:5]))
  print("C[-5:] = " + str(s[-5:]))

  print("Vector Add took %f seconds" % vectoradd_time)

if __name__ == '__main__':
  main()

TypeError: invalid type on parameter #3 (0-based)

In [3]:
from ctypes import *

lib1 = cdll.LoadLibrary('/usr/local/cuida-8.0/cudnn6/lib64/libcudnn.so')

import tensorflow as tf

  return f(*args, **kwds)
