In [5]:
import pyopencl as cl
import numpy
import numpy.linalg as la
import datetime
from time import time

# Host-side inputs: two vectors of ten million uniform random samples cast to
# float32, plus an uninitialised array of the same shape and dtype that
# receives the CPU reference result.
a, b = (numpy.random.rand(10000000).astype(numpy.float32) for _ in range(2))
c_result = numpy.empty(a.shape, dtype=a.dtype)

# CPU baseline, timed with the wall clock.  Only the first 1000 elements are
# processed and each one is recomputed 1000 times, so every entry of c_result
# past index 999 is left uninitialised.
t1 = time()
for idx in range(1000):
    for _ in range(1000):
        c_result[idx] = a[idx] + b[idx]
        c_result[idx] *= a[idx] + b[idx]
        c_result[idx] *= a[idx] / 99.0
t2 = time()
print("Execution time using CPU: ", t2 - t1, "s")

# Run the same arithmetic as an OpenCL kernel on every device of every
# platform, report the kernel's profiled run time, check the result against
# the CPU reference and say which side was faster.
# NOTE(review): get_devices() is unfiltered, so "GPU" in the messages below
# may actually refer to a CPU or accelerator device.
for platform in cl.get_platforms():
    for device in platform.get_devices():
        # One context and one profiling-enabled queue per device, so the
        # kernel's start/end timestamps can be read from its event.
        ctx = cl.Context([device])
        queue = cl.CommandQueue(
            ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

        mf = cl.mem_flags
        a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
        b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
        dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

        prg = cl.Program(ctx, """
        __kernel void sum(__global const float *a,
        __global const float *b, __global float *c)
          {
          int loop;
          int gid = get_global_id(0);
          for(loop=0; loop<1000;loop++)
            {
            c[gid] = a[gid] + b[gid];
            c[gid] = c[gid] * (a[gid] + b[gid]);
            c[gid] = c[gid] * (a[gid] / 99.0);
            }
          }
          """).build()

        # One work-item per element of `a`; the work-group size is left to
        # the OpenCL runtime (local size None).
        exec_evt = prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
        exec_evt.wait()
        # Profiling timestamps are in nanoseconds; convert to seconds.
        t_gpu = 1e-9 * (exec_evt.profile.end - exec_evt.profile.start)

        print ("Execution time using PyopenCL: %g s" % t_gpu)

        # Read the kernel output back to the host.  enqueue_copy takes
        # (queue, dest, src): the host array is the destination here.  The
        # previous argument order uploaded the uninitialised host array.
        c = numpy.empty_like(a)
        cl.enqueue_copy(queue, c, dest_buf).wait()

        # Only the first 1000 entries of c_result hold CPU reference values.
        # Compare with a tolerance: host and device may round the
        # intermediate products differently, so exact equality is too strict.
        n_checked = 1000
        if not numpy.allclose(c[:n_checked], c_result[:n_checked], rtol=1e-5):
            print ("Warning: device results differ from the CPU reference")

        # The verdict is about speed, so base it on the measured times.
        # NOTE: the CPU time (t2 - t1) is wall-clock time for 1000 elements,
        # while t_gpu is the profiled kernel time over all of `a`; the two
        # workloads are not the same size.
        t_cpu = t2 - t1
        if t_gpu < t_cpu:
            print ("GPU execution is faster")
        else:
            print ("CPU execution is faster")


Platform name: NVIDIA CUDA
Platform profile: FULL_PROFILE
Platform vendor: NVIDIA Corporation
Platform version: OpenCL 1.2 CUDA 10.1.152
Device name: Tesla T4
Device type: GPU
Device memory:  15079 MB
Device max clock speed: 1590 MHz
Device compute units: 40
