In [208]:
#!pip install pycuda  # installs PyCUDA (the Python bindings for CUDA), not CUDA itself
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
from timeit import default_timer as timer

In [209]:
# CUDA kernel, compiled at runtime by SourceModule.
# times_two(A, B): each thread computes its global x-index
# (blockIdx.x * blockDim.x + threadIdx.x) and writes B[index] = A[index] * 2.
# Both arguments are pointers to double, so host arrays must be float64.
# NOTE(review): the kernel takes no length argument and performs no bounds
# check, so the launch configuration must start exactly one thread per array
# element -- extra threads would index past the end of A and B, and too few
# threads would leave part of B unwritten.
modd = SourceModule("""
__global__ void times_two(const double* A, double* B)
      {
      int index = blockIdx.x * blockDim.x + threadIdx.x;
      B[index] = A[index] * 2;
      }
  """)

In [210]:

import time  # NOTE(review): unused in this cell; kept in case other cells rely on it

# Create the input vector. The kernel's arguments are `double*`, so the host
# array must be float64.
n = 10
a = np.random.randn(n).astype(np.float64)

# The kernel has no bounds check, so the launch below starts exactly n threads
# in a single block. Make sure n fits into one block on this device before
# touching GPU memory.
max_threads_per_block = pycuda.autoinit.device.get_attribute(
    cuda.device_attribute.MAX_THREADS_PER_BLOCK
)
if not 0 < n <= max_threads_per_block:
    raise ValueError(
        f"n={n} must be between 1 and {max_threads_per_block} "
        "because times_two is launched as a single block without a bounds check"
    )

# Allocate the input buffer on the GPU and copy the vector to it.
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

# The output buffer only needs to be allocated, not initialised with
# memcpy_htod: the kernel never reads B, it overwrites every element with
# A[index] * 2, and the result is copied back to `b` afterwards.
b = np.empty_like(a)
b_gpu = cuda.mem_alloc(b.nbytes)

# Call the CUDA kernel.
# (The handle is called `vec_add` for historical reasons; it is the
# `times_two` kernel.)
vec_add = modd.get_function("times_two")

start_time_gpu = timer()
# One thread per element: the block size follows n instead of a hard-coded 10,
# so changing n can no longer cause out-of-bounds accesses or leave part of
# the output unwritten.
vec_add(a_gpu, b_gpu, block=(n, 1, 1), grid=(1, 1, 1))
# Kernel launches return before the GPU has finished; wait for completion so
# the timer covers the actual execution and not just the launch call.
cuda.Context.synchronize()
time_gpu = timer() - start_time_gpu

# Copy the result back to the host.
cuda.memcpy_dtoh(b, b_gpu)

a_gpu.free()
b_gpu.free()

# Do same calculation in CPU.
start_time_cpu = timer()
b_cpu = a * 2
time_cpu = timer() - start_time_cpu

# Verify the result. Exact comparison is intentional: doubling a float64 is
# exact, so GPU and CPU results should match bit for bit.
print(b)
print(b_cpu)
if np.array_equal(b_cpu, b):
    print("Both vectors are the same.")
else:
    print("Vectors are not equal, something went wrong.")

# NOTE: the GPU time covers kernel launch + execution only; host<->device
# copies and allocations are outside the timed region.
print(f"GPU: {time_gpu}")
print(f"CPU: {time_cpu}")

[-0.01805789  3.101486    1.53857938  3.99845852  1.66500995  1.83813516
  1.11465302  1.34364325  1.44129155 -0.02258287]
[-0.01805789  3.101486    1.53857938  3.99845852  1.66500995  1.83813516
  1.11465302  1.34364325  1.44129155 -0.02258287]
Both vectors are the same.
GPU: 8.89999937498942e-05
CPU: 5.649999366141856e-05
