In [0]:
# Print the versions of the CUDA compiler driver and the host C/C++ compilers
# visible to this runtime.
!nvcc --version
!gcc --version
!g++ --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243
gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [0]:
!sudo apt-get install python-numpy -y
!sudo apt-get install build-essential python-dev python-setuptools libboost-python-dev libboost-thread-dev -y
!apt-cache search pycuda
!apt install libnvidia-compute-390 python3-pycuda

In [0]:
! apt --fix-broken install

In [0]:
!apt-cache search pycuda
!apt-get install python3-pycuda


In [0]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import sys

# Vector addition on the GPU using raw device allocations and explicit copies.

# Make NumPy print arrays in full rather than eliding the middle with "...".
np.set_printoptions(threshold=sys.maxsize)


ELEM_NUMBER = 1024
BLOCK_SIZE = 256

# Use an explicit 64-bit dtype: the `np.int` alias was removed in NumPy 1.24,
# and a fixed-width type is guaranteed to match the kernel's `long long`.
A = np.random.randint(0, 2, dtype=np.int64, size=ELEM_NUMBER)
print (A[-3:])
B = np.random.randint(0, 2, dtype=np.int64, size=ELEM_NUMBER)
print (B[-3:])

# One device buffer per operand plus one for the result (same byte size).
A_gpu = cuda.mem_alloc(A.nbytes)
B_gpu = cuda.mem_alloc(B.nbytes)
C_gpu = cuda.mem_alloc(A.nbytes)


cuda.memcpy_htod(A_gpu, A)
cuda.memcpy_htod(B_gpu, B)

mod = SourceModule("""
    // Element-wise sum of two n-element vectors of 64-bit integers.
    __global__ void vecAdd(const long long *A, const long long *B,
                           long long *result, unsigned int n) {

      unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
      // The grid is rounded up to whole blocks, so surplus threads must not
      // touch memory past the end of the buffers.
      if (idx < n) {
        result[idx] = A[idx] + B[idx];
      }
    }
  """)

func = mod.get_function("vecAdd")


# Ceiling division so that every element is covered by some thread.
gridDim = ((A.size + BLOCK_SIZE - 1) // BLOCK_SIZE, 1)
blockDim = (BLOCK_SIZE,1,1)
print("block: "+str(blockDim)+" grid: "+str(gridDim))

# Scalar kernel arguments must be passed as sized NumPy scalars.
func(A_gpu, B_gpu, C_gpu, np.uint32(A.size), block=blockDim, grid=gridDim)

C = np.empty_like(A)
cuda.memcpy_dtoh(C, C_gpu)
print (C[-3:])

# Sanity check: the two totals printed below must agree ...
print(A.sum()+B.sum())
print(C.sum())
# ... and, more strictly, every element must match the host-side sum.
assert np.array_equal(C, A + B), "GPU result differs from A + B"




In [0]:
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule  # used below; previously relied on an earlier cell
import numpy  as np

# Vector addition on the GPU using pycuda.gpuarray to manage device memory.

NUMBER_ELEMENT = 1024
BLOCK_SIZE = 256

# Use an explicit 64-bit dtype: the `np.int` alias was removed in NumPy 1.24,
# and a fixed-width type is guaranteed to match the kernel's `long long`.
A = np.random.randint(0, 2, dtype=np.int64, size=NUMBER_ELEMENT)
print (A[-3:])

B = np.random.randint(0, 2, dtype=np.int64, size=NUMBER_ELEMENT)
print (B[-3:])

# Upload the operands; printing the round-tripped tail confirms the transfer.
A_gpu = gpuarray.to_gpu(A)
print (A_gpu.get()[-3:])

B_gpu = gpuarray.to_gpu(B)
print (B_gpu.get()[-3:])

# Allocate the output directly on the device; its contents are uninitialised
# until the kernel runs, so there is nothing meaningful to upload or print yet.
C_gpu = gpuarray.empty(A.shape, dtype=A.dtype)

mod = SourceModule("""
    // Element-wise sum of two n-element vectors of 64-bit integers.
    __global__ void vecAdd(const long long *A, const long long *B,
                           long long *result, unsigned int n) {

      unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
      // The grid is rounded up to whole blocks, so surplus threads must not
      // touch memory past the end of the buffers.
      if (idx < n) {
        result[idx] = A[idx] + B[idx];
      }
    }
  """)

# Ceiling division so that every element is covered by some thread.
gridDim = ((A.size + BLOCK_SIZE - 1) // BLOCK_SIZE, 1)
blockDim = (BLOCK_SIZE,1,1)
print("block: "+str(blockDim)+" grid: "+str(gridDim))

func = mod.get_function("vecAdd")
# Scalar kernel arguments must be passed as sized NumPy scalars.
func(A_gpu, B_gpu, C_gpu, np.uint32(A.size), block=blockDim, grid=gridDim)

result = C_gpu.get()
print (result[-3:])

# Compare against this cell's own result (not a `C` left over from another cell).
print(A.sum()+B.sum())
print(result.sum())
assert np.array_equal(result, A + B), "GPU result differs from A + B"
