In [0]:
import numpy as np

In [0]:
def sum_cpu(mat):
  """Return the sum of every row of a 2-D array, computed on the CPU.

  Args:
    mat: 2-D NumPy array of shape (rows, cols).

  Returns:
    1-D float64 array of length ``rows``; element ``i`` is the sum of
    ``mat[i, :]``.
  """
  # unpacking two values also rejects inputs that are not 2-D
  n_rows, _ = mat.shape
  res=np.zeros(n_rows)
  # iterate over ROWS: the result has one slot per row, so the loop bound
  # must be the row count (using the column count breaks non-square input)
  for i in range(n_rows):
    # NOTE(review): builtin sum is kept as-is; presumably this function is
    # the deliberately naive baseline for the timing comparison.
    res[i]=sum(mat[i,:])
  return res

In [0]:
# 1024x1024 matrix of standard-normal samples, converted to single precision
# (the sumMat CUDA kernel in this notebook reads `float` data).
M = np.random.randn(1024, 1024).astype(np.float32)

In [0]:
# Benchmark the CPU row-sum on M.
%timeit res2=sum_cpu(M)

In [0]:
!pip install pycuda

In [0]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

In [0]:
# CUDA module holding sumMat: each thread sums one row of a row-major matrix
# with `Width` columns and writes the total to Mres at that row's index.
# There is no bounds check (the signature carries no row count), so the launch
# must provide exactly one thread per matrix row.
kernels=SourceModule("""
__global__ void sumMat(float * Md, float * Mres, int Width)
{
    // Global thread index = row handled by this thread. For a single-block
    // launch blockIdx.x is 0, so this equals threadIdx.x.
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    float Pvaleur = 0;
    for (int i = 0; i < Width; i++)
    {
        Pvaleur += Md[row * Width + i];
    }
    Mres[row] = Pvaleur;
}
""")

In [0]:
def sum_gpu(mat):
  """Return the sum of every row of a 2-D array, computed by the sumMat kernel.

  The matrix is copied to the device as contiguous float32, the kernel is
  launched as a single block with one thread per row, and the per-row sums
  are copied back to the host.

  Args:
    mat: 2-D NumPy array of shape (rows, cols).

  Returns:
    1-D float32 array of length ``rows``.

  Raises:
    ValueError: if ``rows`` exceeds the number of threads a single block may
      hold for this kernel.
  """
  # the kernel reads raw `float` values laid out row by row, so normalise the
  # input (no copy is made when it is already contiguous float32)
  mat = np.ascontiguousarray(mat, dtype=np.float32)
  dimx,dimy = mat.shape
  # one result per ROW: the launch below starts dimx threads, each writing one value
  res=np.zeros(dimx,np.float32)
  if mat.size == 0:
    # nothing to allocate or launch; the row sums of an empty matrix are all 0
    return res
  #choose kernel
  matsum = kernels.get_function("sumMat")
  max_threads = matsum.get_attribute(cuda.function_attribute.MAX_THREADS_PER_BLOCK)
  if dimx > max_threads:
    raise ValueError(
        "sum_gpu launches a single block with one thread per row: "
        "got %d rows, limit is %d" % (dimx, max_threads))
  # allocation on gpu; try/finally releases the buffers even if a copy or
  # the launch raises
  mat_gpu = cuda.mem_alloc(mat.nbytes)
  try:
    res_gpu = cuda.mem_alloc(res.nbytes)
    try:
      # copy data on gpu
      cuda.memcpy_htod(mat_gpu, mat)
      # launch kernel: the scalar argument must be a sized NumPy type
      width=np.int32(dimy)
      matsum(mat_gpu,res_gpu,width,block=(dimx,1,1))
      cuda.memcpy_dtoh(res,res_gpu)
    finally:
      res_gpu.free()
  finally:
    mat_gpu.free()
  return res

In [0]:
# Benchmark the GPU row-sum on M. The timed call includes device allocation,
# the host-to-device and device-to-host copies, and the frees, not just the kernel.
%timeit res2=sum_gpu(M)