In [None]:
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

# Define the host arrays: a random float32 input and an output buffer of the
# same shape/dtype that the kernel will fill.
entrada = np.random.randn(25).astype(np.float32)
dest = np.empty_like(entrada)


mod = SourceModule("""
__global__ void double_array(float *dest, float *a)
{
    int idx = blockIdx.x;
    dest[idx] = a[idx] * 2.0f;
}
""")

func = mod.get_function("double_array")

# The kernel indexes solely by blockIdx.x, so it must be launched with one
# single-thread block per element. Launching a single block of 25 threads
# would give every thread blockIdx.x == 0: only dest[0] would be written and
# the rest of `dest` would keep whatever np.empty_like left there.
func(drv.Out(dest), drv.In(entrada), block=(1,1,1), grid=(entrada.size,1))

# Fail loudly if the launch geometry and the kernel indexing ever disagree.
np.testing.assert_allclose(dest, entrada * 2)

print(entrada)
print(dest)

[-1.0313754   0.65717685 -1.381456   -0.29756856  0.9146532  -0.5338191
  1.3453711   0.55165726  0.09954301  0.4195113   0.95001704  1.3733405
 -1.7593514   0.99200207 -1.4499638   2.0484402   1.5329089   0.44167507
  1.3386099  -0.67810965  1.3372496   0.466937    2.2810564  -0.30548105
  1.0308346 ]


In [None]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

# Host input: 25 random float32 values. Defined here so the cell runs on a
# fresh kernel instead of depending on `a` / `a_gpu` leaking from another cell.
a = np.random.randn(25).astype(np.float32)

# Allocate a device buffer of the same size and upload the host data.
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

mod = SourceModule("""
__global__ void double_array(float *a)
{
    int idx = threadIdx.x;
    a[idx] *= 2.0f;
}
""")

func = mod.get_function("double_array")

# Single block with one thread per element; the kernel indexes by threadIdx.x
# and doubles the device buffer in place.
func(a_gpu, block=(a.size,1,1), grid=(1,1))

# Copy the result back to the CPU
a_doubled = np.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)

print("---- 1 BLOQUE × 25 HILOS ----")
print("Original:\n", a)
print("\nDuplicado:\n", a_doubled)


---- 1 BLOQUE × 25 HILOS ----
Original:
 [-0.45953873  0.85616237  0.75467515  1.8713226   0.91803014 -1.2722324
 -1.2070768  -0.76760125  1.0021741  -1.3428775   0.02521371  1.6669264
  0.615425    0.8858349  -0.8747064  -0.75332916  1.2882196   0.22895306
  0.5168961  -0.8477625  -1.2621367   0.46981317  1.1240951  -1.2136093
  1.1445384 ]

Duplicado:
 [-0.91907746  1.7123247   1.5093503   3.7426453   1.8360603  -2.5444648
 -2.4141536  -1.5352025   2.0043483  -2.685755    0.05042742  3.3338528
  1.23085     1.7716697  -1.7494128  -1.5066583   2.5764391   0.45790613
  1.0337923  -1.695525   -2.5242734   0.93962634  2.2481902  -2.4272187
  2.2890768 ]


kernel.cu

  mod = SourceModule("""


In [None]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

# Launch geometry: 25 blocks of 25 threads each, i.e. one thread per element.
NUM_BLOCKS = 25
THREADS_PER_BLOCK = 25

# Host input: one random float32 value per launched thread (625 in total).
a = np.random.randn(NUM_BLOCKS * THREADS_PER_BLOCK).astype(np.float32)

# Compile the kernel. Each thread derives its global index from its block and
# thread coordinates and doubles that element of the buffer in place.
mod = SourceModule("""
__global__ void double_array(float *a)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    a[idx] *= 2.0f;
}
""")
func = mod.get_function("double_array")

# Reserve device memory for the array and upload the host data.
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

# 25 blocks, 25 threads each
func(a_gpu, grid=(NUM_BLOCKS, 1), block=(THREADS_PER_BLOCK, 1, 1))

# Download the doubled values into a fresh host buffer; `a` keeps the
# original input for the comparison printed below.
a_doubled = np.empty(a.shape, dtype=a.dtype)
cuda.memcpy_dtoh(a_doubled, a_gpu)

print("---- 25 BLOQUES × 25 HILOS ----")
print("Original:\n", a)
print("\nDuplicado:\n", a_doubled)


---- 25 BLOQUES × 25 HILOS ----
Original:
 [ 8.29005241e-01  7.24028051e-01  1.07203782e+00  1.10274303e+00
  1.52961588e+00 -2.44013518e-01  9.22942102e-01  3.98413092e-01
  1.78868449e+00 -2.42511816e-02 -1.97866845e+00 -4.69175160e-01
  1.45614195e+00 -1.59425929e-01 -1.82325304e+00  3.92326385e-01
 -3.07103562e+00  1.36071074e+00  4.36109394e-01  1.04441750e+00
 -3.41558844e-01  1.75846803e+00 -1.07718611e+00 -2.87782878e-01
 -2.13761973e+00 -4.80394542e-01 -1.26479015e-01 -8.55688155e-01
  2.66913325e-01 -1.27807367e+00 -1.51869309e+00 -6.99451506e-01
  1.34244049e+00 -4.71126288e-01 -1.00992739e+00  4.13806230e-01
 -1.28805816e-01 -1.54420674e-01 -1.00927138e+00 -5.09634195e-03
  2.00863525e-01 -5.53936064e-02  4.51615989e-01  1.70798934e+00
  1.22757696e-01 -8.53896081e-01 -5.37023544e-01 -5.52950919e-01
  1.74112487e+00  1.57185292e+00 -2.62243390e-01  1.56253397e+00
 -4.29354161e-01  2.82819092e-01  1.16709101e+00 -7.86704600e-01
  3.88670444e-01 -5.24343193e-01 -9.33020711e-0

kernel.cu

  mod = SourceModule("""
