In [None]:
import pycuda.autoinit
import pycuda.driver as cuda
import numpy as np
from pycuda.compiler import SourceModule
TILE_DIM = 16
# Charger le module compilé
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
matmul_kernel = mod.get_function("MatMul")
def test_matmul():
    # Dimensions des matrices
    ARows, ACols = , 128
    BRows, BCols = 128, 64

    # Création des matrices A et B aléatoires
    A = np.random.rand(ARows, ACols).astype(np.float32)
    B = np.random.rand(BRows, BCols).astype(np.float32)

    # Matrice résultat C
    C = np.zeros((ARows, BCols), dtype=np.float32)

    # Allocation mémoire GPU
    A_gpu = cuda.mem_alloc(A.nbytes)
    B_gpu = cuda.mem_alloc(B.nbytes)
    C_gpu = cuda.mem_alloc(C.nbytes)

    # Copie des données vers le GPU
    cuda.memcpy_htod(A_gpu, A)
    cuda.memcpy_htod(B_gpu, B)

    # Dimensions du bloc et de la grille
    block = (TILE_DIM, TILE_DIM, 1)
    grid = ((BCols + TILE_DIM - 1) // TILE_DIM, (ARows + TILE_DIM - 1) // TILE_DIM)

    # Lancer le kernel
    matmul_kernel(
        A_gpu, B_gpu, C_gpu,
        np.int32(ARows), np.int32(ACols), np.int32(BCols),
        block=block, grid=grid
    )

    # Copier le résultat du GPU vers le CPU
    cuda.memcpy_dtoh(C, C_gpu)

    # Validation avec NumPy
    C_reference = np.dot(A, B)

    # Comparaison
    if np.allclose(C, C_reference, atol=1e-5):
        print("Succès : les résultats correspondent.")
    else:
        print("Échec : les résultats ne correspondent pas.")

    print (C)
if __name__ == "__main__":
    test_matmul()


Succès : les résultats correspondent.
[[32.06255  31.75959  32.929203 ... 32.913933 35.47173  31.566277]
 [30.625723 30.626526 29.449003 ... 32.03976  34.82396  30.636587]
 [32.023018 31.007334 29.233303 ... 33.957962 35.235214 29.332048]
 ...
 [30.753458 30.601162 29.704533 ... 32.029663 34.53249  29.389482]
 [31.237335 30.836004 30.722458 ... 33.61748  34.23421  29.546354]
 [32.609024 31.256659 32.494377 ... 33.186035 36.56704  32.060577]]


In [7]:


import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Charger le module compilé
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
transpose = mod.get_function("transpose")

# Dimensions de la matrice
nx, ny = 8, 5
matrix = np.arange(nx * ny, dtype=np.float32).reshape(nx, ny)

# Matrice d'entrée et de sortie
input_matrix = np.array(matrix, dtype=np.float32)
output_matrix = np.zeros_like(input_matrix.T)

# Allocation de mémoire GPU
input_gpu = drv.mem_alloc(input_matrix.nbytes)
output_gpu = drv.mem_alloc(output_matrix.nbytes)

# Copie des données vers le GPU
drv.memcpy_htod(input_gpu, input_matrix)

# Dimensions des blocs et de la grille
block = (8, 8, 1)
grid = ((nx + block[0] - 1) // block[0], (ny + block[1] - 1) // block[1])

# Exécution du kernel
transpose(input_gpu, output_gpu, np.uint32(nx), np.uint32(ny), block=block, grid=grid)

# Copie du résultat vers le CPU
drv.memcpy_dtoh(output_matrix, output_gpu)

# Affichage des résultats
print("Input Matrix:")
print(input_matrix)
print("Transposed Matrix:")
print(output_matrix)



Input Matrix:
[[ 0.  1.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]
 [10. 11. 12. 13. 14.]
 [15. 16. 17. 18. 19.]
 [20. 21. 22. 23. 24.]
 [25. 26. 27. 28. 29.]
 [30. 31. 32. 33. 34.]
 [35. 36. 37. 38. 39.]]
Transposed Matrix:
[[ 0.  8. 16. 24. 32.  0.  0.  0.]
 [ 1.  9. 17. 25. 33.  0.  0.  0.]
 [ 2. 10. 18. 26. 34.  0.  0.  0.]
 [ 3. 11. 19. 27. 35.  0.  0.  0.]
 [ 4. 12. 20. 28. 36.  0.  0.  0.]]


In [1]:
import pycuda.driver as drv
drv.init()


In [None]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

# CUDA kernels as a string
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
update_weights = mod.get_function("update_weights")
update_bias = mod.get_function("update_bias")

# Parameters
n_row = 4
n_col = 3
epsilon = 0.01

# Initialize input data for weights
W = np.random.rand(n_row, n_col).astype(np.float32)
dW = np.random.rand(n_row, n_col).astype(np.float32)
out_weights = np.zeros_like(W)

# Initialize input data for biases
b = np.random.rand(n_row).astype(np.float32)
db = np.random.rand(n_row).astype(np.float32)
out_bias = np.zeros_like(b)

# Allocate device memory
W_gpu = cuda.mem_alloc(W.nbytes)
dW_gpu = cuda.mem_alloc(dW.nbytes)
out_weights_gpu = cuda.mem_alloc(out_weights.nbytes)

b_gpu = cuda.mem_alloc(b.nbytes)
db_gpu = cuda.mem_alloc(db.nbytes)
out_bias_gpu = cuda.mem_alloc(out_bias.nbytes)

# Copy data to device
cuda.memcpy_htod(W_gpu, W)
cuda.memcpy_htod(dW_gpu, dW)
cuda.memcpy_htod(b_gpu, b)
cuda.memcpy_htod(db_gpu, db)

# Grid and block dimensions
block_size = 32
grid_size_weights = (n_row + block_size - 1) // block_size
grid_size_bias = (n_row + block_size - 1) // block_size

# Launch the update_weights kernel
update_weights(W_gpu, dW_gpu, out_weights_gpu, 
               np.float32(epsilon), np.uint32(n_col), np.uint32(n_row), 
               block=(block_size, 1, 1), grid=(grid_size_weights, 1))

# Launch the update_bias kernel
update_bias(b_gpu, db_gpu, out_bias_gpu, 
            np.float32(epsilon), np.uint32(n_row), 
            block=(block_size, 1, 1), grid=(grid_size_bias, 1))

# Copy results back to host
cuda.memcpy_dtoh(out_weights, out_weights_gpu)
cuda.memcpy_dtoh(out_bias, out_bias_gpu)

# Print results
print("Original weights:\n", W)
print("Weight gradients:\n", dW)
print("Updated weights:\n", out_weights)

print("Original biases:\n", b)
print("Bias gradients:\n", db)
print("Updated biases:\n", out_bias)


In [9]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Charger le module compilé
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
add = mod.get_function("add")

# Définir les dimensions des données
n = 10

# Générer des données d'exemple
WX = np.random.rand(n).astype(np.float32)
b = np.random.rand(n).astype(np.float32)
out = np.zeros(n, dtype=np.float32)

# Allouer de la mémoire sur le GPU
WX_gpu = drv.mem_alloc(WX.nbytes)
b_gpu = drv.mem_alloc(b.nbytes)
out_gpu = drv.mem_alloc(out.nbytes)

# Copier les données sur le GPU
drv.memcpy_htod(WX_gpu, WX)
drv.memcpy_htod(b_gpu, b)

# Définir la taille des blocs et des grilles
block_size = 256
grid_size = (n + block_size - 1) // block_size

# Lancer le kernel
add(WX_gpu, b_gpu, out_gpu, np.int32(n), block=(block_size, 1, 1), grid=(grid_size, 1))

# Copier les résultats du GPU vers l'hôte
drv.memcpy_dtoh(out, out_gpu)

# Afficher les résultats
print("WX:", WX)
print("b:", b)
print("out:", out)



WX: [0.44234893 0.9506427  0.816055   0.76536775 0.7407418  0.6851173
 0.17310758 0.22000445 0.38997483 0.09695225]
b: [0.56816417 0.6982709  0.27382904 0.12475282 0.8154408  0.2197792
 0.9319359  0.79871154 0.49813652 0.5260788 ]
out: [1.0105131  1.6489136  1.089884   0.89012057 1.5561826  0.9048965
 1.1050435  1.018716   0.88811135 0.6230311 ]


In [19]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Charger le module compilé
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
sigmoid = mod.get_function("sigmoid")

# Définir les dimensions des données
n = 10

# Générer des données d'exemple
X = np.random.rand(n).astype(np.float32)
out = np.zeros(n, dtype=np.float32)

# Allouer de la mémoire sur le GPU
X_gpu = drv.mem_alloc(X.nbytes)
out_gpu = drv.mem_alloc(out.nbytes)

# Copier les données sur le GPU
drv.memcpy_htod(X_gpu, X)

# Définir la taille des blocs et des grilles
block_size = 256
grid_size = (n + block_size - 1) // block_size

# Lancer le kernel
sigmoid(X_gpu, out_gpu, np.int32(n), block=(block_size, 1, 1), grid=(grid_size, 1))

# Copier les résultats du GPU vers l'hôte
drv.memcpy_dtoh(out, out_gpu)

# Afficher les résultats
print("X:", X)
print("out:", out)

LogicError: cuModuleGetFunction failed: named symbol not found

In [18]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Charger le module compilé
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
exp_scores = mod.get_function("exp_scores")
softmax_div = mod.get_function("softmax_div")
reduce = mod.get_function("reduce")

# Définir les dimensions des données
n = 10

# Générer des données d'exemple
X = np.random.rand(n).astype(np.float32)
exp_scores_out = np.zeros(n, dtype=np.float32)
softmax_out = np.zeros(n, dtype=np.float32)

# Allouer de la mémoire sur le GPU
X_gpu = drv.mem_alloc(X.nbytes)
exp_scores_gpu = drv.mem_alloc(exp_scores_out.nbytes)
softmax_out_gpu = drv.mem_alloc(softmax_out.nbytes)

# Copier les données sur le GPU
drv.memcpy_htod(X_gpu, X)

# Définir la taille des blocs et des grilles
block_size = 256
grid_size = (n + block_size - 1) // block_size

# Lancer le kernel exp_scores
exp_scores(X_gpu, exp_scores_gpu, np.int32(n), block=(block_size, 1, 1), grid=(grid_size, 1))

# Allouer de la mémoire pour la réduction
temp_o = np.zeros(grid_size, dtype=np.float32)
temp_o_gpu = drv.mem_alloc(temp_o.nbytes)
drv.memcpy_dtoh(exp_scores_out,exp_scores_gpu )
# Lancer le kernel reduce
reduce(exp_scores_gpu, temp_o_gpu, np.int32(n), block=(block_size, 1, 1), grid=(grid_size, 1), shared=block_size * exp_scores_out.dtype.itemsize)

# Réduire jusqu'à obtenir un seul résultat
while grid_size > 1:
    n = grid_size
    grid_size = (n + block_size - 1) // block_size
    reduce(temp_o_gpu, temp_o_gpu, np.int32(n), block=(block_size, 1, 1), grid=(grid_size, 1), shared=block_size * temp_o.dtype.itemsize)

# Copier les résultats du GPU vers l'hôte
drv.memcpy_dtoh(temp_o, temp_o_gpu)
sum_exp_scores = temp_o[0]

# Lancer le kernel softmax_div
softmax_div(exp_scores_gpu, np.float32(sum_exp_scores), softmax_out_gpu, np.int32(n), block=(block_size, 1, 1), grid=(grid_size, 1))

# Copier les résultats du GPU vers l'hôte
drv.memcpy_dtoh(softmax_out, softmax_out_gpu)

# Afficher les résultats
print("X:", X)
print("exp_scores:", exp_scores_out)
print("softmax_out:", softmax_out)

LogicError: cuModuleGetFunction failed: named symbol not found

In [2]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Charger le module compilé
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
compute_delta2 = mod.get_function("compute_delta2")

# Définir les dimensions des données
n = 10
num_classes = 5

# Générer des données d'exemple
probs = np.random.rand(n, num_classes).astype(np.float32).flatten()
y_true = np.random.randint(0, num_classes, n).astype(np.int32)
out = np.zeros((n, num_classes), dtype=np.float32).flatten()

# Allouer de la mémoire sur le GPU
probs_gpu = drv.mem_alloc(probs.nbytes)
y_true_gpu = drv.mem_alloc(y_true.nbytes)
out_gpu = drv.mem_alloc(out.nbytes)

# Copier les données sur le GPU
drv.memcpy_htod(probs_gpu, probs)
drv.memcpy_htod(y_true_gpu, y_true)

# Définir la taille des blocs et des grilles
block_size = 256
grid_size = (n + block_size - 1) // block_size

# Lancer le kernel
compute_delta2(probs_gpu, y_true_gpu, out_gpu, np.int32(num_classes), np.int32(n), block=(block_size, 1, 1), grid=(grid_size, 1))

# Copier les résultats du GPU vers l'hôte
drv.memcpy_dtoh(out, out_gpu)

# Afficher les résultats
print("probs:", probs.reshape(n, num_classes))
print("y_true:", y_true)
print("out:", out.reshape(n, num_classes))

probs: [[0.99579984 0.5844601  0.33910036 0.49083814 0.04340772]
 [0.58158123 0.03585429 0.240528   0.6326014  0.11575428]
 [0.49828082 0.6637241  0.93154335 0.36431745 0.38215813]
 [0.375357   0.35834676 0.72663724 0.08556011 0.49493223]
 [0.13543361 0.9085487  0.7289106  0.7970793  0.82219565]
 [0.32560155 0.8896485  0.9166291  0.33301947 0.16918373]
 [0.41824257 0.56029624 0.9962488  0.856241   0.6235782 ]
 [0.05035216 0.5427813  0.21113048 0.30545023 0.12998605]
 [0.0427946  0.06538843 0.15667163 0.78847784 0.96960974]
 [0.72211444 0.12521063 0.03835817 0.12172013 0.04630733]]
y_true: [3 2 2 3 3 4 2 3 0 0]
out: [[ 0.99579984  0.5844601   0.33910036 -0.50916183  0.04340772]
 [ 0.58158123  0.03585429 -0.759472    0.6326014   0.11575428]
 [ 0.49828082  0.6637241  -0.06845665  0.36431745  0.38215813]
 [ 0.375357    0.35834676  0.72663724 -0.9144399   0.49493223]
 [ 0.13543361  0.9085487   0.7289106  -0.20292068  0.82219565]
 [ 0.32560155  0.8896485   0.9166291   0.33301947 -0.83081627]

In [17]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Charger le module compilé
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
transpose = mod.get_function("transpose")

# Définir les dimensions des données
nx = 4
ny = 3

# Générer des données d'exemple
in_data = np.random.rand(nx, ny).astype(np.float32).flatten()
out_data = np.zeros((ny, nx), dtype=np.float32).flatten()

# Allouer de la mémoire sur le GPU
in_gpu = drv.mem_alloc(in_data.nbytes)
out_gpu = drv.mem_alloc(out_data.nbytes)

# Copier les données sur le GPU
drv.memcpy_htod(in_gpu, in_data)

# Définir la taille des blocs et des grilles
block_size = (16, 16, 1)
grid_size = ((nx + block_size[0] - 1) // block_size[0], (ny + block_size[1] - 1) // block_size[1])

# Lancer le kernel
transpose(in_gpu, out_gpu, np.int32(nx), np.int32(ny), block=block_size, grid=grid_size)

# Copier les résultats du GPU vers l'hôte
drv.memcpy_dtoh(out_data, out_gpu)

# Afficher les résultats
print("Input matrix:")
print(in_data.reshape(nx, ny))
print("Transposed matrix:")
print(out_data.reshape(ny, nx))



  mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])


LogicError: cuModuleGetFunction failed: named symbol not found

In [None]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Charger le module compilé
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
compute_db2 = mod.get_function("compute_db")

# Définir les dimensions des données
n_row = 10
n_col = 5

# Générer des données d'exemple
delta2 = np.random.rand(n_row, n_col).astype(np.float32).flatten()
out = np.zeros(n_row, dtype=np.float32)

# Allouer de la mémoire sur le GPU
delta2_gpu = drv.mem_alloc(delta2.nbytes)
out_gpu = drv.mem_alloc(out.nbytes)

# Copier les données sur le GPU
drv.memcpy_htod(delta2_gpu, delta2)

# Définir la taille des blocs et des grilles
block_size = 256
grid_size = (n_row + block_size - 1) // block_size

# Lancer le kernel
compute_db2(delta2_gpu, out_gpu, np.int32(n_col), np.int32(n_row), block=(block_size, 1, 1), grid=(grid_size, 1))

# Copier les résultats du GPU vers l'hôte
drv.memcpy_dtoh(out, out_gpu)

# Afficher les résultats
print("delta2:", delta2.reshape(n_row, n_col))
print("out:", out)

delta2: [[9.6142030e-01 3.0700335e-01 8.9189118e-01 4.1769958e-01 8.9025557e-01]
 [4.5738357e-01 4.4079411e-01 6.6005242e-01 3.0428696e-01 7.6815319e-01]
 [3.7232322e-01 8.7078130e-01 3.5921535e-01 8.0106628e-01 6.6981053e-01]
 [3.1943254e-02 9.3578267e-01 6.9587469e-01 7.1866101e-01 4.1000751e-01]
 [1.0383261e-01 5.5926764e-01 4.8642412e-01 5.7543546e-01 7.5620651e-01]
 [3.1661743e-01 6.2468380e-01 8.2727581e-01 8.9099240e-01 8.4367818e-01]
 [4.6844202e-01 9.9610591e-01 4.8065805e-01 8.5665178e-01 6.9197929e-01]
 [7.5416809e-01 5.6775284e-01 6.2161505e-01 6.1623162e-01 5.4685020e-01]
 [5.8293217e-01 4.1602081e-01 3.6674398e-01 6.7336714e-01 1.7571448e-04]
 [8.1068653e-01 7.4544173e-01 6.8225920e-01 9.1511422e-01 6.9846046e-01]]
out: [3.4682698 2.6306703 3.0731966 2.7922692 2.4811664 3.5032477 3.4938369
 3.1066177 2.0392396 3.851962 ]


In [9]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Charger le module compilé
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
compute_delta1 = mod.get_function("compute_delta1")

# Définir les dimensions des données
n_row = 10
n_col = 5

# Générer des données d'exemple
delta1 = np.random.rand(n_row, n_col).astype(np.float32).flatten()
z1 = np.random.rand(n_row, n_col).astype(np.float32).flatten()
out = np.zeros((n_row, n_col), dtype=np.float32).flatten()

# Allouer de la mémoire sur le GPU
delta1_gpu = drv.mem_alloc(delta1.nbytes)
z1_gpu = drv.mem_alloc(z1.nbytes)
out_gpu = drv.mem_alloc(out.nbytes)

# Copier les données sur le GPU
drv.memcpy_htod(delta1_gpu, delta1)
drv.memcpy_htod(z1_gpu, z1)

# Définir la taille des blocs et des grilles
block_size = 256
grid_size = (n_row + block_size - 1) // block_size

# Lancer le kernel
compute_delta1(delta1_gpu, z1_gpu, out_gpu, np.int32(n_col), np.int32(n_row), block=(block_size, 1, 1), grid=(grid_size, 1))

# Copier les résultats du GPU vers l'hôte
drv.memcpy_dtoh(out, out_gpu)

# Afficher les résultats
print("delta1:", delta1.reshape(n_row, n_col))
print("z1:", z1.reshape(n_row, n_col))
print("out:", out.reshape(n_row, n_col))

delta1: [[0.7229435  0.7954493  0.8658877  0.4679509  0.11311023]
 [0.06317025 0.54869056 0.38638923 0.02145014 0.17736492]
 [0.0269344  0.2003111  0.70101446 0.8266686  0.8926484 ]
 [0.1969392  0.7548375  0.36957484 0.50805146 0.82041967]
 [0.36999398 0.87921983 0.83923465 0.8131322  0.82500696]
 [0.789612   0.7683042  0.37008485 0.858343   0.58363235]
 [0.29557848 0.6855353  0.08448109 0.17226145 0.07810096]
 [0.75133127 0.92027587 0.8284486  0.53703386 0.27873248]
 [0.24790819 0.202245   0.43571332 0.567349   0.42123634]
 [0.50979376 0.99081457 0.17006712 0.8927718  0.26770726]]
z1: [[0.18454328 0.08837061 0.74219215 0.1754119  0.11132077]
 [0.12701088 0.3263054  0.83915436 0.7290275  0.12473806]
 [0.70479375 0.6269907  0.59244823 0.99271196 0.83478093]
 [0.7525487  0.3677572  0.27071783 0.01353576 0.71301556]
 [0.4139638  0.31022763 0.47929388 0.556638   0.7373239 ]
 [0.6445809  0.8070839  0.17222428 0.19873093 0.25994596]
 [0.82396036 0.4758171  0.08085154 0.6257178  0.77216405]
 

In [11]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Charger le module compilé
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
update_weights = mod.get_function("update_weights")

# Définir les dimensions des données
n_row = 10
n_col = 5
epsilon = 0.01

# Générer des données d'exemple
W = np.random.rand(n_row, n_col).astype(np.float32).flatten()
dW = np.random.rand(n_row, n_col).astype(np.float32).flatten()
out = np.zeros((n_row, n_col), dtype=np.float32).flatten()

# Allouer de la mémoire sur le GPU
W_gpu = drv.mem_alloc(W.nbytes)
dW_gpu = drv.mem_alloc(dW.nbytes)
out_gpu = drv.mem_alloc(out.nbytes)

# Copier les données sur le GPU
drv.memcpy_htod(W_gpu, W)
drv.memcpy_htod(dW_gpu, dW)

# Définir la taille des blocs et des grilles
block_size = 256
grid_size = (n_row + block_size - 1) // block_size

# Lancer le kernel
update_weights(W_gpu, dW_gpu, out_gpu, np.float32(epsilon), np.int32(n_col), np.int32(n_row), block=(block_size, 1, 1), grid=(grid_size, 1))

# Copier les résultats du GPU vers l'hôte
drv.memcpy_dtoh(out, out_gpu)

# Afficher les résultats
print("W:", W.reshape(n_row, n_col))
print("dW:", dW.reshape(n_row, n_col))
print("out:", out.reshape(n_row, n_col))

W: [[0.33461428 0.3195711  0.318646   0.6182187  0.98775953]
 [0.9090064  0.31421646 0.49155873 0.07706822 0.7244915 ]
 [0.6314039  0.32546932 0.8983266  0.94506174 0.27454415]
 [0.9591644  0.72356695 0.8878879  0.83218116 0.61372954]
 [0.7061721  0.7718666  0.23344158 0.030182   0.3022792 ]
 [0.2440078  0.03983122 0.02183884 0.8521558  0.6460792 ]
 [0.27811128 0.06740171 0.47656724 0.90238565 0.5000209 ]
 [0.89489204 0.32485354 0.7970515  0.89360714 0.41858983]
 [0.70800966 0.7913499  0.22945935 0.9321512  0.19601052]
 [0.8166432  0.40817446 0.10786799 0.65642625 0.23842801]]
dW: [[0.706791   0.4288014  0.4621686  0.5487347  0.76296836]
 [0.6571633  0.55811816 0.5101929  0.04239516 0.15132119]
 [0.45387363 0.8848482  0.7748484  0.66118443 0.15155412]
 [0.6620164  0.89205045 0.575946   0.32561484 0.63100654]
 [0.59820104 0.7982164  0.31105202 0.09974479 0.535854  ]
 [0.48336852 0.54772586 0.41365483 0.41299403 0.73340094]
 [0.37536603 0.1820588  0.9282085  0.6345236  0.31283966]
 [0.15

In [None]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Charger le module compilé
mod = SourceModule(open("kernel.cu").read(), options=['-std=c++11'])
update_bias = mod.get_function("update_bias")

# Définir les dimensions des données
n = 10
epsilon = 0.01

# Générer des données d'exemple
b = np.random.rand(n).astype(np.float32)
db = np.random.rand(n).astype(np.float32)
out = np.zeros(n, dtype=np.float32)

# Allouer de la mémoire sur le GPU
b_gpu = drv.mem_alloc(b.nbytes)
db_gpu = drv.mem_alloc(db.nbytes)
out_gpu = drv.mem_alloc(out.nbytes)

# Copier les données sur le GPU
drv.memcpy_htod(b_gpu, b)
drv.memcpy_htod(db_gpu, db)

# Définir la taille des blocs et des grilles
block_size = 256
grid_size = (n + block_size - 1) // block_size

# Lancer le kernel
update_bias(b_gpu, db_gpu, out_gpu, np.float32(epsilon), np.int32(n), block=(block_size, 1, 1), grid=(grid_size, 1))

# Copier les résultats du GPU vers l'hôte
drv.memcpy_dtoh(out, out_gpu)

# Afficher les résultats
print("b:", b)
print("db:", db)
print("out:", out)

In [14]:
import torch
torch.cuda.empty_cache()

In [16]:
import pycuda.autoinit
import pycuda.driver as drv

# Vérifier les ressources disponibles sur le GPU
device = drv.Device(0)
print("Nom du GPU:", device.name())
print("Nombre de multiprocesseurs:", device.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT))
print("Nombre de threads par bloc:", device.get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK))
print("Mémoire partagée par bloc:", device.get_attribute(drv.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK))


Nom du GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU
Nombre de multiprocesseurs: 20
Nombre de threads par bloc: 1024
Mémoire partagée par bloc: 49152
