# Produto Escalar em CUDA

**Antes de rodar:** vá em `Runtime → Change runtime type → T4 GPU`

In [None]:
# Notebook shell command: install the PyCUDA package; -q keeps pip's output quiet.
!pip install pycuda -q

In [None]:
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

# Threads per block used for every launch. The in-kernel reduction halves the
# active range on each step, so this value must be a power of two.
THREADS = 256

# CUDA C source of the reduction kernel:
#   1. each thread stores A[i] * B[i] (or 0 when i is past N) in shared memory;
#   2. the block folds those values in halves until sdata[0] holds the block sum;
#   3. thread 0 writes that sum to partial[blockIdx.x].
# The shared buffer is declared `extern`, so its size in bytes has to be
# supplied at launch time.
_DOT_KERNEL_SOURCE = """
__global__ void dot_kernel(float *A, float *B, float *partial, int N) {
    extern __shared__ float sdata[];

    int tid = threadIdx.x;
    int i   = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < N) ? A[i] * B[i] : 0.0f;
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
    }

    if (tid == 0) partial[blockIdx.x] = sdata[0];
}
"""

# Compile the source and keep a callable handle to the kernel.
mod = SourceModule(_DOT_KERNEL_SOURCE)
dot_kernel = mod.get_function('dot_kernel')

print('Kernel compilado com sucesso!')

In [None]:
def dot_gpu(A, B):
    """Compute the dot product of two 1-D vectors on the GPU.

    Both inputs are converted to contiguous float32 arrays. Every CUDA block
    reduces up to THREADS element-wise products into one partial sum, and the
    per-block partial sums are then added together on the host.

    Args:
        A: First vector (a 1-D sequence convertible to float32).
        B: Second vector, with the same number of elements as ``A``.

    Returns:
        The dot product as a Python float; 0.0 when the vectors are empty.

    Raises:
        ValueError: If an input is not one-dimensional or the lengths differ.
    """
    A = np.ascontiguousarray(A, dtype=np.float32)
    B = np.ascontiguousarray(B, dtype=np.float32)

    if A.ndim != 1 or B.ndim != 1:
        raise ValueError(
            f'dot_gpu expects 1-D vectors, got shapes {A.shape} and {B.shape}'
        )
    # The kernel reads B[i] for every i < N, so a shorter B would be read out
    # of bounds on the device and a longer B would be silently truncated.
    if A.size != B.size:
        raise ValueError(
            f'dot_gpu expects vectors of equal length, got {A.size} and {B.size}'
        )

    N = A.size
    if N == 0:
        # A grid with zero blocks is not a valid launch configuration; the dot
        # product of empty vectors is 0 by definition.
        return 0.0

    # Ceiling division: enough blocks to cover all N elements.
    blocks = (N + THREADS - 1) // THREADS

    partial = np.zeros(blocks, dtype=np.float32)

    dot_kernel(
        drv.In(A), drv.In(B), drv.Out(partial), np.int32(N),
        block=(THREADS, 1, 1),
        grid=(blocks, 1, 1),
        # One float32 slot of dynamic shared memory per thread.
        shared=THREADS * partial.itemsize,
    )

    # Accumulate the per-block sums in float64 to limit rounding error when
    # there are many blocks.
    return float(partial.sum(dtype=np.float64))

def dot_cpu(A, B):
    """Reference dot product computed on the host with NumPy.

    Args:
        A: First vector.
        B: Second vector.

    Returns:
        The result of ``np.dot(A, B)`` converted to a Python float.
    """
    reference = np.dot(A, B)
    return float(reference)

def run_test(label, A, B, expected):
    """Run one dot-product case on GPU and CPU and print a short report.

    The OK/ERRO verdict compares the GPU result against the CPU result using
    an absolute tolerance of 1e-3. ``expected`` is only printed for reference;
    it does not take part in the verdict.

    Args:
        label: Title shown in the report header.
        A: First input vector.
        B: Second input vector.
        expected: Hand-computed dot product, displayed alongside the results.
    """
    # GPU first, then the CPU reference, then their absolute difference.
    gpu, cpu = dot_gpu(A, B), dot_cpu(A, B)
    diff = abs(gpu - cpu)

    print(f'\n--- {label} ---')
    print(f'Resultado GPU : {gpu:.4f}')
    print(f'Resultado CPU : {cpu:.4f}')
    print(f'Esperado      : {expected:.4f}')
    print(f'Diferenca     : {diff:.2e} -> {"OK" if diff < 1e-3 else "ERRO"}')

In [None]:
print('Produto Escalar em CUDA')

# Each case: (label, vector A, vector B, hand-computed dot product).
for case_label, vec_a, vec_b, hand_result in (
    ('Exemplo 1', [1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], 70.0),
    ('Exemplo 2', [0.5, 1.5, 2.5], [2.0, 3.0, 4.0], 15.5),
):
    run_test(case_label, vec_a, vec_b, hand_result)