In [2]:
from pynq import Overlay

ol = Overlay('/home/xilinx/jupyter_notebooks/xilinx/pynq/overlays/bgemm_my/bgemm.bit')
# ol?

dma_a, dma_b, dma_c = ol.dma_a, ol.dma_b, ol.dma_c

bgemm_ip = ol.bgemm_0

from pynq import allocate
import numpy as np
N = 1024

mat_A = np.random.randint(1<<8, size=(N, N), dtype=np.uint32)
mat_B = np.random.randint(1<<8, size=(N, N), dtype=np.uint32)
mat_C = np.zeros((N, N), dtype=np.uint32)


iB = 16
jB = 16
kB = 16
in_a_buff = allocate(shape=(N//jB, N//kB, iB, kB), dtype=np.uint8, cacheable=False)
in_b_buff = allocate(shape=(N//jB, N//kB, jB, kB), dtype=np.uint8, cacheable=False)
out_buff = allocate(shape=(N//jB, iB, jB), dtype=np.uint32, cacheable=False)

#transfer_B = np.asarray(np.split(np.asarray(np.split(mat_B.T, N//kB, axis=1)), N//jB, axis=1))
mat_B_ = np.asarray(np.split(np.asarray(np.split(np.transpose(mat_B, (1, 0)), N//kB, axis=1)), N//jB, axis=1))

CTRL_REG = 0x00
AP_START = (1<<0) # bit 0
AUTO_RESTART = (1<<7) # bit 7

def run_kernel():
    dma_a.sendchannel.transfer(in_a_buff)
    dma_b.sendchannel.transfer(in_b_buff)
    dma_c.recvchannel.transfer(out_buff)
    bgemm_ip.write(CTRL_REG, (AP_START | AUTO_RESTART))
    dma_a.sendchannel.wait()
    dma_b.sendchannel.wait()
    dma_c.recvchannel.wait()

def matmul_naive():
    for i in range(N//iB):
        print('.', end='')
        in_a_buff[:] = np.asarray([np.split(mat_A[i*iB:(i+1)*iB,:], N//kB, axis=1)] * (N//jB))
        in_b_buff[:] = mat_B_
        run_kernel()
        mat_C[i*iB:(i+1)*iB, :] = np.reshape(np.transpose(out_buff, (1, 0, 2)), (iB, N))

#         print(f"({i}){mat_C[i*iB, 0]}", end=' ')

import time
def timeit(fn):
    start=time.time()
    fn()
    return time.time() - start

fpga_time = timeit(matmul_naive)
print(f'\nFPGA time: {fpga_time}')

ans_C = 0
def wrap_cpu():
    global ans_C
    ans_C = np.matmul(mat_A, mat_B)
cpu_time = timeit(wrap_cpu)
print(f'CPU time: {cpu_time}')

il, jl = np.random.randint(N-13), np.random.randint(N-17)
ir, jr = il+13, jl+17
print(f"Sample: ({il}:{ir}, {jl}:{jr})")
print(f'Validate: {np.array_equal(ans_C[il:ir, jl:jr], mat_C[il:ir, jl:jr])}')
print(f"Speedup: {cpu_time / fpga_time}")
print(f"Score: {min(100, int(cpu_time // fpga_time * 10))}")

................................................................
FPGA time: 14.785401105880737
CPU time: 180.1658923625946
Sample: (218:231, 517:534)
Validate: True
Speedup: 12.185390918541636
Score: 100
