In [1]:
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
N_size = 20  # dimension of the square matrices
SEED = 0     # fixed seed so Restart & Run All reproduces the same inputs

# Initialise A and B with uniform random floats in the range 0-10.
# The legacy global seeding call is used so this also runs on older numpy.
np.random.seed(SEED)
in_A = np.random.uniform(0, 10, size=(N_size, N_size))
in_B = np.random.uniform(0, 10, size=(N_size, N_size))

## **PS (Processing System) Execution**

In [3]:
### nested loop matrix multiplication
def matrix_multiply(in_A, in_B):
    """Multiply two matrices with the textbook triple nested loop.

    Args:
        in_A: left operand, an (n x m) matrix given as a sequence of rows
            (nested lists or a 2-D numpy array).
        in_B: right operand, an (m x p) matrix in the same format.

    Returns:
        The (n x p) product as a list of lists.

    Raises:
        ValueError: if a row of ``in_A`` does not have exactly as many
            entries as ``in_B`` has rows, or if the rows of ``in_B`` do not
            all have the same length.
    """
    n_rows = len(in_A)
    n_inner = len(in_B)
    n_cols = len(in_B[0]) if n_inner else 0

    # Validate shapes up front instead of silently computing a truncated
    # product (or failing with an IndexError deep inside the loops).
    if any(len(row) != n_inner for row in in_A):
        raise ValueError(
            "shape mismatch: every row of in_A must have len(in_B) entries"
        )
    if any(len(row) != n_cols for row in in_B):
        raise ValueError("in_B rows must all have the same length")

    out_C = [[0 for _ in range(n_cols)] for _ in range(n_rows)]

    for i in range(n_rows):
        for j in range(n_cols):
            for k in range(n_inner):
                out_C[i][j] += in_A[i][k] * in_B[k][j]
    return out_C

In [4]:
#### PS matrix multiplication: pure-Python nested loops
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() follows the wall clock and can jump.
ps_start = time.perf_counter()
y_sw_1 = matrix_multiply(in_A, in_B)
ps_end = time.perf_counter()

In [5]:
#### Elapsed PS time in seconds for the nested-loop implementation
ps_time_1 = ps_end - ps_start
ps_time_1

0.09128499031066895

In [6]:
#### PS matrix multiplication: numpy matmul
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() follows the wall clock and can jump.
ps_start = time.perf_counter()
y_sw_2 = np.matmul(in_A, in_B)
ps_end = time.perf_counter()

In [7]:
#### Elapsed PS time in seconds for the numpy implementation
ps_time_2 = ps_end - ps_start
ps_time_2

0.010701179504394531

## **PL (Programmable Logic) Floating-Point Execution**

In [8]:
from pynq import Overlay, allocate

# Load the floating-point design; constructing the Overlay downloads the bitstream.
ol = Overlay("design_20_fp.bit")

# Handles to the AXI DMA block and to its send and receive channels.
dma = ol.axi_dma_0
dma_send, dma_recv = dma.sendchannel, dma.recvchannel

In [9]:
# IPython help syntax (not plain Python): displays the documentation of `ol`.
# Interactive inspection only; no later cell uses its output.
ol?

In [10]:
### Pack both operands into one 1-D vector: all of A first, then all of B
in_A_flat = in_A.flatten()
in_B_flat = in_B.flatten()
in_vec = np.concatenate([in_A_flat, in_B_flat])

In [11]:
### Create the input and output pynq buffers for the DMA transfers
# The input buffer holds A followed by B (2*N*N values); the output holds N*N.
input_buffer = allocate(shape=(2 * N_size * N_size,), dtype=np.float32)
output_buffer = allocate(shape=(N_size * N_size,), dtype=np.float32)
# Reuse the already-concatenated in_vec rather than rebuilding it; copyto
# casts its values to the buffer's float32 dtype.
np.copyto(input_buffer, in_vec)

In [12]:
### Time the PL matrix multiplication: DMA the inputs out and the result back
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() follows the wall clock and can jump.
pl_start = time.perf_counter()
# Start both transfers before waiting on either of them.
dma_send.transfer(input_buffer)
dma_recv.transfer(output_buffer)
dma_send.wait()
dma_recv.wait()
pl_end = time.perf_counter()

In [13]:
#### Elapsed PL time in seconds (floating-point design)
pl_time = pl_end - pl_start
pl_time

0.006264209747314453

In [14]:
### Acceleration factor: nested-loop PS time divided by PL time
AF = ps_time_1 / pl_time
AF

14.572467077719418

In [15]:
### Acceleration factor: numpy PS time divided by PL time
AF = ps_time_2 / pl_time
AF

1.7083047880033493

In [16]:
### RMSE between the PS reference (numpy matmul) and the PL result
y_sw_flat = y_sw_2.flatten()
# Copy the DMA output buffer into a separate float32 array before the math.
y_hw = np.zeros(N_size * N_size, dtype=np.float32)
np.copyto(y_hw, output_buffer)
rmse = np.sqrt(np.square(np.abs(y_sw_flat - y_hw)).sum() / (N_size * N_size))
rmse

3.679584305750965e-05

## **PL (Programmable Logic) Fixed-Point Execution**

In [26]:
from pynq import Overlay, allocate

# Load the fixed-point design; constructing the Overlay downloads the bitstream.
ol = Overlay("design_20_fx.bit")

# Handles to the AXI DMA block and to its send and receive channels.
dma = ol.axi_dma_0
dma_send, dma_recv = dma.sendchannel, dma.recvchannel

In [27]:
# IPython help syntax (not plain Python): displays the documentation of `ol`.
# Interactive inspection only; no later cell uses its output.
ol?

In [28]:
### Pack both operands into one 1-D vector: all of A first, then all of B
in_A_flat = in_A.flatten()
in_B_flat = in_B.flatten()
in_vec = np.concatenate([in_A_flat, in_B_flat])

In [29]:
### Create the input and output pynq buffers for the DMA transfers
# The input buffer holds A followed by B (2*N*N values); the output holds N*N.
# NOTE(review): the buffers stay float32 for the fixed-point design as well;
# presumably the design converts to fixed point internally - confirm.
input_buffer = allocate(shape=(2 * N_size * N_size,), dtype=np.float32)
output_buffer = allocate(shape=(N_size * N_size,), dtype=np.float32)
# Reuse the already-concatenated in_vec rather than rebuilding it; copyto
# casts its values to the buffer's float32 dtype.
np.copyto(input_buffer, in_vec)

In [30]:
### Time the PL matrix multiplication: DMA the inputs out and the result back
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() follows the wall clock and can jump.
pl_start = time.perf_counter()
# Start both transfers before waiting on either of them.
dma_send.transfer(input_buffer)
dma_recv.transfer(output_buffer)
dma_send.wait()
dma_recv.wait()
pl_end = time.perf_counter()

In [31]:
#### Elapsed PL time in seconds (fixed-point design)
pl_time = pl_end - pl_start
pl_time

0.0036649703979492188

In [32]:
### Acceleration factor: nested-loop PS time divided by PL time
AF = ps_time_1 / pl_time
AF

24.90742909185532

In [33]:
### Acceleration factor: numpy PS time divided by PL time
AF = ps_time_2 / pl_time
AF

2.9198542805100183

In [34]:
### RMSE between the PS reference (numpy matmul) and the PL result
y_sw_flat = y_sw_2.flatten()
# Copy the DMA output buffer into a separate float32 array before the math.
y_hw = np.zeros(N_size * N_size, dtype=np.float32)
np.copyto(y_hw, output_buffer)
rmse = np.sqrt(np.square(np.abs(y_sw_flat - y_hw)).sum() / (N_size * N_size))
rmse

0.0003844368543607791