# 一、初始化
## 載入 PYNQ 庫

In [1]:
from pynq import Overlay, GPIO, Register, Xlnk, PL
import numpy as np
import time

## 配置 FPGA

In [2]:
#overlay = Overlay('MLP4MNIST.bit')
overlay = Overlay('MLP.bit')

## 定義 ResetPin & CtrlReg

In [3]:
RESET_VALUE = 0
NRESET_VALUE = 1

MYBNN_AP_CTRL_OFF = 0x00
MYBNN_AP_CTRL_START_IDX = 0
MYBNN_AP_CTRL_DONE_IDX  = 1
MYBNN_AP_CTRL_IDLE_IDX  = 2
MYBNN_AP_CTRL_READY_IDX = 3
MYBNN_COEFF_OFFS  = [0x1c, 0x28, 0x34, 0x40, 0x4c, 0x58, 0x64, 0x70, 0x7c]
MYBNN_LENGTH_OFF  = 0x10

overlay.__resetPin = GPIO(GPIO.get_gpio_pin(0), "out")
overlay.__ap_ctrl = Register(overlay.BlackBoxJam.mmio.base_addr, 32)
overlay.xlnk = Xlnk()

# 二、使用 ARM-FPGA 異構運算
## 處理 MNIST 標籤

In [4]:
labels = []
with open("/home/xilinx/jupyter_notebooks/MLP4MNIST_Demo/t10k-labels-idx1-ubyte","rb") as ubyte_labels:
    magicNum = int.from_bytes(ubyte_labels.read(4), byteorder="big")
    countLbl = int.from_bytes(ubyte_labels.read(4), byteorder="big")
    for idx in range(countLbl):
        labels.append(int.from_bytes(ubyte_labels.read(1), byteorder="big"))
    ubyte_labels.close()

## 處理 MNIST 圖片

In [5]:
numpic = 10000
imgArr = []
X_train = []
for i in range(28):
    X_train.append([])
    for j in range(28):
        X_train[i].append([])
        
with open("/home/xilinx/jupyter_notebooks/MLP4MNIST_Demo/t10k-images-idx3-ubyte","rb") as img_file:
    img_file.read(16)
    for jdx in range(0,numpic):
    
        for i in range(28):
            for j in range(28):
                X_train[i][j] = int.from_bytes(img_file.read(1),byteorder='big')
                
        imgVet = []
        for i in range(28):
            for j in range(28):
                if (X_train[i][j]/255)*(1-(-1))+(-1) >= 0:
                    imgVet.append(1)
                else:
                    imgVet.append(0)
        for i in range(48):
            imgVet.append(0)
                
        for j in range(13):
            imgNum = ""
            for i in range(64):
                if imgVet[64*j+i] == 1:
                    imgNum += "1"
                else:
                    imgNum += "0"
            imgArr.append(int(imgNum[::-1],2))
            
    img_file.close()

In [40]:
signal = imgArr
coeffs = [1, 0, 0, 0, 0, 0, 0, 0, 0]
print(len(signal))

130000


## 放開 Reset 

In [41]:
overlay.__resetPin.write(NRESET_VALUE)

## 載入參數

In [42]:
for (offset, coeff) in zip(MYBNN_COEFF_OFFS, coeffs):
    overlay.BlackBoxJam.write(offset, coeff)
    
overlay.BlackBoxJam.write(MYBNN_LENGTH_OFF, numpic)

## 申請 DMA 記憶體空間

In [43]:
cmabuf_src = overlay.xlnk.cma_array([len(signal)], np.uint64)
cmabuf_dest = overlay.xlnk.cma_array(numpic+1, np.uint64)

## 準備輸入資料

In [44]:
for i in range(len(signal)):
    cmabuf_src[i] = signal[i]

## 初始化 DMA

In [45]:
overlay.hlsDmaEngine.sendchannel.transfer(cmabuf_src)
overlay.hlsDmaEngine.recvchannel.transfer(cmabuf_dest)

## 啟動計算核心並等待 DMA 中斷

In [46]:
overlay.__ap_ctrl[MYBNN_AP_CTRL_START_IDX] = 1

t0 = time.clock()

overlay.hlsDmaEngine.sendchannel.wait()
overlay.hlsDmaEngine.recvchannel.wait()

overlay.__ap_ctrl[MYBNN_AP_CTRL_START_IDX] = 0

t1 = time.clock()

## 鎖住 Reset 

In [47]:
overlay.__resetPin.write(RESET_VALUE)

## 取回輸出資料

In [48]:
output = cmabuf_dest.tolist()

## 釋放空間

In [49]:
cmabuf_dest.freebuffer()
cmabuf_src.freebuffer()

# 三、驗證結果

In [50]:
import math
testnum = 0
out = 0

for i in range(0,numpic):
    if(output[i] == 0):
        out = 0
    else:
        out = math.log(output[i],2)
    if(labels[i%10000] != out):
        testnum+=1
        
size = ((64*len(signal))/(8*1024*1024))
print("Accuracy of MNIST:",(1-testnum/numpic)*100,"%")
print("Latency:",(t1-t0)*math.pow(10, 3),"ms")
print("Second per image:",(t1-t0)*math.pow(10, 6)/numpic, "us")
print("FPS:",1/(t1-t0)*numpic)
print("Throughput:",size*(1/(t1-t0))*2, "MB/s")
print("Data size:",size,"MB")

Accuracy of MNIST: 98.42 %
Latency: 99.28899999999885 ms
Second per image: 9.928899999999885 us
FPS: 100716.09140992572
Throughput: 19.978472722305824 MB/s
Data size: 0.9918212890625 MB
