In [1]:
import sys
import os

sys.path.append(os.path.abspath("../common"))

import math
import time
import numpy as np
from PIL import Image
from matplotlib import pyplot
import cv2
from datetime import datetime
import ctypes

import pynq
import dac_sdc
from IPython.display import display
from pynq import Clocks

from concurrent.futures import ThreadPoolExecutor

# Team name handed to the dac_sdc helper package.
team_name = 'Seuer_int'
# Team handle; later cells call team.get_bitstream_path() and team.run() on it.
team = dac_sdc.Team(team_name)

In [2]:
# Load the overlay from the bitstream path reported by the team helper.
overlay = pynq.Overlay(team.get_bitstream_path())
# Native helper library; load_image_resize() calls its load_image() entry point.
resize_1b = ctypes.cdll.LoadLibrary("./resize_1b.so")
# Handles to the two overlay IPs used by the inference functions below:
# the AXI DMA engine and the accelerator's control interface.
dma = overlay.axi_dma_0
nn_ctrl = overlay.ultra_net_0
# Request 300 MHz on fclk0 and print the value read back
# (the recorded output of this cell shows 299.997).
Clocks.fclk0_mhz = 300
print(Clocks.fclk0_mhz)
print('got nn accelerator!')

299.997
got nn accelerator!


In [4]:
def load_image_resize(rgb_imgs, buff, bs, num_threads=4):
    """Copy up to ``bs`` images into ``buff`` via ``resize_1b.load_image``.

    For every index ``i`` that is processed, the native routine is called with
    a pointer to ``rgb_imgs[i]`` and a pointer to ``buff[i]``. The work is split
    into contiguous index ranges that run on a thread pool.

    Args:
        rgb_imgs: Sequence of NumPy arrays (anything exposing ``.ctypes``).
            Presumably raw camera frames that the native code resizes --
            confirm against the ``resize_1b`` sources.
        buff: Indexable destination whose items expose ``.ctypes`` (the
            callers in this notebook pass a view of a DMA input buffer).
        bs: Batch size, i.e. the maximum number of images to load.
        num_threads: Upper bound on the number of worker threads.

    Raises:
        Whatever a worker raised; exceptions are re-raised by
        ``future.result()``.

    Previously the function ignored ``bs`` and always touched indices 0..99
    (4 chunks of 25), so a short or empty batch raised ``IndexError``. The
    range is now clamped to the images actually supplied. With 100 images and
    the default 4 threads the chunking is exactly the same as before.
    """
    # Only process slots that have a source image.
    count = min(bs, len(rgb_imgs))
    if count <= 0:
        return

    workers = max(1, min(num_threads, count))
    # Ceiling division so the last chunk absorbs any remainder.
    chunk_size = -(-count // workers)

    def process_chunk(start, end):
        for i in range(start, end):
            imgc = rgb_imgs[i].ctypes.data_as(ctypes.c_char_p)
            dataptr = buff[i].ctypes.data_as(ctypes.c_char_p)
            resize_1b.load_image(imgc, dataptr)

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [
            executor.submit(process_chunk, start, min(start + chunk_size, count))
            for start in range(0, count, chunk_size)
        ]
        # Wait for every chunk and surface any worker exception.
        for future in futures:
            future.result()

In [5]:
# Batching: images per DMA transfer and the number of transfers the pipeline
# expects. The literal 1000 presumably is the total image count -- TODO confirm.
BATCH_SIZE = 100
BATCH_NUM = 1000 // BATCH_SIZE

# Frame geometry: raw input size, network input size and output grid size.
IMAGE_RAW_ROW, IMAGE_RAW_COL = 360, 640
IMAGE_ROW, IMAGE_COL = 160, 320
GRID_ROW, GRID_COL = 10, 20

# Ratio between raw and network-input resolution along each axis.
X_SCALE = IMAGE_RAW_COL / IMAGE_COL
Y_SCALE = IMAGE_RAW_ROW / IMAGE_ROW

# Two input and two output buffers so one pair can be filled/decoded while the
# other is in flight (see net_pingpong).
in_buffers = [
    pynq.allocate(shape=(BATCH_SIZE, IMAGE_ROW, IMAGE_COL, 3), dtype=np.uint8)
    for _ in range(2)
]
in_buffer0, in_buffer1 = in_buffers

# NOTE(review): the meaning of the trailing 6 x 6 axes is not visible here.
out_buffers = [
    pynq.allocate(shape=(BATCH_SIZE, GRID_ROW, GRID_COL, 6, 6), dtype=np.int32)
    for _ in range(2)
]
out_buffer0, out_buffer1 = out_buffers

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` computed with NumPy.

    Works on scalars and, element-wise, on NumPy arrays.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def yolo(out_buffer, batch_n, _, result):
    """Decode one batch of accelerator output through the native ``cfuns.yolo``.

    A ``(batch_n, 4)`` int32 scratch array is allocated, the native routine is
    called with pointers to ``out_buffer`` and to that array, and the array's
    rows are then appended to ``result`` as Python lists.

    Args:
        out_buffer: NumPy-compatible array holding the raw network output.
        batch_n: Number of rows to produce.
        _: Ignored; kept so existing four-argument calls keep working.
        result: List that is extended in place with ``batch_n`` 4-element rows
            (presumably one bounding box per image -- confirm against the
            native implementation).

    NOTE(review): ``cfuns`` is not defined in any cell visible in this
    notebook (only ``resize_1b`` is loaded); make sure it is bound before this
    function is called.
    """
    boxes = np.empty((batch_n, 4), dtype=np.int32)
    raw_ptr = out_buffer.ctypes.data_as(ctypes.c_char_p)
    boxes_ptr = boxes.ctypes.data_as(ctypes.c_char_p)
    cfuns.yolo(raw_ptr, batch_n, boxes_ptr)
    result.extend(boxes.tolist())
        
def net_signle_batch(rgb_imgs, result):
    """Run one batch through the accelerator without ping-pong overlap.

    Copies the images into the first input buffer, configures and starts the
    accelerator, performs one blocking send/receive DMA round trip, and hands
    the first output buffer to ``yolo``.

    Args:
        rgb_imgs: Iterable of ``(path, image)`` pairs; only the image is used.
            Each image is assigned directly into a buffer row, so it must
            already be broadcastable to that row's shape (no resizing here).
        result: List that ``yolo`` extends with the decoded rows.

    The original body referenced ``in_buffer`` / ``out_buffer``, which no
    visible cell defines, so any call raised ``NameError``. They are now bound
    to slot 0 of the module-level ``in_buffers`` / ``out_buffers`` pairs.
    """
    in_buffer = in_buffers[0]
    out_buffer = out_buffers[0]

    print("Loading image into buffer for DMA transfer")
    for i, (_, img) in enumerate(rgb_imgs):
        in_buffer[i, :] = img
    print("Loading image successfully completed")

    print("ACC Starting")
    # NOTE(review): register meanings are not visible in this notebook;
    # presumably 0x0 is a control/start register and 0x10 the batch count.
    nn_ctrl.write(0x0, 0)
    nn_ctrl.write(0x10, in_buffer.shape[0])
    nn_ctrl.write(0x0, 1)
    # Start both directions before waiting on either.
    dma.sendchannel.transfer(in_buffer)
    dma.recvchannel.transfer(out_buffer)
    dma.sendchannel.wait()
    dma.recvchannel.wait()
    print("ACC Ending")

    # The third argument is ignored by yolo(); kept as in the original call.
    yolo(out_buffer, BATCH_SIZE, 127 * 15, result)
    
# Module-level state shared by net_pingpong() and my_callback_pingpong():
#   which_buffer - index (0 or 1) into in_buffers / out_buffers
#   net_cnt      - number of net_pingpong() calls made after the preload call
#   first_batch  - True until net_pingpong() has preloaded the first batch
which_buffer, net_cnt = 0, 0
first_batch = True

def net_pingpong(rgb_imgs, result):
    """Advance the double-buffered inference pipeline by one step.

    State is kept in the module globals ``first_batch``, ``which_buffer`` and
    ``net_cnt``. The behaviour depends on where in the sequence the call falls:

    * First call (``first_batch`` is True): only loads ``rgb_imgs`` into
      ``in_buffers[0]`` and returns; nothing is sent to the accelerator.
    * Later calls: start the accelerator on the already-loaded input buffer,
      load ``rgb_imgs`` (if any) into the other input buffer, decode the
      output of the previous step with ``yolo``, and queue the next send.
    * Call with an empty ``rgb_imgs``: additionally decodes the output that
      was received during this same call.

    Args:
        rgb_imgs: Images for the *next* batch; an empty list drains the
            pipeline.
        result: List that ``yolo`` extends with decoded rows.

    NOTE(review): ``yolo`` is always invoked with ``BATCH_SIZE`` and the
    "last step" test compares ``net_cnt`` with ``BATCH_NUM``, so this appears
    to assume exactly ``BATCH_NUM`` full batches -- confirm with the caller.
    """

    global first_batch
    global which_buffer    
    global net_cnt
    if first_batch == True:
        # Preload only: fill input buffer 0 and wait for the next call.
        first_batch = False
        which_buffer = 0
        load_image_resize(rgb_imgs,in_buffers[which_buffer][:],BATCH_SIZE)
        return
    net_cnt += 1
    # NOTE(review): register meanings are not visible in this notebook;
    # presumably 0x0 is a control/start register and 0x10 the batch count.
    nn_ctrl.write(0x0, 0)
    nn_ctrl.write(0x10, in_buffers[which_buffer].shape[0])
    nn_ctrl.write(0x0, 1)
    # Arm the receive side for the current buffer.
    dma.recvchannel.transfer(out_buffers[which_buffer])
    if net_cnt == 1:
        # On later steps the send was already started at the end of the
        # previous call (see the transfer after sendchannel.wait() below).
        dma.sendchannel.transfer(in_buffers[which_buffer])

    # Flip to the other buffer pair.
    if which_buffer == 0:
        which_buffer = 1
    else:
        which_buffer = 0

    # Fill the other input buffer after the transfers above have been started.
    if len(rgb_imgs)!=0:
        load_image_resize(rgb_imgs,in_buffers[which_buffer][:],BATCH_SIZE)

    # After the flip, out_buffers[which_buffer] is the buffer the previous
    # call received into; decode it now. The third argument is ignored by yolo().
    if net_cnt > 1:
        yolo(out_buffers[which_buffer], BATCH_SIZE, 127 * 15, result)

        
    dma.sendchannel.wait()
    if net_cnt!=BATCH_NUM:
        # Queue the freshly loaded batch so it is already in flight when the
        # next call starts the accelerator.
        dma.sendchannel.transfer(in_buffers[which_buffer])
    dma.recvchannel.wait()

    # Drain step: no further call will follow, so decode the buffer that was
    # received during this call (the one selected before the flip).
    if len(rgb_imgs)==0:
        yolo(out_buffers[(which_buffer + 1) % 2], BATCH_SIZE, 127 * 15, result) 


In [6]:
def my_callback_pingpong(rgb_imgs):
    """Feed ``rgb_imgs`` through ``net_pingpong`` in ``BATCH_SIZE`` slices.

    The pipeline globals are reset first, then consecutive slices of the
    input are passed to ``net_pingpong`` until an empty slice has been
    passed; that final empty call is what lets the pipeline drain.

    Args:
        rgb_imgs: Sliceable sequence of 2-item pairs; only the second item
            (the image) is forwarded.

    Returns:
        The list that ``net_pingpong`` filled in across all calls.
    """
    global first_batch, which_buffer, net_cnt
    # Start every run from a clean pipeline state.
    first_batch, which_buffer, net_cnt = True, 0, 0

    detections = []
    offset = 0
    drained = False
    while not drained:
        window = rgb_imgs[offset:offset + BATCH_SIZE]
        batch = [img for _path, img in window]
        offset += BATCH_SIZE
        net_pingpong(batch, detections)
        # An empty batch has just been submitted: the pipeline is flushed.
        drained = len(batch) == 0
    return detections

# Build a DataRecorder over the power sensors of the selected rails.
# NOTE(review): the recorder is created but never started in the visible
# cells; the energy figure in the output below is presumably produced inside
# team.run() -- confirm.
rails = pynq.get_rails()
rails_to_monitor = ["1V2", "PSDDR", "INT"]
recorder = pynq.DataRecorder(
    *(rails[rail_name].power for rail_name in rails_to_monitor)
)

# Hand the batching callback to the team harness, which drives the run.
team.run(my_callback_pingpong, debug=False)

Done all batches. Total runtime = 0.6165785789489746 seconds. Total energy = 1.2138890773057938 J.
Savings results to XML...
XML results written successfully.


In [7]:
# Release the contiguous DMA buffers.
# Deleting only the two lists (as before) freed nothing: in_buffer0/1 and
# out_buffer0/1 still referenced every buffer. Free each one explicitly, then
# drop all names so nothing can touch the released memory afterwards.
for _buf in (*in_buffers, *out_buffers):
    _buf.freebuffer()
del _buf
del in_buffers, in_buffer0, in_buffer1
del out_buffers, out_buffer0, out_buffer1

##### 