## Neural Network Hardware Accelerator

A PYNQ overlay is used to read data from and write data to a neural network accelerator implemented on the FPGA.
The implemented model is a multilayer perceptron (MLP) with 2 hidden fully connected layers, each using the ReLU activation function.
An AXI4-Lite MMIO interface is used to transfer data between the CPU and the FPGA.

In [1]:
from pynq import Overlay
import struct
import time
import pynq
import pandas as pd
from numpy import exp


In [2]:
# Initializes DataRecorder object to monitor power rails using PMBus
def init_power_recorder():
    """Create a ``pynq.DataRecorder`` bound to the power sensor of the 'PSINT_FP' rail.

    Returns:
        A ``pynq.DataRecorder`` built from ``rails['PSINT_FP'].power``; the
        caller starts sampling with ``recorder.record(interval)``.
    """
    rail_name = 'PSINT_FP'
    power_sensor = pynq.get_rails()[rail_name].power
    return pynq.DataRecorder(power_sensor)

In [3]:
# Axilite MMIO interface does not support read/write of floats
def float_to_integer(f):
    """Return the 32-bit single-precision bit pattern of `f` as an unsigned integer.

    The float is packed as a little-endian 4-byte float and the same bytes are
    then re-read as a little-endian unsigned 32-bit integer.
    """
    raw_bytes = struct.pack('<f', f)
    (as_uint32,) = struct.unpack('<I', raw_bytes)
    return as_uint32
    
def integer_to_float(i):
    """Reinterpret the unsigned 32-bit integer `i` as a single-precision float.

    Inverse of ``float_to_integer``. The same explicit little-endian,
    standard-size format codes ('<I' / '<f') are used so that both
    conversion helpers agree on the byte layout.

    Raises:
        struct.error: if `i` does not fit in an unsigned 32-bit integer.
    """
    return struct.unpack('<f', struct.pack('<I', i))[0]

In [4]:
def softmax(vector):
    """Return the softmax of `vector` as a numpy array whose entries sum to 1.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but prevents ``exp`` from overflowing to ``inf``
    (which would turn the result into ``nan``) for large inputs.

    Args:
        vector: sequence or numpy array of numbers.

    Returns:
        numpy array with the same shape as `vector`; empty if `vector` is empty.
    """
    # Local import: the file only imports `exp` from numpy.
    from numpy import asarray

    values = asarray(vector)
    if values.size == 0:
        # max() is undefined for empty input; return the empty result directly.
        return exp(values)
    e = exp(values - values.max())
    return e / e.sum()

In [8]:
# Program the FPGA with the bitstream through a pynq Overlay, then keep a
# handle to the accelerator IP that the overlay exposes as `accelerate_MLP_0`.
BITSTREAM_FILE = 'MLP11.bit'
overlay = Overlay(BITSTREAM_FILE)
accelerate_mlp = overlay.accelerate_MLP_0

In [5]:
# Read in test inputs and labels as lists
x_test = pd.read_csv('X_test.csv', header=None).values.tolist()
test_length = len(x_test)
input_length = len(x_test[0])

# Get y labels from the CSV file.
# The `squeeze` argument of read_csv was deprecated in pandas 1.4 and removed
# in 2.0, so the single 'Activity' column is selected explicitly instead.
y_test = pd.read_csv('y_test.csv', names=['Activity'])['Activity'].tolist()

In [6]:
# Display the loaded labels (the notebook echoes the cell's last expression)
y_test


[7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0,
 7.0

In [12]:
def predict(inputs, timeout=None):
    """Run one inference on the FPGA MLP accelerator and return the predicted label.

    The first 210 values of `inputs` are written to the IP's input memory as
    32-bit float bit patterns, the IP is started, and the 9 output words are
    read back once the IP reports completion.

    Args:
        inputs: indexable sequence holding at least 210 floats.
        timeout: optional maximum number of seconds to wait for the IP to
            finish. ``None`` (the default) waits indefinitely.

    Returns:
        Index (0-8) of the output with the highest value.

    Raises:
        ValueError: if `inputs` holds fewer than 210 values.
        TimeoutError: if `timeout` is given and the IP does not finish in time.
    """

    # Mapping for MMIO obtained from Vivado HLS

    # AXILiteS
    # 0x000 : Control signals
    #          bit 0  - ap_start (Read/Write/COH)
    #          bit 1  - ap_done (Read/COR)
    #          bit 2  - ap_idle (Read)
    #          bit 3  - ap_ready (Read)
    #          bit 7  - auto_restart (Read/Write)
    #          others - reserved
    # 0x400 ~
    # 0x7ff : Memory 'inputs' (210 * 32b)
    #          Word n : bit [31:0] - inputs[n]
    # 0x800 ~
    # 0x83f : Memory 'outputs' (9 * 32b)
    #          Word n : bit [31:0] - outputs[n]

    INPUT_OFFSET = 0x400
    OUTPUT_OFFSET = 0x800
    CONTROL_OFFSET = 0x000
    AP_START = 1
    AP_DONE = 2
    # Each input is 32 bits = 4 bytes
    NUM_BYTES = 4
    INPUT_LENGTH = 210
    OUTPUT_LENGTH = 9

    # Fail before touching the hardware instead of part-way through the writes
    if len(inputs) < INPUT_LENGTH:
        raise ValueError(
            f"expected at least {INPUT_LENGTH} input values, got {len(inputs)}"
        )

    # Write input values to IP
    for i in range(INPUT_LENGTH):
        accelerate_mlp.write(INPUT_OFFSET + i * NUM_BYTES,
                             float_to_integer(inputs[i]))

    # Start executing IP
    accelerate_mlp.write(CONTROL_OFFSET, AP_START)

    # Wait for AP_DONE bit to be asserted when outputs are ready
    deadline = None if timeout is None else time.monotonic() + timeout
    while (accelerate_mlp.read(CONTROL_OFFSET) & AP_DONE) == 0:
        if deadline is not None and time.monotonic() > deadline:
            raise TimeoutError(
                f"accelerator did not assert ap_done within {timeout} seconds"
            )

    # Read output values from IP
    result = [
        integer_to_float(accelerate_mlp.read(OUTPUT_OFFSET + i * NUM_BYTES))
        for i in range(OUTPUT_LENGTH)
    ]

    # Return label with highest value
    return result.index(max(result))

In [10]:
# Single prediction test
# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
start_time = time.perf_counter()
x = predict(x_test[0])
latency = time.perf_counter() - start_time

print(x)
print(f"latency: {latency}")

1
latency: 0.009096622467041016


In [13]:
# Sample the power every 0.1 seconds while the whole test set is run
recorder = init_power_recorder()
results = []
correct_predictions = 0
with recorder.record(0.1):
    # perf_counter() is monotonic, so the measured latency cannot be skewed
    # by wall-clock adjustments during the run
    start_time = time.perf_counter()

    # Test FPGA model
    for i, sample in enumerate(x_test):
        res = predict(sample)
        results.append(res)

        if res == y_test[i]:
            correct_predictions += 1

    latency = time.perf_counter() - start_time
    print(f"latency: {latency}\n")

    print(f"accuracy: {correct_predictions/test_length * 100}%")

print(f"Average latency: {latency / test_length}\n")

latency: 15.806457996368408

accuracy: 95.60867148415787%
Average latency: 0.008786246801761205



In [21]:
import csv

# Write one prediction per row to outputs.csv.
# newline='' is what the csv module requires of files it writes to; without it
# extra blank lines can appear on platforms that translate line endings.
with open('outputs.csv', 'w', newline='') as f:
    csv.writer(f).writerows([val] for val in results)

In [23]:
# Sample the power every 0.1 seconds while a small subset of the test set is run
recorder = init_power_recorder()

# Number of test samples to run; used for the loop and both averages
NUM_SAMPLES = 10

with recorder.record(0.1):
    # Model execution
    start_time = time.perf_counter()
    correct_predictions = 0

    # Test FPGA model
    for i in range(NUM_SAMPLES):
        res = predict(x_test[i])
        if res == y_test[i]:
            correct_predictions += 1

    latency = time.perf_counter() - start_time
    print(f"latency: {latency}\n")

    print(f"accuracy: {correct_predictions/NUM_SAMPLES * 100}%")

print(f"Average latency: {latency / NUM_SAMPLES}\n")

[-4.920405387878418, -73.5132064819336, 15.320442199707031]
[-22.121559143066406, -44.650543212890625, 19.705425262451172]
[-63.601993560791016, -142.70758056640625, 73.55416870117188]
[-19.58596420288086, -30.530820846557617, 15.15947151184082]
[-49.32612991333008, -103.2988052368164, 49.961421966552734]
[-60.70351791381836, -101.67236328125, 54.213993072509766]
[-58.610267639160156, -73.27030944824219, 46.52710723876953]
[-47.36003112792969, -87.63713836669922, 41.87965774536133]
[-41.72501754760742, -51.893089294433594, 35.50263595581055]
[-25.278484344482422, -42.48690414428711, 22.454931259155273]
latency: 0.09514188766479492

accuracy: 100.0%
Average latency: 0.009514188766479493



In [37]:
# View the sampled power data collected by `recorder` (the most recently
# created recorder); the cell output below shows 'Mark' and 'PSINT_FP_power' columns
recorder.frame

Unnamed: 0,Mark,PSINT_FP_power
2021-10-05 07:41:10.436277,0.0,0.75
