In [1]:
# Source the XRT environment script; the printed output lists the resulting variables.
# NOTE(review): `!` runs the command in its own shell process, so the exported variables
# presumably do not persist into this kernel — confirm XRT is already configured in the
# environment that launched Jupyter.
!source /opt/xilinx/xrt/setup.sh

XILINX_XRT        : /opt/xilinx/xrt
PATH              : /opt/xilinx/xrt/bin:/opt/xilinx/xrt/bin:/home/ubuntu/anaconda3/envs/hls4ml_env/bin:/home/ubuntu/anaconda3/condabin:/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
LD_LIBRARY_PATH   : /opt/xilinx/xrt/lib:/opt/xilinx/xrt/lib:
PYTHONPATH        : /opt/xilinx/xrt/python:/opt/xilinx/xrt/python:


In [2]:
# Print the thermal report of the card at PCIe address 0000:08:00.1.
!xbutil examine -d0000:08:00.1 -r thermal


------------------------------------------------------
1/1 [0000:08:00.1] : xilinx_u50_gen3x16_xdma_201920_3
------------------------------------------------------
Thermals
  PCB Top Front          : 40 C
  PCB Top Rear           : 42 C
  FPGA                   : 50 C
  FPGA HBM               : 45 C



In [3]:
import pynq
import numpy as np

from pynq import allocate
from pynq import Overlay

from matplotlib import pyplot as plt
from datetime import datetime



In [4]:
class NeuralNetworkOverlay(Overlay):
    """Overlay wrapper that runs inference through the `krnl_rtl_1` kernel of an xclbin."""

    def __init__(self, xclbin_name, dtbo=None, download=True, ignore_version=False, device=None):
        """Load the overlay; every argument is forwarded unchanged to `Overlay.__init__`."""
        super().__init__(xclbin_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)

    def allocate_mem(self, X_shape, y_shape, dtype=np.float32, trg_in=None, trg_out=None):
        """
        Allocate one input and one output buffer with `pynq.allocate`.
        Parameters:
        - X_shape / y_shape : shapes of the input and output buffers.
        - dtype : element type used for both buffers.
        - trg_in / trg_out : passed as `target` to `allocate` for the input/output buffer.
        - return: the tuple `(input_buffer, output_buffer)`.
        """
        input_buffer = allocate(shape=X_shape, dtype=dtype, target=trg_in)
        output_buffer = allocate(shape=y_shape, dtype=dtype, target=trg_out)
        return input_buffer, output_buffer

    def predict(self, X, y_shape, input_buffer, output_buffer, dtype=np.float32, debug=None, profile=False, encode=None, decode=None):
        """
        Obtain the predictions of the NN implemented in the FPGA.
        Parameters:
        - X : the input vector. Should be a numpy ndarray.
        - y_shape : the shape of the output vector. Its element count is passed to the kernel as the
                    output size.
        - input_buffer / output_buffer : buffers (e.g. from `allocate_mem`) used to exchange data with
                    the device. `X` is copied into `input_buffer` before the kernel is called.
        - dtype : the data type of the elements of the input/output vectors. It is not used by this
                  method (the buffers already carry their dtype); it is kept for interface compatibility.
                  Note: the buffers' dtype should be chosen depending on the interface of the accelerator;
                  if it uses 'float' types for the 'data' AXI-Stream field, 'np.float32' is the correct
                  one to use. If instead it uses 'ap_fixed<A,B>', 'np.intA' is the correct one to use
                  (note that A cannot be any integer value, but only one of {..., 8, 16, 32, ...}.
                  Check the `numpy` doc for more info).
                  In this case the encoding/decoding has to be computed by the PS. For example for
                  'ap_fixed<16,6>' type the following 2 functions are the correct ones to use to
                  encode/decode 'float' -> 'ap_fixed<16,6>':
                  ```
                    def encode(xi):
                        return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
                    def decode(yi):
                        return yi * 2**-10
                    encode_v = np.vectorize(encode) # to apply them element-wise
                    decode_v = np.vectorize(decode)
                  ```
        - debug : if truthy, print a message after each transfer/kernel step.
        - profile : boolean. Set it to `True` to print the performance of the algorithm in terms of
                    `inference/s` and to additionally return the elapsed time and the rate.
        - encode/decode: callables applied to `X` before the transfer and to the result after it.
                    See the `dtype` section for more information.
        - return: a copy of the output buffer (passed through `decode` when given). When `profile` is
                  true the return value is the tuple `(result, seconds, inferences_per_second)`.
        """
        if profile:
            timea = datetime.now()
        if encode is not None:
            X = encode(X)
        # Plain Python ints so the scalar kernel arguments do not depend on numpy scalar types.
        in_size = int(np.prod(X.shape))
        out_size = int(np.prod(y_shape))
        input_buffer[:] = X
        input_buffer.sync_to_device()
        if debug:
            print("Send OK")
        self.krnl_rtl_1.call(input_buffer, output_buffer, in_size, out_size)
        if debug:
            print("Kernel call OK")
        output_buffer.sync_from_device()
        if debug:
            print("Recieve OK")
        result = output_buffer.copy()
        # Previously `decode` was accepted but silently ignored; apply it as documented.
        if decode is not None:
            result = decode(result)
        if profile:
            timeb = datetime.now()
            dts, rate = self._print_dt(timea, timeb, len(X))

        # Single cleanup path shared by both return shapes. The former `del input_buffer` /
        # `del output_buffer` only removed the local names and never released the caller's buffers,
        # so they were dropped.
        input_buffer.flush()
        output_buffer.flush()
        # NOTE(review): this releases the overlay after every call, yet callers invoke predict()
        # repeatedly on the same overlay — confirm this is intended.
        self.free()

        if profile:
            return result, dts, rate
        return result

    def _print_dt(self, timea, timeb, N):
        """
        Print and return the throughput of `N` inferences performed between two timestamps.
        Parameters:
        - timea / timeb : `datetime` objects taken before and after the run.
        - N : number of samples processed.
        - return: the tuple `(elapsed_seconds, inferences_per_second)`.
        """
        # total_seconds() also accounts for the `days` field, unlike seconds + microseconds.
        dts = (timeb - timea).total_seconds()
        # Guard against a zero-length interval instead of raising ZeroDivisionError.
        rate = N / dts if dts > 0 else float("inf")
        print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate))
        # With N == 0 the rate is 0 and the time per inference is unbounded.
        us_per_inference = 1/rate*1e6 if rate > 0 else float("inf")
        print("Or {} us / inferences".format(us_per_inference))
        return dts, rate


def print_img(matrix, title="Label"):
    """Show `matrix` as a grayscale image with `title` as the figure title."""
    plt.imshow(matrix, cmap='gray')
    plt.title(title)
    plt.show()

In [5]:
# Load the Dense network bitstream through the wrapper class defined above.
ol = NeuralNetworkOverlay(xclbin_name="Dense_kernel.xclbin")

In [None]:
# Interactive help for the kernel object (development aid; produces no value for later cells).
ol.krnl_rtl_1?

In [None]:
# Display the kernel's register map.
ol.krnl_rtl_1.register_map

In [None]:
# Display the kernel's call signature.
ol.krnl_rtl_1.signature

In [None]:
# Read the test inputs from disk and make sure they are held as float32.
X_test = np.asarray(np.load("../NN_train/MNIST_Test/Data/X_test.npy"), dtype=np.float32)

In [None]:
# Allocate the input buffer in HBM0 and the output buffer (10 values per sample) in HBM1.
i_buff, o_buff = ol.allocate_mem(X_shape=X_test.shape, y_shape=(X_test.shape[0],10), dtype=np.float32, trg_in=ol.HBM0, trg_out=ol.HBM1)

In [None]:
# Run the whole test set once; with profile=True predict() returns (result, seconds, rate).
y, _, rate = ol.predict(X=X_test, y_shape=(X_test.shape[0],10), input_buffer=i_buff, output_buffer=o_buff, dtype=np.float32, profile=True, debug=False)

In [None]:
# Persist the accelerator output for later comparison.
np.save("../NN_train/MNIST_Test/Data/y_alveo_Dense.npy", y)

In [None]:
# Allocate a new pair of buffers (same shapes and HBM banks as before) for the cells below.
# NOTE(review): predict() calls ol.free() before returning — confirm the overlay is still usable here.
i_buff, o_buff = ol.allocate_mem(X_shape=X_test.shape, y_shape=(X_test.shape[0],10), dtype=np.float32, trg_in=ol.HBM0, trg_out=ol.HBM1)

In [None]:
# Measure the inference rate for N_it growing prefixes of the test set.
N_it = 100
rate_v  = []
# Sample count of every iteration, computed once (it does not depend on the loop state) and with
# integer arithmetic so the x-axis values are exactly the N passed to predict(). Defining it
# before the loop also keeps `in_size` available when N_it is 0.
in_size = np.array([(i + 1) * X_test.shape[0] // N_it for i in range(N_it)], dtype=np.int32)
for i in range(N_it):
    N = int(in_size[i])
    i_buff[:N] = X_test[:N]
    # Only the first N rows of each buffer are handed to the kernel.
    y, _, rate = ol.predict(X=X_test[:N], y_shape=(N,10), input_buffer=i_buff[:N], output_buffer=o_buff[:N], dtype=np.float32, profile=True, debug=False)
    rate_v.append(rate)

In [None]:
# Throughput as a function of the number of samples sent per predict() call.
plt.plot(in_size, rate_v)
plt.xlabel("Samples per call")
plt.ylabel("Inferences / s")
plt.show()

In [None]:
# Show the first ten test images, each titled with the index of the largest output value.
for i in range(10):
    img_title = ('Predicted label: %d' % np.argmax(np.array(y[i])))
    print_img(X_test[i], img_title)

In [None]:
# Keep the most recent accelerator output under the name used by the comparison cell.
# NOTE(review): `y` is whatever the last executed predict() returned — depends on cell order.
y_alveo = y

In [None]:
%matplotlib inline
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import load_model
import plotting

#model_ref = load_model(MODEL_PATH + 'Keras_only/KERAS_check_best_model.h5')
#with tf.device('/cpu:0'):
#    y_ref = model_ref.predict(X_test)

y_test = np.load("../NN_train/MNIST_Test/Data/y_test.npy")
y_test.astype(np.float32)
y_hls = np.load("../NN_train/MNIST_Test/Data/y_hls_Dense.npy")
y_hls.astype(np.float32)

#print("Accuracy baseline:  {}".format(accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_ref, axis=1))))
print("Accuracy hls4ml: {}".format(accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_hls, axis=1))))
print("Accuracy FPGA: {}".format(accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_alveo, axis=1))))

fig, ax = plt.subplots(figsize=(9, 9))
#_ = plotting.makeRoc(y_test, y_ref, classes)
#plt.gca().set_prop_cycle(None) # reset the colors
_ = plotting.makeRoc(y_test, y_hls, ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], linestyle='--')
plt.gca().set_prop_cycle(None) # reset the colors
_ = plotting.makeRoc(y_test, y_alveo   , ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], linestyle=':')

from matplotlib.lines import Line2D
lines = [#Line2D([0], [0], ls='-'),
         Line2D([0], [0], ls='--'),
         Line2D([0], [0], ls=':')]
from matplotlib.legend import Legend
leg = Legend(ax, lines, labels=['hls4ml', 'alveo'],
            loc='lower right', frameon=False)
ax.add_artist(leg)

## Power measurements

In [None]:
from pynq import Device

# Sensors exposed by the currently active device; displayed as the cell output.
sensors = Device.active_device.sensors
sensors

In [None]:
from pynq.pmbus import DataRecorder

# Recorder over the power readings of the 12v_aux, 12v_pex and vccint sensors.
recorder = DataRecorder(sensors["12v_aux"].power,
                        sensors["12v_pex"].power,
                        sensors["vccint"].power)

In [None]:
import pandas as pd

# Shorthand for the recorder's data frame (inspected with f.head() below).
f = recorder.frame

In [None]:
# Start recording; 0.1 is presumably the sampling interval in seconds — TODO confirm.
recorder.record(0.1)

In [None]:
# Peek at the first recorded rows.
f.head()

In [None]:
import plotly.graph_objs as go

# Axis titles for the live power plot; the y axis auto-ranges but always includes zero.
layout = {
    'xaxis': {
        'title': 'Time (s)'
    },
    'yaxis': {
        'title': 'Power (W)',
        'rangemode': 'tozero',
        'autorange': True
    }
}

# Empty figure widget; its traces are filled in by update_data().
plot = go.FigureWidget(layout=layout)
plot

In [None]:
def update_data(frame, start, end, plot):
    """
    Redraw `plot` with the board power recorded in `frame` between `start` and `end`.

    Board power is the sum of the '12v_aux_power' and '12v_pex_power' columns. A second trace
    shows its 5-second rolling mean; the mean is computed on a slice that begins 5 seconds
    before `start` and is then restricted to the rows of the displayed slice.
    """
    window = pd.tseries.offsets.Second(5)

    visible = frame[start:end]
    padded = frame[start - window:end]

    padded_total = padded['12v_aux_power'] + padded['12v_pex_power']
    smoothed = padded_total.rolling(window).mean()[visible.index]

    powers = pd.DataFrame(index=visible.index)
    powers['board_power'] = visible['12v_aux_power'] + visible['12v_pex_power']
    powers['rolling'] = smoothed

    board_trace = go.Scatter(x=powers.index, y=powers['board_power'], name="Board Power")
    average_trace = go.Scatter(x=powers.index, y=powers['rolling'], name="5 Second Avg")
    plot.update(data=[board_trace, average_trace])


In [None]:
import threading
import time

# Module-level flag polled by the updater thread; a later cell sets it to False to stop the loop.
do_update = True

def thread_func():
    """Refresh the plot with the last 60 seconds of recorder data every 0.5 s while `do_update` is true."""
    while do_update:
        now = pd.Timestamp.fromtimestamp(time.time())
        past = now - pd.tseries.offsets.Second(60)
        update_data(recorder.frame, past, now, plot)
        time.sleep(0.5)

# Run the updater in a background thread so the notebook stays interactive.
from threading import Thread
t = Thread(target=thread_func)
t.start()

In [None]:
# Ask the updater loop to exit, wait for the thread to finish, then stop the recorder.
do_update = False
t.join()
recorder.stop()

In [1]:
import pynq
import numpy as np

from pynq import allocate
from pynq import Overlay



In [2]:
# Load the LED kernel bitstream with the stock Overlay class.
ol = Overlay("LED_kernel.xclbin")

In [3]:
# Number of 64-bit words in each buffer.
N = 3970000
# One input and one output buffer of N uint64 elements each.
input_buffer  = allocate( shape=int(N), dtype=np.uint64 )
output_buffer = allocate( shape=int(N), dtype=np.uint64 )

In [6]:
# Short alias for the overlay's kernel object.
fifo = ol.krnl_rtl_1

In [7]:
# Fill the input buffer with random 64-bit words (unseeded, so the data differs on every run).
input_buffer[:] = np.random.randint(low=0, high=2**64-1, size=int(N), dtype=np.uint64)

In [8]:
%%timeit
# Time one full round trip: host -> device copy, kernel call, device -> host copy.
input_buffer.sync_to_device()

# NOTE(review): both size arguments are N/2 although the buffers hold N elements, and the meaning
# of the trailing 0 is not visible here — check against fifo.signature.
fifo.call(input_buffer,output_buffer,int(N/2),int(N/2),0)

output_buffer.sync_from_device()

48.5 ms ± 17.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Display the output buffer contents after the kernel run.
output_buffer

In [None]:
# Display the input buffer contents for comparison.
input_buffer

In [None]:
# Display the kernel's register map.
fifo.register_map

In [None]:
# Drop the notebook's references to both buffers, then release the overlay.
del input_buffer
del output_buffer
ol.free()

In [None]:
# NOTE(review): undocumented scratch calculation — the meaning of 794, 32, 3 and 2 is not evident here.
794*32/3*2

In [9]:
# Throughput estimate: 3970000 elements x 16 bytes over ~49 ms (rounded from the %%timeit result),
# scaled by 1e-6. Presumably MB/s counting 8 bytes in + 8 bytes out per uint64 element — TODO confirm.
((3970000*16)/(49e-3))*1e-6

1296.3265306122448

In [7]:
# NOTE(review): stale cell — it raised NameError because the buffers had already been deleted.
input_buffer.flush()
output_buffer.flush()
# NOTE(review): `ol.free` only references the method; it is not called without parentheses.
ol.free

NameError: name 'input_buffer' is not defined

In [8]:
# NOTE(review): stale cell — it raised NameError because the names no longer existed.
del input_buffer
del output_buffer

NameError: name 'input_buffer' is not defined

In [9]:
# Release the overlay.
ol.free()