In [1]:
import os
import time

import numpy as np
from PIL import Image
import pynq.lib.dma
from pynq import MMIO
from pynq import Overlay
from pynq import allocate

In [2]:
# Instantiate the overlay object from the fcl_accel bitstream file
BITSTREAM_PATH = '/home/xilinx/pynq/overlays/fcl_accel/fcl_accel.bit'
ol = Overlay(BITSTREAM_PATH)

In [3]:
# Initialize hardware handles

# Memory-mapped register window of the accelerator: MMIO(base address, length)
fcl_ip = MMIO(0x43c00000, 0x10000)

# Register offsets inside that window; they are used as the first argument of
# fcl_ip.write() in the hardware run cell, paired with the dim1 / dim0 lengths.
dma1_len = 0x10
dma0_len = 0x18

# The two AXI DMA engines exposed by the fclAccel hierarchy of the overlay
dma1 = ol.fclAccel.axi_dma_1
dma0 = ol.fclAccel.axi_dma_0

In [4]:
# Open sample image file (must be in same directory)
f = "flower128.jpg"
# Force a 3-channel RGB image: the cells below hard-code channels = 3 and
# transpose the pixel array with axes (2, 0, 1), which would fail for a
# grayscale file and mis-handle other modes.
img = Image.open(f).convert("RGB")

In [5]:
# Hardware parameters
w, h = img.size              # img.size is (width, height)
k = 3                        # kernel is k x k
winsize = k * k              # number of taps in one kernel
channels = 3
filters = 6
stride = 1
pads = 1

# Buffer lengths, in elements
dim0 = w * h                 # one value per pixel of a single channel
dim1 = 3 + winsize + dim0    # 3 header values (k, h, w) + kernel taps + one channel of pixels

In [6]:
# Convert image into float values
# Move the channel axis to the front so data[c] is one full colour plane,
# then flatten each plane and divide by 255.0.
data = np.array(img).transpose((2, 0, 1))
inp = [data[c].flatten() / 255.0 for c in range(channels)]


In [7]:
# Filters: six fixed 3x3 kernels, one per output feature map
kernels = [
    # bottom row minus top row
    np.array([[-0.5, -0.5, -0.5], [0, 0, 0], [0.5, 0.5, 0.5]]),
    # alternating-sign columns, magnitude growing from top row to bottom row
    np.array([[1/4, -1/4, 1/4], [2/4, -2/4, 2/4], [3/4, -3/4, 3/4]]),
    # positive towards the top-left corner, negative towards the bottom-right
    np.array([[1/2, 1/4, 0], [1/4, 0, -1/4], [0, -1/4, -1/2]]),
    # positive middle row between two negative rows
    np.array([[-1/2, -1/2, -1/2], [3/4, 3/4, 3/4], [-1/2, -1/2, -1/2]]),
    # uniform 1/8 weights (taps sum to 9/8)
    np.array([[1/8, 1/8, 1/8], [1/8, 1/8, 1/8], [1/8, 1/8, 1/8]]),
    # centre tap 1 minus a quarter of each 4-neighbour (taps sum to 0)
    np.array([[0.0, -1/4, 0.0], [-1/4, 1, -1/4], [0.0, -1/4, 0.0]]),
]

In [8]:
# Fill kernel arrays
# One flattened tap vector per filter; the hardware run cell copies ker[n]
# into the transfer buffer right after the three header values.
ker = [kernels[n].flatten() for n in range(filters)]

In [9]:
# Run and time HW implementation
#
# For every (filter, channel) pair, buf1 is packed as
#   [k, image height, image width, k*k kernel taps, one channel of pixels]
# and sent through dma1, while out[n] is sent and received through dma0.
# NOTE(review): out[n] is both streamed out and written back on every channel
# pass, presumably so the accelerator can accumulate across channels --
# confirm against the fcl_accel IP.
out = []

buf1 = allocate(shape=(dim1,), dtype='float32')

try:
    for n in range(filters):
        out.append(allocate(shape=(dim0,), dtype='float32'))

    # Take the image dimensions from the image itself rather than from the
    # globals `w` / `h`: the software-implementation cell rebinds `w` to the
    # weight tensor, which would break a re-run of this cell.
    img_w, img_h = img.size

    t_start = time.time()
    t_fill_total = 0

    for n in range(filters):

        for c in range(channels):

            t_fill = time.time()

            buf1[:] = 0

            # Header values
            buf1[0] = k
            buf1[1] = img_h
            buf1[2] = img_w

            # Kernel taps, then the pixels of channel c
            buf1[3:(3+winsize)] = ker[n]
            buf1[(3+winsize):] = inp[c]

            t_fill_total += (time.time() - t_fill)

            # Write both buffer lengths to the IP registers at offsets
            # dma1_len / dma0_len before starting the transfers
            fcl_ip.write(dma1_len, dim1)
            fcl_ip.write(dma0_len, dim0)

            # Start all three transfers, then block until each has finished
            dma1.sendchannel.transfer(buf1)
            dma0.sendchannel.transfer(out[n])
            dma0.recvchannel.transfer(out[n])

            dma1.sendchannel.wait()
            dma0.sendchannel.wait()
            dma0.recvchannel.wait()

        print("Filter " + str(n) + " completed")

    t_stop = time.time()
finally:
    # Always release buf1, even if a fill or a transfer raised
    buf1.close()

print('Hardware execution time:', t_stop-t_start, "seconds.")
print(t_fill_total, "seconds were spent filling the buffers.")

Filter 0 completed
Filter 1 completed
Filter 2 completed
Filter 3 completed
Filter 4 completed
Filter 5 completed
Hardware execution time: 1.3472671508789062 seconds.
0.06699252128601074 seconds were spent filling the buffers.


In [19]:
# Saves image for each output map
# Each out[n] holds width*height values, so recover the 2-D shape from the
# image instead of hard-coding 128x128. This assumes the accelerator returns
# pixels in the same row-major order as the flattened input planes.
out_w, out_h = img.size
# Image.save does not create missing directories
os.makedirs("flower_outputs", exist_ok=True)
for n in range(filters):
    ar = np.reshape(out[n], (out_h, out_w))
    im = Image.fromarray(ar*255)
    im = im.convert("L")
    im.show()
    im.save("flower_outputs/hw_out{}.jpg".format(n))

In [11]:
# Software implementation
# Input parameters
w_in, h_in = img.size   # Input image width and height
c_in = channels         # Input image channels (RGB)
c_out = filters         # Output feature map channels
k = 3                   # Kernel size
stride = 1
pads = 1

# NOTE: `w` is rebound here from the image width (hardware parameter cell) to
# the weight tensor; anything that still needs the width after this cell must
# use w_in or img.size instead.
w = np.zeros((c_out, c_in, k, k))   # Weights, indexed [out channel, in channel, row, col]
b = np.zeros((c_out))               # Biases

# Output dimensions
padded_extent_w = w_in + 2*pads - k
padded_extent_h = h_in + 2*pads - k
w_out = padded_extent_w // stride + 1  # Output feature map width
h_out = padded_extent_h // stride + 1  # Output feature map height

In [12]:
# Padding image
# Pad only the two spatial axes and leave the channel axis untouched.
# A bare (pads, pads) pad width is applied to every axis, which also adds a
# zero plane before and after the colour channels and shifts R, G, B to
# indices 1..3, so the convolution would read the wrong planes.
img2 = np.pad(np.array(img), ((pads, pads), (pads, pads), (0, 0)), 'constant')

# Converting input image to numpy arrays
# Channel axis first, then a leading axis of length 1, so conv() can index
# data2[0, channel, row, col]; finally divide by 255.0.
data2 = img2.transpose((2, 0, 1))
data2 = np.expand_dims(data2, axis=0)
data2 = data2/255.0

In [13]:
# Assigning weights and biases for software
# Every input channel of output map `wi` gets the same predefined kernel.
for wi in range(c_out):
    has_kernel = wi < len(kernels)
    for wj in range(c_in):
        if has_kernel:
            w[wi, wj] = kernels[wi]
            continue
        # Use random weights if not enough filters were defined
        # NOTE(review): a single scalar is drawn and broadcast over the whole
        # k x k kernel, so all taps of a fallback kernel are equal -- confirm
        # this is intended.
        print("Assigning random weight to", wi, wj)
        w[wi, wj] = np.random.normal(0, 1)

In [14]:
# Define convolution algorithm
def conv(data):
    """Direct nested-loop 2-D convolution of an already padded input.

    Reads the module-level settings ``w`` (weights indexed
    ``[out channel, in channel, kernel row, kernel col]``), ``c_in``,
    ``c_out``, ``k``, ``stride``, ``h_out`` and ``w_out``. The bias array
    ``b`` is not applied.

    Args:
        data: array indexed as ``data[0, channel, row, col]``. Axis 2 must
            hold at least ``(h_out - 1) * stride + k`` rows and axis 3 at
            least ``(w_out - 1) * stride + k`` columns.

    Returns:
        Array of shape ``(1, c_out, h_out, w_out)``; each entry is the sum
        over all input channels of the kernel-weighted window whose top-left
        corner is at ``(row * stride, col * stride)``.
    """
    # Rows (axis 2 of `data`) are paired with h_out and columns (axis 3) with
    # w_out. Pairing them the other way round only works for square images
    # and indexes out of bounds otherwise.
    output_map = np.zeros((1, c_out, h_out, w_out))
    for oi in range(h_out):            # output row
        for oj in range(w_out):        # output column
            for co in range(c_out):
                total = 0
                for ci in range(c_in):
                    kt = 0             # contribution of input channel ci
                    for ki in range(k):
                        for kj in range(k):
                            weight = w[co, ci, ki, kj]
                            y = ki+oi*stride
                            x = kj+oj*stride
                            val = data[0, ci, y, x]
                            kt += weight * val
                    total += kt
                output_map[0, co, oi, oj] = total
    return output_map

In [15]:
# Timing of software implementation
# Wall-clock time of one conv() pass over the padded image
start = time.time()
output = conv(data2)
end = time.time()
print("Software execution time: {} seconds.".format(end - start))

Software execution time: 42.50225281715393 seconds.


In [18]:
# Saves image for each output map
# output[0][n] is already a 2-D feature map, so it is used as-is instead of
# being reshaped to a hard-coded 128x128.
# Image.save does not create missing directories
os.makedirs("flower_outputs", exist_ok=True)
for n in range(filters):
    ar = output[0][n]
    im = Image.fromarray(ar*255)
    im = im.convert("L")
    im.show()
    im.save("flower_outputs/sw_out{}.jpg".format(n))

In [17]:
# Time comparison
# Ratio of software wall-clock time to hardware wall-clock time
speedup = (end - start) / (t_stop - t_start)
print("Hardware was {} times faster than software.".format(speedup))

Hardware was 31.547011882110436 times faster than software.
