# Handwritten digit recognition


This notebook runs a neural network made of two fully connected layers on the FGPU to classify a handwritten digit.

### Import input image

In [1]:
from pynq import allocate
import struct
import h5py
import numpy as np
from decimal import *

# Buffer holding the input image as 784 float32 values.
src_float = allocate(shape=(784,), dtype='float32')

# Only the first line of the file is read; it is expected to hold the values
# separated by ", ". The context manager guarantees the file handle is closed
# (the original left it open).
with open("params/image_7.h", 'r') as image_file:
    pixel_tokens = image_file.readline().split(', ')

# Each token is a numeric string; NumPy converts it to float32 on assignment.
for pixel_index, item in enumerate(pixel_tokens):
    src_float[pixel_index] = item

### Load network parameters from h5 file

In [4]:
# Precision of the `decimal` context (only influences Decimal arithmetic).
getcontext().prec = 5

# Open the parameter file read-only. It is intentionally left open: the
# datasets fetched below are read from it by the following cell.
f = h5py.File('params/2fc.h5', 'r')
list(f.keys())

# --- Fully connected layer 1: group 'dense_3' nested in group 'dense_3' ---
dset1 = f['dense_3']
fc_1 = dset1['dense_3']
fc_1_bias, fc_1_weights = fc_1['bias:0'], fc_1['kernel:0']
# Spot-check the first bias and the first weight.
fc_1_bias[0]
fc_1_weights[0, 0]

# --- Fully connected layer 2: group 'dense_4' nested in group 'dense_4' ---
dset2 = f['dense_4']
fc_2 = dset2['dense_4']
fc_2_bias, fc_2_weights = fc_2['bias:0'], fc_2['kernel:0']
fc_2_bias[0]
# Last expression of the cell: its value is displayed as the cell output.
fc_2_weights[0, 0]

0.26624438

### Fit parameters into python variables

In [5]:
weights_1_float = allocate(shape=(78400,), dtype='float32')
biases_1_float = allocate(shape=(100,), dtype='float32')
weights_2_float = allocate(shape=(1000,), dtype='float32')
biases_2_float = allocate(shape=(10,), dtype='float32')

# The HDF5 kernels are indexed [input, output], while the OpenCL kernel reads
# weights[output_node * input_size + input]. Transposing and flattening in
# C order therefore yields flat[output * n_inputs + input] = kernel[input][output],
# exactly what the original nested loops produced.
# Each dataset is read from the file once (one sliced read) instead of once
# per element.

# FC1: 784 inputs -> 100 outputs
weights_1_float[:] = np.asarray(fc_1_weights[:784, :100], dtype=np.float32).T.reshape(-1)
biases_1_float[:] = fc_1_bias[:100]

# FC2: 100 inputs -> 10 outputs
weights_2_float[:] = np.asarray(fc_2_weights[:100, :10], dtype=np.float32).T.reshape(-1)
biases_2_float[:] = fc_2_bias[:10]

## Kernel Code
The OpenCL code of your kernel can be edited here. It will be automatically saved to the file named in the first line of the cell.

In [7]:
%%writefile OpenCL_code/FC.cl

#include "FGPUlib.c"

/*
 * Fully connected layer: each work-item computes one output node as the dot
 * product of the input vector with that node's row of weights, plus its bias.
 *
 *   in_src     - input vector, input_size elements
 *   output     - result vector, one element per work-item
 *   weights    - flattened weights, row `n` starts at n * input_size
 *   biases     - one bias per output node
 *   input_size - number of elements in in_src / in each weight row
 */
__kernel void copy_word(const __global float *in_src, __global float *output, __global float *weights, __global float *biases, int input_size) {
    const int node = get_global_id(0);
    const int row_start = node * input_size;
    float acc = 0.0;
    for (int k = 0; k < input_size; k++) {
        acc += in_src[k] * weights[row_start + k];
    }
    output[node] = acc + biases[node];
}

Writing ../OpenCL_code/FC.cl


## Import FGPU package

In [8]:
import sys
# Make the FGPU Python package importable; this must run before the import below.
sys.path.append("../FGPU/FGPU")
from FGPU import FGPU
# Driver object used by the following cells to program, configure and run the FGPU.
fgpu= FGPU()

## Program Hardware

### Download the bitstream to the board

In [11]:
# Select the bitstream file, then download it to the board.
fgpu.set_bitFile("../bitstreams/PYNQ/1CU_fp_100MHz.bit")
fgpu.download_bitstream()

## Compile Kernel

#### `compile_kernel` parameters:
- argument 0 (True/False): output the logs (print the compiled code as an objdump)

- argument 1 (True/False): enable floating point support

In [13]:
# Point the driver at the kernel source written by the %%writefile cell.
fgpu.set_kernel_file("OpenCL_code/FC.cl")
# First argument: print the compile log / objdump; second: floating point support.
fgpu.compile_kernel(True, True)

Compiling /home/xilinx/jupyter_notebooks/FGPU/notebooks/OpenCL_code/FC.cl
Compiling succeeded!



/home/xilinx/tmp/FGPU_IPython/FGPU/code.bin:	file format ELF32-fgpu

Disassembly of section .text:
copy_word:
       0:	03 00 00 19 	li	r3, 0
       4:	03 00 00 1d 	lui	r3, 0
       8:	01 04 00 11 	addi	r1, r0, 1
       c:	85 00 00 a8 	lp	r5, 4
      10:	a6 04 00 b2 	slt	r6, r5, r1
      14:	64 00 00 a8 	lp	r4, 3
      18:	22 00 00 a8 	lp	r2, 1
      1c:	07 00 00 a0 	lid	r7, 0
      20:	08 00 00 a1 	wgoff	r8, 0
      24:	01 1d 00 10 	add	r1, r8, r7
      28:	06 40 00 63 	bne	r6, r0, 16
      2c:	43 00 00 a8 	lp	r3, 2
      30:	06 00 00 a8 	lp	r6, 0
      34:	a7 04 00 51 	mul	r7, r5, r1
      38:	e7 08 00 21 	slli	r7, r7, 2
      3c:	67 1c 00 10 	add	r7, r3, r7
      40:	03 00 00 19 	li	r3, 0
      44:	03 00 00 1d 	lui	r3, 0

LBB0_3:
      48:	08 1c 00 74 	lw	r8, r7[r0]
      4c:	09 18 00 74 	lw	r9, r6[r0]
      50:	28 21 00 c1 	fmul	r8, r9, r8
      54:	63 20 00 c0 	fadd	r3, r3, r8
      5

## Allocate Memory

#### Reinterpret the float parameters as integers with the same bit pattern (so that they are interpreted correctly when sent via MMIO)

In [15]:
def float32_bits_as_int32(values):
    """Return the IEEE-754 bit patterns of float32 ``values`` as signed 32-bit ints.

    This is a reinterpretation of the bytes, not a numeric conversion. It gives
    the same value as ``struct.unpack('<I', struct.pack('<f', x))`` followed by
    a wrap to int32, but for the whole array at once.

    Args:
        values: array-like of float32 numbers.

    Returns:
        A NumPy array of little-endian int32 with the same shape as ``values``.
    """
    return np.ascontiguousarray(values, dtype='<f4').view('<i4')


# Integer buffers handed to the FGPU.
# NOTE(review): dtype 'int' is platform dependent; the rest of the notebook
# (e.g. struct.unpack('f', dst[i])) assumes 32-bit elements - confirm on target.
src = allocate(shape=(784,), dtype='int')
out_fc1 = allocate(shape=(100,), dtype='int')
dst = allocate(shape=(10,), dtype='int')

weights_1 = allocate(shape=(78400,), dtype='int')
biases_1 = allocate(shape=(100,), dtype='int')
weights_2 = allocate(shape=(1000,), dtype='int')
biases_2 = allocate(shape=(10,), dtype='int')
input_size_1 = 784
input_size_2 = 100

# Copy the bit patterns in bulk. This replaces the per-element struct
# pack/unpack loops and the int64 scratch buffers they needed. The input image
# is now handled like the parameters (signed reinterpretation), so negative
# input values no longer overflow a 32-bit element.
src[:] = float32_bits_as_int32(src_float)
biases_1[:] = float32_bits_as_int32(biases_1_float)
biases_2[:] = float32_bits_as_int32(biases_2_float)
weights_1[:] = float32_bits_as_int32(weights_1_float)
weights_2[:] = float32_bits_as_int32(weights_2_float)

## Configure Kernel

### Run 1st FC

#### Fill in host code parameters

In [16]:
# Bind the kernel arguments in the order of the copy_word signature:
# (in_src, output, weights, biases, input_size).
fc1_args = (src, out_fc1, weights_1, biases_1, input_size_1)
for position, argument in enumerate(fc1_args):
    fgpu.set_paramerter(position, argument)

# Index space: one dimension, 100 work-items (one per output node of the
# first layer), work-group size 1, no offset.
fgpu.set_num_dimensions(1)
fgpu.set_size(100)
fgpu.set_work_group_size(1)
fgpu.set_offset(0)

#### Execute on FGPU

In [20]:
# Download the compiled kernel, then run it; execute_kernel() blocks until the
# FGPU has finished and returns the elapsed time (reported below in microseconds).
fgpu.download_kernel()
execTime = fgpu.execute_kernel()
elapsed_us = int(execTime*1000000)
print ("Execution time =", elapsed_us, "us")

Execution time = 175737 us


ReLU activation applied to the output of the 1st FC layer

In [21]:
# ReLU performed directly on the raw integer words returned by the FGPU:
# assuming 32-bit elements, a negative integer means the float's sign bit is
# set, and the integer 0 is the bit pattern of 0.0.
negative_nodes = [node for node in range(100) if out_fc1[node] < 0]
for node in negative_nodes:
    out_fc1[node] = 0

### Run 2nd FC

#### Fill in host code parameters

In [23]:
# Bind the kernel arguments in the order of the copy_word signature:
# (in_src, output, weights, biases, input_size). The first layer's output
# is the input of the second layer.
fc2_args = (out_fc1, dst, weights_2, biases_2, input_size_2)
for position, argument in enumerate(fc2_args):
    fgpu.set_paramerter(position, argument)

# Index space: one dimension, 10 work-items (one per output node of the
# second layer), work-group size 1, no offset.
fgpu.set_num_dimensions(1)
fgpu.set_size(10)
fgpu.set_work_group_size(1)
fgpu.set_offset(0)

#### Execute on FGPU

In [24]:
# Download the compiled kernel, then run it; execute_kernel() blocks until the
# FGPU has finished and returns the elapsed time (reported below in microseconds).
fgpu.download_kernel()
execTime = fgpu.execute_kernel()
elapsed_us = int(execTime*1000000)
print ("Execution time =", elapsed_us, "us")

Execution time = 2804 us


#### Check results and do softmax

In [25]:
# Host-side array that receives the ten output values as Python floats.
res_arr = allocate(shape=(10,), dtype='float')

# Each element of dst carries the bit pattern of a float; reinterpret its
# bytes as a native-endian float, show it and store it.
for class_id in range(10):
    (logit,) = struct.unpack('f', dst[class_id])
    print(logit)
    res_arr[class_id] = logit
    

-1.8715659379959106
-11.567303657531738
-0.529619038105011
1.7135339975357056
-12.922592163085938
-4.173733234405518
-17.23069190979004
9.489047050476074
-4.112046718597412
0.9857190847396851


In [26]:
import math

# Softmax over the ten network outputs, written back into res_arr.
# Subtracting the largest logit first leaves the result mathematically
# unchanged but keeps math.exp() from overflowing on large values.
# The accumulator is no longer called `sum`, so the builtin sum() stays
# usable in later cells.
max_logit = max(res_arr[i] for i in range(10))
exp_values = [math.exp(res_arr[i] - max_logit) for i in range(10)]
exp_total = math.fsum(exp_values)
for i in range(10):
    res_arr[i] = exp_values[i] / exp_total

### Classifier result:

In [27]:
# Arg-max over the ten class scores; on ties the lowest digit wins, because
# max() keeps the first maximal element it encounters.
index = max(range(10), key=lambda digit: res_arr[digit])
max_val = res_arr[index]
print("The digit was classified as: ", index)

The digit was classified as:  7
