# Copy Kernel
This kernel simply operates like a DMA. It copies a region of a given size within the global memory.

## Kernel Code
The OpenCL code of your kernel can be edited here. It will be automatically saved to the file named in the first line of the cell.

In [2]:
%%writefile OpenCL_code/copy.cl

#include "FGPUlib.c"

/*
 * copy_word: each work-item copies exactly one int from `in` to `out`.
 * The element handled is selected by the work-item's global id in
 * dimension 0, so the index space must not exceed the buffer length.
 */
__kernel void copy_word(__global int *in, __global int *out) {
    const int gid = get_global_id(0);
    out[gid] = in[gid];
}

Overwriting ../OpenCL_code/copy.cl


## Create Objects

In [3]:
import sys

# Extend the module search path so the FGPU package can be imported.
# The path is relative, so it is resolved against the current working directory.
sys.path.append("../FGPU/FGPU")
from FGPU import FGPU

# Single FGPU object that all following cells operate on.
fgpu = FGPU()

## Program Hardware

In [4]:
# Bitstream to load into the programmable logic.
# NOTE(review): the file name suggests 1 compute unit, floating point, 100 MHz - confirm.
BITSTREAM_PATH = "../bitstreams/PYNQ/1CU_fp_100MHz.bit"

# Register the bitstream with the FGPU object, then download it.
fgpu.set_bitFile(BITSTREAM_PATH)
fgpu.download_bitstream()

## Compile Kernel

#### `compile_kernel` parameters:
0 (true/false) -> output the logs (print the compiled code as an objdump disassembly)

1 (true/false) -> floating point support

In [6]:
# Positional arguments of compile_kernel, named here for readability:
#   first  -> print the compile log / disassembly
#   second -> floating point support
show_compile_log = True
floating_point_support = False

# Point the FGPU object at the kernel source written above, then compile it.
fgpu.set_kernel_file("OpenCL_code/copy.cl")
fgpu.compile_kernel(show_compile_log, floating_point_support)

Compiling /home/xilinx/jupyter_notebooks/FGPU/notebooks/OpenCL_code/copy.cl
Compiling succeeded!



/home/xilinx/tmp/FGPU_IPython/FGPU/code.bin:	file format ELF32-fgpu

Disassembly of section .text:
copy_word:
       0:	22 00 00 a8 	lp	r2, 1
       4:	03 00 00 a8 	lp	r3, 0
       8:	04 00 00 a0 	lid	r4, 0
       c:	05 00 00 a1 	wgoff	r5, 0
      10:	a1 10 00 10 	add	r1, r5, r4
      14:	23 0c 00 74 	lw	r3, r3[r1]
      18:	23 08 00 7c 	sw	r3, r2[r1]
      1c:	00 00 00 92 	ret



## Allocate Memory

In [7]:
from pynq import allocate

# Number of elements in the input and in the output array.
length = 256*1024

# The kernel arguments are `__global int *`, and an OpenCL `int` is always
# 32 bits wide. NumPy's 'int' is platform-dependent (64 bits on 64-bit Linux),
# which would make the buffers twice as large as what the kernel touches.
# Requesting 'int32' explicitly keeps the host buffers and the kernel in agreement.
src = allocate(shape=(length,), dtype='int32')
dst = allocate(shape=(length,), dtype='int32')

## Initialize Memory

In [8]:
import numpy as np

# Initialize the region to be copied with a recognisable pattern (src[i] == i)
# and clear the destination, so a successful copy is distinguishable from
# stale data. Vectorized assignment writes into the existing buffers in place
# and avoids a 256K-iteration Python loop.
src[:] = np.arange(length, dtype=src.dtype)
dst[:] = 0

## Configure Kernel

In [9]:
# Bind the allocated buffers to the kernel parameters by position:
# parameter 0 receives the source buffer, parameter 1 the destination buffer.
# (`set_paramerter` is the method name exactly as this notebook calls it.)
for param_index, buffer in enumerate((src, dst)):
    fgpu.set_paramerter(param_index, buffer)

# Set up the index space: one dimension covering `length` work-items,
# grouped into work-groups of 64, with an offset of 0.
fgpu.set_num_dimensions(1)
fgpu.set_size(length)
fgpu.set_work_group_size(64)
fgpu.set_offset(0)

## Execute on FGPU

In [10]:
# Download the kernel binary code and the settings to the hardware.
fgpu.download_kernel()

# Run the kernel and wait until it finishes; the call returns the execution time.
# NOTE(review): the value is presumably in seconds, given the 1e6 scaling to "us" - confirm.
execTime = fgpu.execute_kernel()
exec_time_us = int(execTime * 1_000_000)
print("Execution time =", exec_time_us, "us")

Execution time = 30265 us


## Execute with memcopy (in the ARM)
Here the same task is executed in Python on the ARM processor, for comparison with the FGPU execution time above.

In [11]:
import time

# Allocate a separate destination buffer. Its element type is taken from the
# source buffer so both always hold elements of the same width.
dst2 = allocate(shape=(length,), dtype=src.dtype)

# Copy element by element in plain Python and measure the elapsed time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# time is adjusted during the measurement.
start = time.perf_counter()
for i in range(0, length):
    dst2[i] = src[i]
end = time.perf_counter()
print ("Execution time =", int((end-start)*1000000), "us")

Execution time = 1005935 us
