In [1]:
# verify median implementation, compare to cpu reference
# load median gpu implementation
%run median_gpu.ipynb

import time

import numpy as np
from math import ceil 

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# CPU reference implementation: median function and generic stencil kernel
@numba.njit
def numba_median(array):
    """Return the median taken over all elements of ``array``.

    Thin compiled wrapper around ``np.median`` so that it can be handed to
    other jitted functions as a callable.
    """
    median_value = np.median(array)
    return median_value

@numba.njit
def cpu_smooth_kernel(input_data, output_data, fkt, stencil):
    """Apply ``fkt`` to the neighbourhood of every element of a 3-D array.

    Parameters
    ----------
    input_data : 3-D array indexed as [z, y, x]; read only.
    output_data : array indexed the same way; element [z, y, x] receives the
        value of ``fkt`` evaluated on the window centred at that element.
    fkt : callable taking the (possibly clipped) 3-D window and returning
        the value to store.
    stencil : indexable of three integers ``(dx, dy, dz)``, the window
        half-width along x, y and z respectively.

    Windows are clipped at the array borders, so border elements are computed
    from fewer samples than interior ones.
    """
    depth, height, width = input_data.shape
    half_x, half_y, half_z = stencil[0], stencil[1], stencil[2]

    for k in range(depth):
        # window bounds along z, clipped to the valid index range
        k_lo = max(k - half_z, 0)
        k_hi = min(k + half_z + 1, depth)
        for j in range(height):
            # window bounds along y
            j_lo = max(j - half_y, 0)
            j_hi = min(j + half_y + 1, height)
            for i in range(width):
                # window bounds along x
                i_lo = max(i - half_x, 0)
                i_hi = min(i + half_x + 1, width)

                window = input_data[k_lo:k_hi, j_lo:j_hi, i_lo:i_hi]
                output_data[k, j, i] = fkt(window)

In [3]:
# Generate dummy data for the test: a sine profile along the y axis
# (amplitude 0.5) plus Gaussian noise with standard deviation 1.
Nz, Ny, Nx = 10, 10, 10
# Fixed seed so that a fresh "Restart & Run All" produces the same input data
# and therefore the same CPU/GPU comparison result.
RNG_SEED = 0
np.random.seed(RNG_SEED)
real_data = (np.ones((Nz, Ny, Nx)) * np.sin(np.linspace(0, 20, Ny)[np.newaxis, :, np.newaxis])*0.5
             + np.random.normal(scale=1.0, size=(Nz, Ny, Nx)) )

In [4]:
# Result buffers, one per implementation, with the same shape as the input.
# Each one starts out as its own independent copy of the input data.
output_data_cpu_, output_data_gpu_ = (real_data.copy() for _ in range(2))

In [5]:
# Stencil half-widths shared by both implementations; the CPU kernel reads
# them as (dx, dy, dz).
stencil_t = np.array((1, 1, 1))
# GPU result (lauch_kernel is presumably provided by median_gpu.ipynb, which
# is pulled in with %run in the first cell -- confirm there).
lauch_kernel(real_data, output_data_gpu_, stencil_t)
# CPU reference, using the numba-compiled median as the window function.
cpu_smooth_kernel(real_data, output_data_cpu_, numba_median, stencil_t)

threads: (32, 16, 4)
blocks: (8, 8, 8)


In [26]:
# Compare the GPU result with the CPU reference, element by element.
# TODO: clarify the relative-error tolerance -- 1000 % only flags gross
# mismatches and is kept here as the existing placeholder.
REL_ERR = 1000      # tolerated relative error, in percent of the CPU value
MAX_REPORTED = 4    # number of mismatches printed in detail

diff = output_data_gpu_ - output_data_cpu_
# A CPU reference value of exactly zero makes the ratio inf or nan; silence
# the warnings here and classify those entries explicitly below.
with np.errstate(divide="ignore", invalid="ignore"):
    rel_err_percent = np.abs(diff / output_data_cpu_) * 100
# Identical values always match, including the 0/0 case.
rel_err_percent[diff == 0] = 0.0
# Negated "<=" instead of ">" so that a NaN error (e.g. NaN in the GPU
# output) is counted as a mismatch rather than silently passing the check.
mismatch = ~(rel_err_percent <= REL_ERR)
err_count = int(np.count_nonzero(mismatch))

# np.argwhere yields the indices in the same z, y, x order as nested loops.
for z_id, y_id, x_id in np.argwhere(mismatch)[:MAX_REPORTED]:
    print("Missmatch found: cpu(%d,%d,%d) = %s, gpu= %s, percent error = %s" %(z_id,y_id,x_id,output_data_cpu_[z_id, y_id, x_id],output_data_gpu_[z_id, y_id, x_id], rel_err_percent[z_id, y_id, x_id]))
print("Total error count: %d" % (err_count))
print("With tolerated relative error (in percent) = %d" % REL_ERR)

Missmatch found: cpu(1,1,6) = 0.0009539857217940365, gpu= 0.08178853243589401, percent error = 8473.34974386042
Missmatch found: cpu(1,3,7) = 0.00435217388836498, gpu= 0.06398070603609085, percent error = 1370.0861610133657
Missmatch found: cpu(1,4,7) = 0.00435217388836498, gpu= 0.05166138336062431, percent error = 1087.024799232744
Missmatch found: cpu(1,8,8) = -0.0050455016417864235, gpu= 0.12729302048683167, percent error = 2622.9011805803702
Total error count: 25
With tolerated relative error (in percent) = 1000
