# Modelová úloha - vzdálenosti mezi body v 3D
Zadání: pro $N$ bodů v třírozměrném prostoru spočítejte vzájemné vzdálenosti $d$, kde vzdálenost dvou bodů $x, y$ je definovaná jako $d(x, y) = \sqrt{\sum_{i=1}^{3} \left(x_i - y_i\right)^2}$. Výsledkem je tedy (symetrická) matice $N \times N$.

In [None]:
import numpy as np
# Number of random test points used by the single-size timing cells.
points_count = 1000
# One row per point, three columns (the 3D coordinates), drawn by np.random.rand.
points_np = np.random.rand(points_count, 3)

## Numba - sériová verze

Viz minulá hodina.

In [None]:
import math
import numba

@numba.jit(nopython=True)
def dist_numba(points):
    """Return the full matrix of pairwise Euclidean distances (serial version).

    Only the first three columns of ``points`` are read, i.e. every row is
    treated as one point in 3D.  Each ordered pair (row, col) is evaluated on
    its own, so the symmetric entry is recomputed rather than copied.

    Args:
        points: 2D array with one point per row.

    Returns:
        Array of shape (count, count), where ``count`` is ``points.shape[0]``
        and element [row, col] is the distance between those two points.
    """
    count = points.shape[0]
    result = np.zeros((count, count))
    for row in range(count):
        for col in range(count):
            # Accumulate the squared coordinate differences axis by axis.
            squared = 0.0
            for axis in range(3):
                delta = points[row, axis] - points[col, axis]
                squared += delta * delta
            result[row, col] = math.sqrt(squared)

    return result

In [None]:
%time _ = dist_numba(points_np)

In [None]:
%timeit _ = dist_numba(points_np)

## Cython - sériová verze
- použijeme Jupyter cell magic `%%cython`
- `--compile-args=-O3` - optimalizace při kompilaci
- `--compile-args=-w` - potlačení warningů (deprecated API)

In [None]:
%load_ext Cython

In [None]:
%%cython --compile-args=-O3 --compile-args=-w

import numpy as np  # pythoní numpy
cimport numpy as cnp  # cythoní numpy pro statické typování
from libc.math cimport sqrt
cimport cython

ctypedef cnp.float64_t DTYPE_t  # alias pro typ float64

@cython.boundscheck(False)  # bounds checking off -> an out-of-range index is a segfault, not an exception
@cython.wraparound(False)  # negative-index wraparound off -> a negative index is a segfault as well
def dist_cython(cnp.ndarray[DTYPE_t, ndim=2] points):
    """Return the full matrix of pairwise Euclidean distances (serial version).

    Only the first three columns of ``points`` are read, i.e. every row is
    treated as one point in 3D.  Each ordered pair (row, col) is evaluated on
    its own, so the symmetric entry is recomputed rather than copied.

    Args:
        points: 2D float64 array with one point per row.

    Returns:
        float64 array of shape (count, count), where ``count`` is
        ``points.shape[0]`` and element [row, col] is the distance between
        those two points.
    """
    cdef int count = points.shape[0]
    cdef int row, col, axis  # every loop variable has to be declared with a C type
    cdef DTYPE_t squared, delta  # scratch values
    cdef cnp.ndarray[DTYPE_t, ndim=2] result = np.zeros((count, count), dtype=np.float64)

    for row in range(count):
        for col in range(count):
            # Accumulate the squared coordinate differences axis by axis.
            squared = 0
            for axis in range(3):
                delta = points[row, axis] - points[col, axis]
                squared += delta * delta
            result[row, col] = sqrt(squared)

    return result

In [None]:
%timeit _ = dist_cython(points_np)

In [None]:
# Sanity check: the serial Numba and Cython versions are run on the same input.
res1 = dist_numba(points_np)
res2 = dist_cython(points_np)
# Bare expression on the last line: its value is the cell's displayed result.
np.allclose(res1, res2)

## Porovnání nejlepších variant

In [None]:
import time
import matplotlib.pyplot as plt

# Problem sizes 2**6 ... 2**14.  The largest case makes each function allocate
# a 16384 x 16384 float64 result, i.e. 2 GiB per call.
velikosti = [2**i for i in range(6, 15)]
time_numba = []
time_cython = []

for n in velikosti:
    # Rebinds the notebook-level points_np on every iteration.
    points_np = np.random.rand(n, 3)

    # perf_counter() is the high-resolution, monotonic clock meant for
    # measuring short durations; time.time() can return 0.0 differences for
    # the small sizes, and zeros cannot be drawn on a logarithmic axis.
    # NOTE(review): assumes dist_numba was already called once in an earlier
    # cell; otherwise its first timing also contains the JIT compilation.
    start = time.perf_counter()
    _ = dist_numba(points_np)
    time_numba.append(time.perf_counter() - start)

    start = time.perf_counter()
    _ = dist_cython(points_np)
    time_cython.append(time.perf_counter() - start)

    print(n, time_numba[-1], time_cython[-1])

# Log-log plot of run time against the number of points.
plt.loglog(velikosti, time_numba, label="numba")
plt.loglog(velikosti, time_cython, label="cython")
plt.xlabel("Počet bodů")
plt.ylabel("Čas [s]")
plt.grid()
plt.title("Porovnání rychlosti numba vs cython")
plt.legend()
# Same ending as the later benchmark cell: render the figure explicitly
# instead of leaving the Legend object as the cell's last expression.
plt.show()

# Paralelní implementace:

In [None]:
# Rebind the test data to a fixed size for the parallel timing cells; the
# earlier size-sweep loop also assigns points_np.
points_count = 1000
# One row per point, three columns (the 3D coordinates), drawn by np.random.rand.
points_np = np.random.rand(points_count, 3)

## Numba - paralelní verze

- stačí nám tag `parallel=True` a `prange` místo `range`

In [None]:
@numba.jit(nopython=True, parallel=True)
def dist_numba_parallel(points):
    """Return the full matrix of pairwise Euclidean distances (parallel rows).

    Only the first three columns of ``points`` are read, i.e. every row is
    treated as one point in 3D.  Each ordered pair (i, j) is evaluated on its
    own; the redundant work keeps the row iterations independent of each
    other.

    Args:
        points: 2D array with one point per row.

    Returns:
        Array of shape (n, n), where ``n`` is ``points.shape[0]`` and element
        [i, j] is the distance between point i and point j.
    """
    n = points.shape[0]
    distances_all = np.zeros((n, n))
    # Rows are distributed over the threads; iteration i writes only row i.
    for i in numba.prange(n):
        # Plain range on purpose: Numba parallelizes only the outermost
        # prange and runs any prange nested inside it as an ordinary loop,
        # so a second prange here would only suggest parallelism that is
        # not there.
        for j in range(n):
            tmp_sum = 0.0
            for k in range(3):
                diff = points[i, k] - points[j, k]
                tmp_sum += diff * diff
            distances_all[i, j] = math.sqrt(tmp_sum)

    return distances_all

In [None]:
%time _ = dist_numba_parallel(points_np)

In [None]:
%timeit _ = dist_numba_parallel(points_np)

## Cython - paralelní verze

Verze s redundantním výpočtem vzdálenosti mezi body – bude se lépe paralelizovat.

- použijeme `prange()`
- přidáme kompilační argumenty `--compile-args=-fopenmp` pro použití OpenMP, a `--link-args=-fopenmp` pro linkování s OpenMP knihovnou

In [None]:
%%cython --compile-args=-O3  --compile-args=-w --compile-args=-fopenmp --link-args=-fopenmp

import numpy as np
cimport numpy as cnp
from libc.math cimport sqrt
cimport cython
from cython.parallel import prange # toto je zde nové

ctypedef cnp.float64_t DTYPE_t

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef dist_cython_parallel(cnp.ndarray[DTYPE_t, ndim=2] points, int n_threads):
    """Return the full matrix of pairwise Euclidean distances (OpenMP rows).

    Only the first three columns of ``points`` are read, i.e. every row is
    treated as one point in 3D.  Each ordered pair (i, j) is evaluated on its
    own; the redundant work keeps the row iterations independent of each
    other.

    Args:
        points: 2D float64 array with one point per row.
        n_threads: Value handed to ``prange`` as ``num_threads``.

    Returns:
        float64 array of shape (n, n), where ``n`` is ``points.shape[0]`` and
        element [i, j] is the distance between point i and point j.
    """
    cdef int n = points.shape[0]
    cdef cnp.ndarray[DTYPE_t, ndim=2] distances_all = np.zeros((n, n), dtype=np.float64)
    cdef int i, j, k
    cdef DTYPE_t tmp_sum, rozdil

    # Only the outer loop is a prange: rows are distributed over the threads
    # and each thread fills its rows with ordinary loops, so no nested OpenMP
    # parallel region is requested for every row.
    for i in prange(n, nogil=True, num_threads=n_threads):
        for j in range(n):
            tmp_sum = 0  # restart the accumulator for each pair of points
            for k in range(3):
                rozdil = points[i, k] - points[j, k]
                # Deliberately not "+=": inside prange Cython turns a variable
                # updated with an in-place operator into a reduction variable,
                # which is not what a per-pair accumulator should be.
                tmp_sum = tmp_sum + rozdil * rozdil
            distances_all[i, j] = sqrt(tmp_sum)

    return distances_all

In [None]:
%timeit _ = dist_cython_parallel(points_np,8)

In [None]:
# Sanity check: the parallel Numba and Cython versions are run on the same
# input (the Cython one with 16 threads).
res1 = dist_numba_parallel(points_np)
res2 = dist_cython_parallel(points_np, 16)
# Bare expression on the last line: its value is the cell's displayed result.
np.allclose(res1, res2)

# Benchmarking

In [None]:
import time
import matplotlib.pyplot as plt

# Problem sizes to benchmark: 2**10 ... 2**13 points.
pocet_bodu = [1 << exponent for exponent in range(10, 14)]

# Thread counts tried by the parallel variants.
all_num_threads = [4, 8, 16]

# Timings of the serial variants, one entry per problem size.
time_cython_serial, time_numba_serial = [], []

# Timings of the parallel variants: one independent inner list per thread
# count, each collecting one entry per problem size.
time_cython_parallel = [list() for _ in all_num_threads]
time_numba_parallel = [list() for _ in all_num_threads]

# Number of timed repetitions per measurement.
n_loops = 10

def measure_multi(n, func, points=None):
    """Return the best (smallest) elapsed time of ``n`` calls to ``func``.

    Args:
        n: Number of timed repetitions; must be at least 1.
        func: Callable that takes the point array as its only argument.
        points: Input handed to ``func``.  When omitted, the notebook-level
            ``points_np`` is used, so existing two-argument calls keep
            working unchanged.

    Returns:
        The minimum elapsed time in seconds over the ``n`` runs.

    Raises:
        ValueError: If ``n`` is smaller than 1 (there would be nothing to
            take the minimum of).
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")
    if points is None:
        points = points_np  # notebook global rebound by the benchmark loop
    durations = []
    for _ in range(n):
        # perf_counter() is the high-resolution, monotonic clock intended
        # for measuring short durations.
        start = time.perf_counter()
        func(points)
        durations.append(time.perf_counter() - start)
    return min(durations)

for n in pocet_bodu:
    # measure_multi picks up the notebook-level points_np, so it is rebound
    # here for every problem size.
    points_np = np.random.rand(n, 3)

    time_cython_serial.append(measure_multi(n_loops, dist_cython))
    time_numba_serial.append(measure_multi(n_loops, dist_numba))

    for idx, n_threads in enumerate(all_num_threads):
        # The thread count is bound as a default argument so the wrapper keeps
        # this iteration's value even if it is called after the loop moves on.
        def cython_par_test_wrap(points, n_threads=n_threads):
            return dist_cython_parallel(points, n_threads)

        time_cython_parallel[idx].append(measure_multi(n_loops, cython_par_test_wrap))
        # NOTE(review): numba.set_num_threads is documented to reject values
        # above NUMBA_NUM_THREADS -- confirm the machine offers at least
        # max(all_num_threads) threads.  The setting is not restored afterwards.
        numba.set_num_threads(n_threads)
        time_numba_parallel[idx].append(measure_multi(n_loops, dist_numba_parallel))

    # Best Cython parallel time for this size across all thread counts.
    min_time = min(times[-1] for times in time_cython_parallel)
    # Speed-up of the best Cython parallel run over the Cython serial run,
    # i.e. the two numbers printed next to it (previously the Numba serial
    # time was divided here, which did not match the message).
    ratio = time_cython_serial[-1] / min_time
    print(f"velikost {n}, čas sekvenční {time_cython_serial[-1]}, nejlepší paralelní {min_time}, poměr {ratio}")

# Log-log plot of every measured variant; the Numba curves are dotted.
plt.loglog(pocet_bodu, time_cython_serial, label='cython serial')
plt.loglog(pocet_bodu, time_numba_serial, label='numba serial', linestyle=':')
for n_threads, cython_times, numba_times in zip(
    all_num_threads, time_cython_parallel, time_numba_parallel
):
    plt.loglog(pocet_bodu, cython_times, label=f'cython parallel{n_threads}')
    plt.loglog(pocet_bodu, numba_times, label=f'numba parallel{n_threads}', linestyle=':')

# Axis labels, title and decorations, then render the figure.
plt.title("Porovnání rychlosti numba vs cython")
plt.xlabel("Počet bodů")
plt.ylabel("Čas [s]")
plt.grid()
plt.legend()
plt.show()