# 1. Modelová úloha: vzdálenosti mezi body ve 3D

Pro `N` bodů ve 3D chceme spočítat matici všech vzájemných vzdáleností:

$$d(x, y) = \sqrt{\sum_{i=1}^{3}(x_i - y_i)^2}.$$

Výstupem je symetrická matice `N x N`.

In [None]:
import numpy as np

# Sample input for the timing cells below: `points_count` rows with three
# random coordinates each, as produced by np.random.rand.
points_count = 1_000
points_np = np.random.rand(
    points_count,
    3,
)

## 1.1 Numba: sekvenční verze

Nejdřív použijeme přímou implementaci trojité smyčky.

In [None]:
import math
import numba

@numba.jit(nopython=True)
def dist_numba(points):
    """Return the matrix of pairwise Euclidean distances between 3D points.

    Sequential triple-loop implementation compiled by Numba in nopython mode.

    Args:
        points: 2-D array with one point per row. The first three columns
            are used as coordinates; any further columns are ignored.

    Returns:
        Array of shape ``(n, n)`` whose entry ``[i, j]`` is the distance
        between ``points[i]`` and ``points[j]``.

    Raises:
        ValueError: If ``points`` has fewer than three columns.
    """
    # Numba does not bounds-check array indexing by default, so without this
    # guard a narrower input would be read past the end of each row instead
    # of raising an error.
    if points.shape[1] < 3:
        raise ValueError("points must have at least 3 columns")

    n = points.shape[0]
    distances = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            sum_sq = 0.0
            for k in range(3):
                diff = points[i, k] - points[j, k]
                sum_sq += diff * diff
            distances[i, j] = math.sqrt(sum_sq)

    return distances

In [None]:
# @numba.jit was given no signature, so Numba compiles on the first call;
# if this is that first call, the reported time includes JIT compilation.
%time _ = dist_numba(points_np)

In [None]:
%timeit _ = dist_numba(points_np)

## 1.2 Cython: sekvenční verze

- použijeme Jupyter magic `%%cython`
- `--compile-args=-O3` zapne optimalizaci při kompilaci
- `--compile-args=-w` potlačí varování kompilátoru

In [None]:
%load_ext cython

In [None]:
%%cython --compile-args=-O3 --compile-args=-w

import numpy as np
cimport numpy as cnp
from libc.math cimport sqrt
cimport cython

# Element type of the input and output arrays (C double).
ctypedef cnp.float64_t DTYPE_t

@cython.boundscheck(False)
@cython.wraparound(False)
def dist_cython(cnp.ndarray[DTYPE_t, ndim=2] points):
    """Return the matrix of pairwise Euclidean distances between 3D points.

    Sequential triple-loop implementation with typed buffers.

    Args:
        points: 2-D float64 array with one point per row. The first three
            columns are used as coordinates; any further columns are ignored.

    Returns:
        float64 array of shape ``(n, n)`` whose entry ``[i, j]`` is the
        distance between ``points[i]`` and ``points[j]``.

    Raises:
        ValueError: If ``points`` has fewer than three columns.
    """
    cdef int n = points.shape[0]
    cdef int i, j, k
    cdef DTYPE_t sum_sq, diff
    cdef cnp.ndarray[DTYPE_t, ndim=2] distances

    # Bounds checking is switched off for this function, so a narrower input
    # would be read out of bounds in the k-loop instead of raising.
    if points.shape[1] < 3:
        raise ValueError("points must have at least 3 columns")

    distances = np.zeros((n, n), dtype=np.float64)

    for i in range(n):
        for j in range(n):
            sum_sq = 0.0
            for k in range(3):
                diff = points[i, k] - points[j, k]
                sum_sq += diff * diff
            distances[i, j] = sqrt(sum_sq)

    return distances

In [None]:
%timeit _ = dist_cython(points_np)

In [None]:
# Sanity check: the Numba and Cython implementations must agree.
res1, res2 = dist_numba(points_np), dist_cython(points_np)
np.allclose(res1, res2)

## 1.3 Porovnání sekvenčních variant

In [None]:
import matplotlib.pyplot as plt
import time

# Problem sizes: powers of two from 2**6 to 2**11 points.
velikosti = [2**i for i in range(6, 12)]
time_numba = []
time_cython = []

for n in velikosti:
    # NOTE: this rebinds the notebook-level `points_np`.
    points_np = np.random.rand(n, 3)

    # perf_counter() is a monotonic, high-resolution clock. time.time() can be
    # too coarse for the smallest sizes and yield 0.0, which cannot be drawn
    # on a log-log plot.
    start = time.perf_counter()
    _ = dist_numba(points_np)
    time_numba.append(time.perf_counter() - start)

    start = time.perf_counter()
    _ = dist_cython(points_np)
    time_cython.append(time.perf_counter() - start)

    print(n, time_numba[-1], time_cython[-1])

plt.loglog(velikosti, time_numba, label="numba")
plt.loglog(velikosti, time_cython, label="cython")
plt.xlabel("Počet bodů")
plt.ylabel("Čas [s]")
plt.grid()
plt.title("Porovnání rychlosti: numba vs cython")
plt.legend()
plt.show()

# 2. Paralelní implementace

In [None]:
# Reset the sample input for the parallel section: `points_count` rows with
# three random coordinates each, as produced by np.random.rand.
points_count = 1_000
points_np = np.random.rand(
    points_count,
    3,
)

## 2.1 Numba: paralelní verze

Použijeme `parallel=True` a ve vnější smyčce nahradíme `range` za `prange`.

In [None]:
@numba.jit(nopython=True, parallel=True)
def dist_numba_parallel(points):
    """Return the matrix of pairwise Euclidean distances between 3D points.

    Same computation as the sequential triple loop, but the outer loop over
    rows is a ``numba.prange`` so Numba may distribute rows across threads.

    Args:
        points: 2-D array with one point per row. The first three columns
            are used as coordinates; any further columns are ignored.

    Returns:
        Array of shape ``(n, n)`` whose entry ``[i, j]`` is the distance
        between ``points[i]`` and ``points[j]``.

    Raises:
        ValueError: If ``points`` has fewer than three columns.
    """
    # Numba does not bounds-check array indexing by default, so without this
    # guard a narrower input would be read past the end of each row instead
    # of raising an error.
    if points.shape[1] < 3:
        raise ValueError("points must have at least 3 columns")

    n = points.shape[0]
    distances = np.zeros((n, n))
    # Each iteration of the parallel loop writes only row i of `distances`.
    for i in numba.prange(n):
        for j in range(n):
            sum_sq = 0.0
            for k in range(3):
                diff = points[i, k] - points[j, k]
                sum_sq += diff * diff
            distances[i, j] = math.sqrt(sum_sq)

    return distances

In [None]:
# @numba.jit was given no signature, so Numba compiles on the first call;
# if this is that first call, the reported time includes JIT compilation.
%time _ = dist_numba_parallel(points_np)

In [None]:
%timeit _ = dist_numba_parallel(points_np)

## 2.2 Cython: paralelní verze

Pro jednodušší paralelizaci budeme počítat celý prostor `N x N` (bez využití symetrie).

- použijeme `prange()`
- přidáme `--compile-args=-fopenmp` a `--link-args=-fopenmp` pro OpenMP

In [None]:
%%cython --compile-args=-O3 --compile-args=-w --compile-args=-fopenmp --link-args=-fopenmp

import numpy as np
cimport numpy as cnp
from libc.math cimport sqrt
cimport cython
from cython.parallel import prange

# Element type of the input and output arrays (C double).
ctypedef cnp.float64_t DTYPE_t

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef dist_cython_parallel(cnp.ndarray[DTYPE_t, ndim=2] points, int n_threads):
    """Return the matrix of pairwise Euclidean distances between 3D points.

    The outer loop over rows runs in a ``prange`` with the GIL released,
    using ``n_threads`` threads.

    Args:
        points: 2-D float64 array with one point per row. The first three
            columns are used as coordinates; any further columns are ignored.
        n_threads: Number of threads requested for the parallel loop; must
            be at least 1.

    Returns:
        float64 array of shape ``(n, n)`` whose entry ``[i, j]`` is the
        distance between ``points[i]`` and ``points[j]``.

    Raises:
        ValueError: If ``points`` has fewer than three columns or
            ``n_threads`` is less than 1.
    """
    cdef int n = points.shape[0]
    cdef int i, j, k
    cdef DTYPE_t sum_sq, diff
    cdef cnp.ndarray[DTYPE_t, ndim=2] distances

    # Bounds checking is switched off for this function, so a narrower input
    # would be read out of bounds in the k-loop instead of raising.
    if points.shape[1] < 3:
        raise ValueError("points must have at least 3 columns")
    # The value is handed to the parallel runtime as the thread count, so
    # reject non-positive requests up front.
    if n_threads < 1:
        raise ValueError("n_threads must be a positive integer")

    distances = np.zeros((n, n), dtype=np.float64)

    # Each iteration of the parallel loop writes only row i of `distances`.
    for i in prange(n, nogil=True, num_threads=n_threads):
        for j in range(n):
            sum_sq = 0.0
            for k in range(3):
                diff = points[i, k] - points[j, k]
                # Deliberately not `+=`: per Cython's prange rules an in-place
                # operator would mark sum_sq as a reduction variable, which
                # conflicts with resetting and reading it per (i, j) pair.
                sum_sq = sum_sq + diff * diff
            distances[i, j] = sqrt(sum_sq)

    return distances

In [None]:
%timeit _ = dist_cython_parallel(points_np, 8)

In [None]:
# Sanity check: the parallel Numba and Cython implementations must agree.
res1, res2 = dist_numba_parallel(points_np), dist_cython_parallel(points_np, 8)
np.allclose(res1, res2)

# 3. Benchmark

In [None]:
import os
import time
import matplotlib.pyplot as plt

# Problem sizes: powers of two from 2**8 to 2**11 points.
pocet_bodu = [2**i for i in range(8, 12)]
time_cython_serial = []
time_numba_serial = []

# numba.set_num_threads() rejects values above numba.config.NUMBA_NUM_THREADS,
# which can be lower than os.cpu_count() (restricted CPU affinity or the
# NUMBA_NUM_THREADS environment variable), so cap the candidates by both.
max_threads = min(os.cpu_count() or 1, numba.config.NUMBA_NUM_THREADS)
all_num_threads = [n for n in (2, 4, 8) if n <= max_threads]
if not all_num_threads:
    # Fewer than two usable threads: still run the "parallel" variants once.
    all_num_threads = [1]

# One list of timings per tested thread count, indexed like all_num_threads.
time_cython_parallel = [[] for _ in all_num_threads]
time_numba_parallel = [[] for _ in all_num_threads]

# Repetitions per measurement.
n_loops = 3


def measure_multi(n, func, points):
    """Return the shortest elapsed time of ``n`` calls to ``func(points)``.

    Args:
        n: Number of timed repetitions; must be at least 1.
        func: Callable invoked as ``func(points)``; its result is discarded.
        points: Argument passed unchanged to ``func`` on every repetition.

    Returns:
        The minimum elapsed time of a single call, in seconds.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    timings = []
    for _ in range(n):
        # perf_counter() is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        func(points)
        timings.append(time.perf_counter() - start)
    return min(timings)


# Remember the current Numba thread count so the benchmark does not leave the
# session set to the last value it tried.
numba_threads_before = numba.get_num_threads()
try:
    for n in pocet_bodu:
        points_np = np.random.rand(n, 3)

        time_cython_serial.append(measure_multi(n_loops, dist_cython, points_np))
        time_numba_serial.append(measure_multi(n_loops, dist_numba, points_np))

        for idx, n_threads in enumerate(all_num_threads):
            # Bind the thread count as a default so the one-argument callable
            # expected by measure_multi uses this iteration's value.
            def cython_parallel_fn(points, t=n_threads):
                return dist_cython_parallel(points, t)

            # Set Numba's thread count outside the timed call so that only
            # the distance computation itself is measured.
            numba.set_num_threads(n_threads)

            time_cython_parallel[idx].append(measure_multi(n_loops, cython_parallel_fn, points_np))
            time_numba_parallel[idx].append(measure_multi(n_loops, dist_numba_parallel, points_np))

        # Fastest Cython parallel run for this size across all thread counts.
        best_parallel = min(times[-1] for times in time_cython_parallel)
        ratio = time_cython_serial[-1] / best_parallel
        print(
            f"velikost {n}, cython serial {time_cython_serial[-1]:.4f}s, "
            f"nejlepší cython parallel {best_parallel:.4f}s, poměr {ratio:.2f}x"
        )
finally:
    numba.set_num_threads(numba_threads_before)

# Serial baselines: default line style for Cython, dotted for Numba.
plt.loglog(pocet_bodu, time_cython_serial, label="cython serial")
plt.loglog(pocet_bodu, time_numba_serial, linestyle=":", label="numba serial")

# One Cython/Numba pair of curves per tested thread count.
parallel_series = zip(all_num_threads, time_cython_parallel, time_numba_parallel)
for thread_count, cython_times, numba_times in parallel_series:
    plt.loglog(pocet_bodu, cython_times, label=f"cython parallel ({thread_count})")
    plt.loglog(pocet_bodu, numba_times, linestyle=":", label=f"numba parallel ({thread_count})")

plt.xlabel("Počet bodů")
plt.ylabel("Čas [s]")
plt.title("Porovnání rychlosti: numba vs cython")
plt.grid()
plt.legend()
plt.show()