# Modelová úloha - hledání, kam se dostaneme v grafu, pokud můžeme projít nejvýše n hran

### Vstupní data - list dvojic vrcholů = hrany

In [None]:
import random
n = 10

def vygeneruj_graf(n):
    """Generate a random graph as one continuous walk over ``n`` vertices.

    The walk starts at vertex 0. Every edge begins where the previous one
    ended and ends at a vertex drawn with ``random.randint``, so self-loops
    and repeated edges can occur.

    Args:
        n: Number of vertices; they are labelled ``0 .. n - 1``.

    Returns:
        A tuple ``(vertices, edges)``: the list of vertex labels and a list
        of ``2 * n`` ``(start, end)`` pairs.
    """
    vertices = list(range(n))
    edges = []
    current = 0
    for _ in range(2 * n):
        following = random.randint(0, n - 1)
        edges.append((current, following))
        # The next edge continues from the vertex we have just arrived at.
        current = following
    return vertices, edges

V, E = vygeneruj_graf(n)

In [None]:
print(V)
print(E)

### Ochutnávka knihovny networkx = vykreslení grafu

In [None]:
# !pip install networkx

In [None]:
# Draw the graph given by the vertex list V and the edge list E:
# vertices are shown with their numbers, connections as lines.
import matplotlib.pyplot as plt
import networkx as nx

# V is added explicitly so that vertices without any edge are part of the
# graph as well.
G = nx.Graph()
G.add_nodes_from(V)
G.add_edges_from(E)
# Compute the node positions once and share them between the three drawing
# calls so that nodes, edges and labels line up.
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos)
nx.draw_networkx_edges(G, pos)
nx.draw_networkx_labels(G, pos)
plt.show()


## První návrh - jednoduchý Python, použití setů, listů a union

In [None]:
def reachable_in_n_steps(edges, n):
    """Return the vertices reachable from vertex 0 using at most ``n`` edges.

    This is the deliberately naive baseline: in every step the complete edge
    list is scanned once for each vertex reached so far.

    Args:
        edges: Sequence of ``(u, v)`` pairs. Edges are treated as undirected.
            The sequence is iterated repeatedly, so it must not be a
            one-shot iterator.
        n: Maximum number of edges that may be traversed.

    Returns:
        Sorted list of the reachable vertices; vertex 0 is always included.
    """
    reachable = {0}
    for _ in range(n):
        new_reachable = set()
        for v in reachable:
            for e in edges:
                if e[0] == v:
                    new_reachable.add(e[1])
                if e[1] == v:
                    new_reachable.add(e[0])
        reachable = reachable.union(new_reachable)
    # Set iteration order is an implementation detail; sorting makes the
    # result deterministic and comparable element-wise with other variants.
    return sorted(reachable)


In [None]:
reachable_in_n_steps(E, 2)

## Vygenerujeme větší graf

In [None]:
V, E = vygeneruj_graf(1000)

Jak dlouho to asi potrvá?

In [None]:
%time res1 = reachable_in_n_steps(E, 20)

## Profilování

In [None]:
%load_ext line_profiler

In [None]:
%lprun -f reachable_in_n_steps reachable_in_n_steps(E, 20)

Edge procházíme desítky miliónů krát... tohle ani rychlé být nemůže. Jak to můžeme zrychlit?

- edge, který jednou projdeme, už nemusíme znovu procházet
- druhý if v cyklu může být v elif, protože pokud je splněn první if, tak druhý už nás stejně nezajímá

## Optimalizace algoritmu

In [None]:
from copy import deepcopy

def reachable_in_n_steps_v2(edges, n):
    """Return the vertices reachable from vertex 0 using at most ``n`` edges.

    Optimised variant of the naive search: an edge that touches an already
    reached vertex is used once and then dropped, because both of its
    endpoints are known to be reachable from then on.

    Args:
        edges: Iterable of ``(u, v)`` pairs. Edges are treated as undirected.
            The input is not modified.
        n: Maximum number of edges that may be traversed.

    Returns:
        Sorted list of the reachable vertices; vertex 0 is always included.
    """
    # A shallow copy is enough: only the list itself is rebuilt below, the
    # edge pairs are never modified.
    remaining = list(edges)
    reachable = {0}
    for _ in range(n):
        new_reachable = set()
        untouched = []
        # One pass over the remaining edges per step; set membership replaces
        # the separate scan for every reached vertex.
        for e in remaining:
            if e[0] in reachable:
                new_reachable.add(e[1])
            elif e[1] in reachable:
                new_reachable.add(e[0])
            else:
                untouched.append(e)
        remaining = untouched
        if new_reachable <= reachable:
            # Nothing new was found and no remaining edge touches the reached
            # set, so further steps cannot change the result.
            break
        reachable |= new_reachable
    # Sorted so that the result does not depend on set iteration order.
    return sorted(reachable)


In [None]:
%time res2 = reachable_in_n_steps_v2(E, 20)

Tohle bylo vskutku výrazné zrychlení!

Raději ověříme, že počítáme stále to samé:

In [None]:
import numpy as np
# Compare the results as sorted vertex arrays: the check then does not depend
# on the order of the elements and yields False (instead of a broadcasting
# error) when the two results differ in size.
np.array_equal(np.sort(res1), np.sort(res2))

## První optimalizace - použití NumPy a pole bool hodnot místo setů

Budeme chtít také vstup jako NumPy pole:

In [None]:
E_np = np.array(E)

In [None]:
def reachable_in_n_steps_np(edges, n):
    """Return the vertices reachable from vertex 0 using at most ``n`` edges.

    NumPy variant: the reached vertices are kept in a boolean mask and the
    edges touching a vertex are selected with vectorised comparisons. Edges
    that have been used are dropped from the working array.

    Args:
        edges: Integer array with one ``(u, v)`` pair per row. Edges are
            treated as undirected. The input array is not modified.
        n: Maximum number of edges that may be traversed.

    Returns:
        The tuple produced by ``np.where``: a 1-tuple holding the array of
        reachable vertex indices in ascending order.
    """
    if edges.size == 0:
        # No edges at all: only the start vertex is reachable.
        return (np.array([0], dtype=np.intp),)
    edges_copy = edges.copy()
    # Vertex labels are used as indices, so the mask has max label + 1 slots.
    n_vertices = np.max(edges_copy) + 1
    # np.bool_ is used instead of the np.bool8 alias removed in NumPy 2.0.
    reachable = np.zeros(n_vertices, dtype=np.bool_)
    reachable[0] = True
    for _ in range(n):
        new_reachable = np.zeros(n_vertices, dtype=np.bool_)
        for v, is_reachable in enumerate(reachable):
            if not is_reachable:
                continue
            edges_hits = np.logical_or(edges_copy[:, 0] == v, edges_copy[:, 1] == v)
            if not edges_hits.any():
                continue
            # Both endpoints of every hit edge are marked; v itself is
            # already reachable, so marking it again is harmless.
            reachable_vertices = edges_copy[edges_hits, :].ravel()
            new_reachable[reachable_vertices] = True
            # Used edges cannot contribute anything new later on.
            edges_copy = edges_copy[~edges_hits, :]
        reachable = np.logical_or(reachable, new_reachable)

    return np.where(reachable)

In [None]:
%time res3 = reachable_in_n_steps_np(E_np, 20)

In [None]:
# res3 is the 1-tuple returned by np.where, hence the ravel. Comparing sorted
# arrays with array_equal is independent of element order and yields False
# (instead of a broadcasting error) when the result sizes differ.
np.array_equal(np.sort(res1), np.sort(np.ravel(res3)))

In [None]:
%load_ext line_profiler

In [None]:
%lprun -f reachable_in_n_steps_np reachable_in_n_steps_np(E_np, 20)

In [None]:
from numba import jit
import numpy as np

@jit(nopython=True)
def reachable_in_n_steps_numba(edges, n):
    """Return the vertices reachable from vertex 0 using at most ``n`` edges.

    Same algorithm as the plain NumPy variant, compiled with numba in
    nopython mode.

    Args:
        edges: Integer array with one ``(u, v)`` pair per row. Edges are
            treated as undirected. The input array is not modified.
        n: Maximum number of edges that may be traversed.

    Returns:
        The tuple produced by ``np.where``: a 1-tuple holding the array of
        reachable vertex indices.
    """
    edges_copy = edges.copy()
    # Vertex labels are used as indices, so the mask has max label + 1 slots.
    n_vertices = np.max(edges_copy) + 1
    # np.bool_ is used instead of the np.bool8 alias removed in NumPy 2.0.
    reachable = np.zeros((n_vertices), dtype=np.bool_)
    reachable[0] = True
    for i in range(n):
        new_reachable = np.zeros((n_vertices), dtype=np.bool_)
        for v, is_reachable in enumerate(reachable):
            if not is_reachable:
                continue
            edges_hits = np.logical_or(edges_copy[:,0] == v, edges_copy[:,1] == v)
            if np.sum(edges_hits) == 0:
                continue
            # Both endpoints of every hit edge are marked as reachable.
            reachable_vertices = edges_copy[edges_hits,:].ravel()
            new_reachable[reachable_vertices] = True
            # Used edges cannot contribute anything new later on.
            edges_copy = edges_copy[~edges_hits,:]
        reachable = np.logical_or(reachable, new_reachable)

    return np.where(reachable)

In [None]:
%time res4 = reachable_in_n_steps_numba(E_np, 20)

To už je obrovské zrychlení!

Zkontrolujeme, že počítáme stále to samé:

In [None]:
# res4 is the 1-tuple returned by np.where, hence the ravel. Comparing sorted
# arrays with array_equal is independent of element order and yields False
# (instead of a broadcasting error) when the result sizes differ.
np.array_equal(np.sort(res1), np.sort(np.ravel(res4)))

## Zkusíme to úplně jinak - použití matice sousednosti

In [None]:
from scipy.sparse import csc_matrix

def reachable_in_n_steps_scipy(edges, n):
    """Return the vertices reachable from vertex 0 using at most ``n`` edges.

    Adjacency-matrix variant: the reached set is a boolean vector and one
    step is a sparse matrix-vector product with the adjacency matrix.

    Args:
        edges: Integer array with one ``(u, v)`` pair per row. Edges are
            treated as undirected.
        n: Maximum number of edges that may be traversed.

    Returns:
        1-D array of the reachable vertex indices in ascending order.
    """
    if edges.size == 0:
        # No edges at all: only the start vertex is reachable.
        return np.array([0], dtype=np.intp)
    n_vertices = np.max(edges) + 1
    # Undirected graph: every edge is stored in both directions.
    idx_row = np.concatenate((edges[:, 0], edges[:, 1]))
    idx_col = np.concatenate((edges[:, 1], edges[:, 0]))
    values = np.ones(len(idx_row), dtype=np.bool_)

    adjacence_csc = csc_matrix((values, (idx_row, idx_col)),
                               shape=(n_vertices, n_vertices), dtype=np.bool_)

    reachable = np.zeros(n_vertices, dtype=np.bool_)
    reachable[0] = True
    for _ in range(n):
        # The product alone marks only the vertices exactly one edge away
        # from the current set; OR-ing keeps everything reached earlier, so
        # the result means "at most n edges" like the other variants.
        reachable = np.logical_or(reachable, adjacence_csc.dot(reachable))

    return np.where(reachable)[0]

In [None]:
%time res5 = reachable_in_n_steps_scipy(E_np, 20)

To je rychlé!

Zkontrolujeme, že počítáme stále to samé:

In [None]:
# Comparing sorted arrays with array_equal is independent of element order
# and yields False (instead of a broadcasting error) when the result sizes
# differ.
np.array_equal(np.sort(res1), np.sort(res5))

In [None]:
# profilovani
%lprun -f reachable_in_n_steps_scipy reachable_in_n_steps_scipy(E_np, 20)

## Benchmarkování nejlepších variant

### Rostoucí n

In [None]:
import time

# Benchmark on one fixed random graph while the step limit n grows.
num_vert = 2000
V, E = vygeneruj_graf(num_vert)
E_np = np.array(E)

n_list = [2**i for i in range(0, 9)]
times_v2 = [] # reachable_in_n_steps_v2
times_np = [] # reachable_in_n_steps_np
times_numba = [] # reachable_in_n_steps_numba
times_scipy = [] # reachable_in_n_steps_scipy

# (function, its input, list collecting its timings)
benchmarks = [
    (reachable_in_n_steps_v2, E, times_v2),
    (reachable_in_n_steps_np, E_np, times_np),
    (reachable_in_n_steps_numba, E_np, times_numba),
    (reachable_in_n_steps_scipy, E_np, times_scipy),
]

# NOTE: the first call of a numba-jitted function includes its compilation;
# the function is expected to be compiled already by an earlier cell.
for n in n_list:
    for func, edges, times in benchmarks:
        # perf_counter is the clock intended for measuring short intervals;
        # time.time() may have a coarse resolution and can jump.
        start = time.perf_counter()
        res = func(edges, n)
        end = time.perf_counter()
        times.append(end - start)

    print(n, times_v2[-1], times_np[-1], times_numba[-1], times_scipy[-1])

# plot the timings with logarithmic scale on both axes
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.loglog(n_list, times_v2, label='v2')
plt.loglog(n_list, times_np, label='np')
plt.loglog(n_list, times_numba, label='numba')
plt.loglog(n_list, times_scipy, label='scipy')
plt.xlabel('n')
plt.ylabel('time [s]')
plt.legend()

### Rostoucí počet vrcholů

In [None]:
import time

# Benchmark with a fixed step limit n while the size of the graph grows.
n = 200
num_vert_list = [2**i for i in range(5, 14)]
times_v2 = [] # reachable_in_n_steps_v2
times_np = [] # reachable_in_n_steps_np
times_numba = [] # reachable_in_n_steps_numba
times_scipy = [] # reachable_in_n_steps_scipy

for num_vert in num_vert_list:
    # A fresh random graph for every size.
    V, E = vygeneruj_graf(num_vert)
    E_np = np.array(E)

    # (function, its input, list collecting its timings)
    benchmarks = [
        (reachable_in_n_steps_v2, E, times_v2),
        (reachable_in_n_steps_np, E_np, times_np),
        (reachable_in_n_steps_numba, E_np, times_numba),
        (reachable_in_n_steps_scipy, E_np, times_scipy),
    ]
    for func, edges, times in benchmarks:
        # perf_counter is the clock intended for measuring short intervals;
        # time.time() may have a coarse resolution and can jump.
        start = time.perf_counter()
        res = func(edges, n)
        end = time.perf_counter()
        times.append(end - start)

    print(num_vert, times_v2[-1], times_np[-1], times_numba[-1], times_scipy[-1])

# plot the timings with logarithmic scale on both axes
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.loglog(num_vert_list, times_v2, label='v2')
plt.loglog(num_vert_list, times_np, label='np')
plt.loglog(num_vert_list, times_numba, label='numba')
plt.loglog(num_vert_list, times_scipy, label='scipy')
plt.xlabel('number of vertices')
plt.ylabel('time [s]')
plt.legend()