In [1]:
import numpy as np
import itertools

In [2]:
# Length of every tensor axis used below (presumably the chi of the markdown notes — confirm)
c = 3

In [3]:
# Reproducible Gaussian test tensors. The draw order below decides which
# random values each array receives, so it must not be rearranged.
np.random.seed(42)
lambda1, lambda2, lambda3 = (np.random.normal(size=(c,) * 2) for _ in range(3))
G1, G2 = (np.random.normal(size=(c,) * 3) for _ in range(2))
U = np.random.normal(size=(c,) * 4)

In [4]:
def Z_naive(lambda1, lambda2, lambda3, G1, G2, U):
    """Brute-force reference for the contraction ``ab,cbd,de,feg,gh,ijcf->ahij``.

    All ten indices are iterated explicitly, so the loop body runs
    ``dim**10`` times. Intended only as a correctness and timing baseline.

    Parameters
    ----------
    lambda1, lambda2, lambda3 : array_like
        Matrices indexed as ``[a, b]``, ``[d, e]`` and ``[g, h]``.
    G1, G2 : array_like
        Rank-3 tensors indexed as ``[c, b, d]`` and ``[f, e, g]``.
    U : array_like
        Rank-4 tensor indexed as ``[i, j, c, f]``.

    Every axis is assumed to have length ``lambda1.shape[0]``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dim, dim, dim, dim)`` indexed as ``[a, h, i, j]``.
    """
    # Keep the axis length in `dim`, distinct from the loop index `c` below,
    # so the dimension is never rebound while iterating.
    dim = lambda1.shape[0]
    Z = np.zeros(shape=(dim, dim, dim, dim))
    for a, b, c, d, e, f, g, h, i, j in itertools.product(range(dim), repeat=10):
        Z[a, h, i, j] += (
            lambda1[a, b] * lambda2[d, e] * lambda3[g, h]
            * G1[c, b, d] * G2[f, e, g] * U[i, j, c, f]
        )
    return Z

In [6]:
# Reference result from the brute-force loop; Z_naive allocates a (c, c, c, c) array
Z = Z_naive(lambda1, lambda2, lambda3, G1, G2, U)
Z.shape

(3, 3, 3, 3)

In [8]:
# Ask NumPy for a pairwise contraction order. `pa` is the path list (reused
# later through `optimize=pa`), `descri` is the printable cost report.
pa, descri = np.einsum_path('ab, cbd, de, feg, gh, ijcf -> ahij', lambda1, G1, lambda2, G2, lambda3, U)
pa

['einsum_path', (0, 1), (0, 1), (0, 3), (1, 2), (0, 1)]

In [9]:
# Show the cost report: FLOP estimates and the step-by-step contraction order
print(descri)

  Complete contraction:  ab,cbd,de,feg,gh,ijcf->ahij
         Naive scaling:  10
     Optimized scaling:  6
      Naive FLOP count:  3.543e+05
  Optimized FLOP count:  2.431e+03
   Theoretical speedup:  145.740
  Largest intermediate:  8.100e+01 elements
--------------------------------------------------------------------------
scaling                  current                                remaining
--------------------------------------------------------------------------
   4                 cbd,ab->acd                 de,feg,gh,ijcf,acd->ahij
   4                 feg,de->dfg                    gh,ijcf,acd,dfg->ahij
   4                 dfg,gh->dfh                       ijcf,acd,dfh->ahij
   5               dfh,acd->acfh                          ijcf,acfh->ahij
   6             acfh,ijcf->ahij                               ahij->ahij


**Вопрос:** Какое минимальное количество операций?

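**Ответ:** $O(\chi^6)$, где $\chi$ — размерность каждого индекса; при $\chi = 3$ это примерно 2.431e+03 операций (оценка FLOP из `np.einsum_path`).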

In [10]:
def Z_tensordot(lambda1, lambda2, lambda3, G1, G2, U):
    """Evaluate ``ab,cbd,de,feg,gh,ijcf->ahij`` as five pairwise contractions.

    Each step is a single ``np.tensordot`` call; the index bookkeeping for
    every step is given in the comment above it.

    Returns an array indexed as ``[a, h, i, j]``.
    """
    # ab, cbd -> acd
    left = np.tensordot(lambda1, G1, axes=(1, 1))
    # de, feg -> dfg
    right = np.tensordot(lambda2, G2, axes=(1, 1))
    # dfg, gh -> dfh
    right = np.tensordot(right, lambda3, axes=1)
    # acd, dfh -> acfh
    joined = np.tensordot(left, right, axes=1)
    # acfh, ijcf -> ahij (sum over c and f)
    return np.tensordot(joined, U, axes=((1, 2), (2, 3)))

In [11]:
# Correctness check: the tensordot chain must agree with the brute-force reference
np.all(
    np.isclose(
        Z_naive(lambda1, lambda2, lambda3, G1, G2, U),
        Z_tensordot(lambda1, lambda2, lambda3, G1, G2, U),
    )
)

True

In [13]:
%%timeit
# Hand-written chain of pairwise tensordot contractions
Z_tensordot(lambda1, lambda2, lambda3, G1, G2, U)

30.3 µs ± 85.5 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [14]:
%%timeit
# Baseline: plain einsum call with no `optimize` argument
np.einsum('ab, cbd, de, feg, gh, ijcf -> ahij', lambda1, G1, lambda2, G2, lambda3, U)

1.09 ms ± 1.52 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [15]:
%%timeit
# Same einsum, but following the precomputed contraction path `pa`
np.einsum(
    'ab, cbd, de, feg, gh, ijcf -> ahij', 
    lambda1, G1, lambda2, G2, lambda3, U,
    optimize = pa
)

78.9 µs ± 393 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [12]:
%%timeit
# Brute-force loop over all ten indices, for comparison
Z = Z_naive(lambda1, lambda2, lambda3, G1, G2, U)

42.6 ms ± 311 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Just for fun, сравним с аналогичным кодом на Julia.

In [16]:
using TensorOperations, BenchmarkTools
using Random: randn

In [5]:
# Length of every tensor axis
c = 3

# Random matrices (unseeded draws, so these are not the NumPy arrays from above)
λ1 = randn(c, c)
λ2 = randn(c, c)
λ3 = randn(c, c)

# Random rank-3 tensors and the rank-4 tensor
G1 = randn(c, c, c)
G2 = randn(c, c, c)
U  = randn(c, c, c, c);

In [22]:
# Julia counterpart of the NumPy contraction 'ab,cbd,de,feg,gh,ijcf->ahij':
# index labels that appear twice on the right-hand side are summed over,
# and the result Z is indexed by [a, h, i, j].
function Z_tensor_Julia(λ1, λ2, λ3, G1, G2, U)
    # NOTE(review): the factor order is left untouched because @tensor's
    # pairwise evaluation order may depend on it — confirm in TensorOperations docs.
    @tensor Z[a, h, i, j] := 
        λ1[a,b] * G1[c,b,d] * λ2[d,e] * G2[f,e,g] * λ3[g,h] * U[i,j,c,f]
end

Z_tensor_Julia (generic function with 1 method)

In [23]:
# Interpolate the global arrays with `$` so the benchmark measures the call
# itself and not access to untyped global variables.
@benchmark Z_tensor_Julia($λ1, $λ2, $λ3, $G1, $G2, $U)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  16.083 μs …  3.533 ms  ┊ GC (min … max): 0.00% … 97.91%
 Time  (median):     16.958 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   18.206 μs ± 59.672 μs  ┊ GC (mean ± σ):  5.60% ±  1.70%

  [39m [39m [39m [39m [39m [39m▁[39m [39m▅[39m▇[39m▇[39m█[34m▆[39m[39m▃[39m [39m▂[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▂[39m▂[39m▃[39m▄[39m▇[39m█