Columnwise estimation method for inverse Cov
---------------------------------------------

* The idea:

Ax_k = e_k ->  x_k = A^(-1)e_k

Solve x_k

* Thoughts

Pitäisi olla yksikäsitteinen ratk, koska n tuntematonta ja n yhtälöä

In [3]:
# imports

import numpy as np
import xarray as xr
import scipy.sparse.linalg 
import dask
from dask.distributed import Client, progress
import dask.array as da




invm = np.linalg.inv
gmres = scipy.sparse.linalg.gmres
npsolve = np.linalg.solve

ds_1 = xr.open_dataset("data/regions_verify_isotope_202112_cov.nc")
bio_1 = ds_1["covariance_bio"]
anth_1 = ds_1["covariance_anth"]

ds_2 = xr.open_dataset("data/regions_verify_202104_cov.nc", chunks = 'auto')
bio_2 = ds_2["covariance_bio"]
anth_2 = ds_2["covariance_anth"]

M = bio_2
M

Unnamed: 0,Array,Chunk
Bytes,358.83 MiB,214.31 MiB
Shape,"(6858, 6858)","(4096, 6858)"
Count,2 Graph Layers,2 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 358.83 MiB 214.31 MiB Shape (6858, 6858) (4096, 6858) Count 2 Graph Layers 2 Chunks Type float64 numpy.ndarray",6858  6858,

Unnamed: 0,Array,Chunk
Bytes,358.83 MiB,214.31 MiB
Shape,"(6858, 6858)","(4096, 6858)"
Count,2 Graph Layers,2 Chunks
Type,float64,numpy.ndarray


In [2]:
invM = invm(M)


np.linalg.solve
----------------
* ainakin näyttäisi tulevan samat sarakevektorit, mutta se ei (kai) käytä GMRESiä

In [19]:
i = 4 #which column to solve
e_i = np.zeros(len(M))
e_i[i]= 1.

s_i = np.linalg.solve(M, e_i)
print(s_i)
c_i = invM[:,i]

print(np.allclose(c_i, s_i))

[-4.11504027e-01 -4.28670605e-10  8.31936008e-09 ...  0.00000000e+00
  0.00000000e+00  0.00000000e+00]
True


scipy.sparse.linalg.gmres
-----------------------------

* needs a smaller tolerance than the default for the resulting vector to be close to the actual column of the inverse
* A way to decrease the number of iterations is to use a preconditioner - maybe need to look into this if not fast enough!

In [20]:
g_i, exitCode = gmres(M,e_i, tol = 1e-12)

print(g_i)
print(exitCode)


[-4.11504027e-01 -4.28670908e-10  8.31936012e-09 ...  0.00000000e+00
  0.00000000e+00  0.00000000e+00]
0


Comparison of times it takes to solve one column
--------------------------------------------------

* scipyn gmres a lot faster than numpy solve at least with tolerance 1e-12 or larger
* not as exact though (if we presume invm computes the correct inverse. (It could also be that invm uses the same np.linalg.solve and that is the reason it gives the same column as solve)

In [21]:
i = 5 #which column to solve
e_i = np.zeros(len(M))
e_i[i]= 1.

%timeit npsolve(M, e_i)
%timeit gmres(M,e_i, tol = 1e-12)

1.39 s ± 19.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
971 ms ± 17.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
def invert_column_separately(A,idx, tolerance):
    e_i = np.zeros(len(A))
    e_i[idx]= 1.
    invcol, exitcode = gmres(A, e_i, tol = tolerance)   
    return invcol, exitcode
    

The whole inverse matrix
------------------------

When parallelizing this, we need to take into account the fact that the columns need to be assembled in the correct order after they have been computed

In [5]:

def invert_column(A, n, idx, tolerance):
   e_i = np.zeros(n)
   e_i[idx] = 1.
   invcol, exitcode = gmres(A, e_i, tol = tolerance)   
   return invcol, exitcode


n = len(M)
Minv = np.zeros((n,n))
exitcodes = np.ones(n)
for i in np.arange(n):
   #tähän joku decorator dask.delayed ?
   Minv[:,i], exitcodes[i] = invert_column(M, n, i, 1e-10)





In [None]:


n = len(M)


In [4]:
client = Client(threads_per_worker=8, n_workers=1)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 1
Total threads: 8,Total memory: 30.78 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:36965,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 30.78 GiB

0,1
Comm: tcp://127.0.0.1:32947,Total threads: 8
Dashboard: http://127.0.0.1:36687/status,Memory: 30.78 GiB
Nanny: tcp://127.0.0.1:38807,
Local directory: /tmp/dask-worker-space/worker-qd62vb0q,Local directory: /tmp/dask-worker-space/worker-qd62vb0q


In [10]:
import dask
n = len(M)
inverted_cols = []
for i in np.arange(n):
   invcol = dask.delayed(invert_column)(M,i, 1e-10)
   inverted_cols.append(invcol)

futures = dask.persist(*inverted_cols)

#client.cluster.scale(10)

results = dask.compute(*futures)

  (array([[ 8.00000000e-01,  1.20738289e-08,  3.2659 ... ), 1999, 1e-10)
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good


KeyboardInterrupt: 

In [16]:
print(Minv)
print(exitCodes)

[[ 1.25426569e+00 -3.52374850e-02 -6.35396420e-02 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-3.52374850e-02  1.25182439e+00 -1.33715847e-02 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-6.35396420e-02 -1.33715847e-02  1.26289885e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  5.00079700e+00
  -6.31019841e-02  4.64396874e-07]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ... -6.31019841e-02
   5.00079671e+00 -2.09457564e-05]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  4.64313226e-07
  -2.09457041e-05  5.00489954e+00]]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [17]:
print(np.allclose(invM, Minv))

True
