In [1]:
from GmGM import GmGM, Dataset
import anndata as ad
import scipy.sparse as sparse
import scanpy as sc
import numpy as np
import dask.dataframe as dd
import dask.array as da

How to construct `data/adata.h5ad` (`data/adata_train.parquet` should be downloaded from the competition website on Kaggle):
```python
# Load the long-format table; the columns used below are `obs_id`, `gene`
# and `normalized_count`.
data = dd.read_parquet("../data/adata_train.parquet")

# Known categoricals give every cell / gene a stable integer code that can be
# used directly as a row / column index of the sparse matrix.
data["obs_id"] = data["obs_id"].astype("category").cat.as_known()
data["gene"] = data["gene"].astype("category").cat.as_known()
data["obs_code"] = data["obs_id"].cat.codes
data["gene_code"] = data["gene"].cat.codes

# Category labels, ordered by code (position i is the label of code i).
obs_ids = data["obs_id"].cat.categories.to_numpy()
genes = data["gene"].cat.categories.to_numpy()

# Materialise the three needed columns in a single pass instead of running
# the dask graph once per column.
triplets = data[["normalized_count", "obs_code", "gene_code"]].compute()
count_matrix = sparse.coo_array(
    (
        triplets["normalized_count"].to_numpy(),
        (triplets["obs_code"].to_numpy(), triplets["gene_code"].to_numpy()),
    ),
    # Explicit shape keeps X aligned with obs/var rather than relying on the
    # largest code that happens to appear in the data.
    shape=(len(obs_ids), len(genes)),
)

# AnnData documents DataFrames or mappings for obs/var, so pass named columns
# rather than bare arrays (this also avoids a stray column called `0`).
adata = ad.AnnData(
    X=sparse.csr_matrix(count_matrix),
    obs={"obs_id": obs_ids},
    var={"gene": genes},
)

# Persist the result; this is the file the notebook cells read.
adata.write_h5ad("../data/adata.h5ad")
```

In [2]:
# Read the pre-built AnnData file from disk and show its summary.
h5ad_path = "../data/adata.h5ad"
adata = ad.read_h5ad(h5ad_path)
adata

AnnData object with n_obs × n_vars = 240090 × 21255
    obs: 'obs_id'
    var: 'gene'

In [3]:
# Options for the GmGM run, collected in one place so they are easy to scan.
gmgm_options = dict(
    # Per-axis `to_keep` setting (0 for both axes here); exact semantics are
    # defined by GmGM — TODO confirm against its documentation.
    to_keep={"obs": 0, "var": 0},
    n_comps=20,
    verbose=True,
    # NOTE(review): presumably lets GmGM write its results into `adata`
    # rather than working on a copy — confirm.
    readonly=False,
    # don't center so that data is not densified
    centering_method=None,
)

# NOTE(review): the captured log for this cell reports an `inf` value at
# iteration 0 ("Line Search Gave Up!") and a NumPy warning on `err_diff`;
# worth checking that the fitted result is meaningful.
GmGM(adata, **gmgm_options)

Centering...
Calculating eigenvectors...
	by calculating SVD...
Calculating eigenvalues...
@0: inf - Line Search Gave Up!
Converged! (@0: inf)
Recomposing sparse precisions...


  err_diff: float = np.abs(prev_err - err)


Converting back to AnnData...


AnnData object with n_obs × n_vars = 240090 × 21255
    obs: 'obs_id'
    var: 'gene'
    uns: 'obs_neighbors_gmgm', 'var_neighbors_gmgm'
    obsp: 'obs_gmgm_connectivities'
    varp: 'var_gmgm_connectivities'

In [4]:
# Reset...
del adata
adata = ad.read_h5ad("../data/adata.h5ad")
adata

# Time
print("Timing...")
%timeit -r 1 -n 1 GmGM(adata, to_keep={"obs": 0, "var": 0}, n_comps=20, readonly=False, centering_method=None)

Timing...


  err_diff: float = np.abs(prev_err - err)


9min 43s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [5]:
# Reset...
del adata
adata = ad.read_h5ad("../data/adata.h5ad")
adata

# Time
print("Timing...")
%timeit -r 1 -n 1 GmGM(adata, to_keep={"obs": 0, "var": 0}, n_comps=20, readonly=False, centering_method=None, dont_recompose={"obs"})

Timing...
3min 11s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [6]:
# Remove the `adata` name from the kernel namespace now that timing is done.
del adata