Note: this example requires data from the LifeLines-DEEP project (https://pubmed.ncbi.nlm.nih.gov/26319774/); access to the data must be requested from the project.

In [1]:
from TeraLasso import TeraLasso
from EiGLasso import EiGLasso
from GmGM import GmGM, Dataset
from GmGM.synthetic.generate_data import threshold_dictionary

import matplotlib.pyplot as plt
import numpy as np
import timeit

from cycler import cycler
linestyle_cycler = cycler('linestyle',['-','--',':','-.'])
color_cycler = cycler('color',['k','orange','m','darkgreen'])
plt.rc('axes', prop_cycle=linestyle_cycler + color_cycler)

import igraph as ig
import pandas as pd

import muon as mu
import anndata as ad
import scanpy as sc
from anndata import AnnData
import sklearn.cluster as clust

from typing import Literal, Union

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


# Setup

## Helper Functions

In [2]:
def calculate_assortativity(
    mudata: Union[mu.MuData, ad.AnnData],
    taxonomy: Union[pd.DataFrame, None] = None,
    nominal: bool = False,
) -> dict[Literal["Phylum", "Class", "Order", "Family", "Genus"], float]:
    """Assortativity of the inferred variable graph for each taxonomic rank.

    Parameters
    ----------
    mudata
        Either a ``MuData`` (the graph is read from
        ``mudata["metagenomics_shotgun"].varp["metagenomics_shotgun-var_gmgm_connectivities"]``)
        or an ``AnnData`` (the graph is read from
        ``varp["var_gmgm_connectivities"]``).
    taxonomy
        Table with categorical columns "Phylum", "Class", "Order", "Family"
        and "Genus"; row ``i`` must describe vertex ``i`` of the graph.
        When omitted, the notebook-level ``taxmat`` is used, which is what
        this function always did before the parameter existed.
    nominal
        ``False`` (default, historical behaviour): the integer category codes
        are handed to igraph's numeric ``Graph.assortativity``, so the score
        depends on how the categories happen to be numbered.
        ``True``: use ``Graph.assortativity_nominal``, which treats the labels
        as unordered categories. The taxonomy must then contain no missing
        values, because missing categories have the code -1.

    Returns
    -------
    dict
        Maps each rank name to its assortativity coefficient. igraph returns
        NaN when the coefficient is undefined.

    Raises
    ------
    ValueError
        If ``mudata`` is neither ``MuData`` nor ``AnnData``, or if the
        taxonomy does not have exactly one row per graph vertex.
    """
    if taxonomy is None:
        # Reading a module-level name needs no ``global`` declaration.
        taxonomy = taxmat

    if isinstance(mudata, mu.MuData):
        precmat = mudata["metagenomics_shotgun"].varp[
            "metagenomics_shotgun-var_gmgm_connectivities"
        ].toarray()
    elif isinstance(mudata, ad.AnnData):
        precmat = mudata.varp[
            "var_gmgm_connectivities"
        ].toarray()
    else:
        raise ValueError("mudata must be of type mu.MuData or anndata.AnnData")

    # Labels are matched to vertices purely by position, so a length mismatch
    # (e.g. an unfiltered taxonomy table) would silently mean wrong labels or
    # an obscure igraph error; fail early with a readable message instead.
    if taxonomy.shape[0] != precmat.shape[0]:
        raise ValueError(
            f"taxonomy has {taxonomy.shape[0]} rows but the graph has "
            f"{precmat.shape[0]} vertices"
        )

    # Any nonzero off-diagonal entry is an edge; self-loops are ignored.
    g = ig.Graph.Adjacency(
        precmat != 0,
        mode="undirected",
        loops=False
    )

    ranks = ["Phylum", "Class", "Order", "Family", "Genus"]
    if nominal:
        return {
            tax_level: g.assortativity_nominal(
                taxonomy[tax_level].cat.codes.tolist()
            )
            for tax_level in ranks
        }
    # The "-1" shift is kept so results match earlier runs of this notebook.
    return {
        tax_level: g.assortativity(taxonomy[tax_level].cat.codes-1)
        for tax_level in ranks
    }

## Load Data

In [3]:
# Taxonomy table: one column per rank, missing ranks replaced by the literal
# label "Unknown", every column stored as a pandas categorical.
taxmat = (
    pd.read_csv(
        "../data/LL-Deep Data - Processed/ll-deep-taxmat.csv",
        index_col=0,
    )
    .set_axis(
        ["Domain", "Phylum", "Class", "Order", "Family", "Genus"],
        axis="columns",
    )
    .fillna("Unknown")
    .astype("category")
)

In [4]:
# Read in the mapping linking person ids between metabolomics and metagenomics.
# The metabolomics and metagenomics tables were constructed so that the nth row
# refers to the same person in both; the map is still loaded because it carries
# the gender information.
map_df = pd.read_csv(
    "../data/LL-Deep Data - Processed/Map.csv",
    index_col=0
)
print(map_df.shape)

# Read in the metabolomics data
metabolomics_df = pd.read_csv(
    "../data/LL-Deep Data - Processed/Metabolomics.csv",
    index_col=0
)
print(metabolomics_df.shape)

# Read in the metagenomics data
metagenomics_df = pd.read_csv(
    "../data/LL-Deep Data - Processed/MetagenomicsShotgun.csv",
    index_col=0
)
old_shape = metagenomics_df.shape

# The taxonomy table is filtered by position below, so it must still have one
# row per metagenomics column. This also catches re-running this cell after
# `taxmat` has already been filtered (reload `taxmat` first in that case).
if taxmat.shape[0] != metagenomics_df.shape[1]:
    raise ValueError(
        f"taxmat has {taxmat.shape[0]} rows but the metagenomics table has "
        f"{metagenomics_df.shape[1]} columns; reload taxmat before filtering"
    )

# Only keep the species that appear in more than 20% of the people
keep_idxs = (metagenomics_df > 0).sum(axis=0) > 0.2 * metagenomics_df.shape[0]
metagenomics_df = metagenomics_df.loc[:, keep_idxs]
taxmat = taxmat.loc[keep_idxs.values, :]

print(old_shape, '->', metagenomics_df.shape)

(1054, 3)
(1054, 1183)
(1054, 3957) -> (1054, 564)


In [5]:
# Load our data into a MuData object: each table becomes an AnnData, and the
# two are combined under the modality keys "metabolomics" and
# "metagenomics_shotgun" (later cells look the modalities up by these keys).
metabol_ann = AnnData(
    X = metabolomics_df.to_numpy()
)
metabol_ann.obs_names = metabolomics_df.index
metabol_ann.var_names = metabolomics_df.columns

metagen_ann = AnnData(
    X = metagenomics_df.to_numpy()
)
# The metabolomics index is reused here on purpose, so both modalities share
# the same observation names (rows are aligned by position, per the note in
# the data-loading cell).
metagen_ann.obs_names = metabolomics_df.index # note this is the same as metabolomics
metagen_ann.var_names = metagenomics_df.columns

mudata = mu.MuData({
    "metabolomics": metabol_ann,
    "metagenomics_shotgun": metagen_ann
})

# Gender is attached by position (to_numpy() drops the map's own index).
mudata.obs["Gender"] = map_df["Gender"].to_numpy()

# Log transform the data
# The return values are discarded, so this relies on scanpy transforming each
# modality in place.
sc.pp.log1p(mudata["metabolomics"])
sc.pp.log1p(mudata["metagenomics_shotgun"])

# GmGM

## w/o nonpara

In [6]:
# Fit GmGM on both modalities without the nonparanormal skeptic; the
# commented-out keyword arguments are the options switched on in later sections.
# In to_keep, 1183, 564 and 1054 are the metabolomics feature count, the
# retained metagenomics feature count and the number of people printed when
# the data was loaded.
# NOTE(review): presumably each to_keep value is the fraction of matrix entries
# kept for that axis - confirm against the GmGM documentation.
GmGM(
    mudata,
    verbose=True,
    #use_nonparanormal_skeptic=True,
    #nonparanormal_evec_backend="COCA",
    #n_comps=50,
    to_keep={
        "metabolomics-var": 1183 / 1183**2,
        "metagenomics_shotgun-var": 564 / 564**2,
        "obs": 1200 / 1054**2
    },
    random_state=1,
    threshold_method="overall",
)

Centering...
Calculating eigenvectors...
	by calculating gram matrices and then eigendecomposing...
Calculating eigenvectors for axis='obs'
Calculating eigenvectors for axis='metagenomics_shotgun-var'
Calculating eigenvectors for axis='metabolomics-var'
Calculating eigenvalues...
@0: -1834610.2475790898 (-1834616.8682274285 + 6.620648338769194 + 0) ∆inf
Converged! (@14: -1955700.7343906139)
Recomposing sparse precisions...
Converting back to MuData...


In [7]:
# Assortativity per taxonomic rank of the metagenomics graph currently stored in mudata.
calculate_assortativity(mudata)

{'Phylum': -0.021649610719968463,
 'Class': -0.06197070129673916,
 'Order': -0.18903785084844535,
 'Family': -0.01615142857896596,
 'Genus': -0.03081067929473613}

In [8]:
def to_time() -> None:
    """Run one non-verbose GmGM fit on ``mudata`` (no nonparanormal skeptic).

    The return value is discarded; this function exists only to be timed.
    """
    gmgm_options = dict(
        to_keep={
            "metabolomics-var": 1183 / 1183**2,
            "metagenomics_shotgun-var": 564 / 564**2,
            "obs": 1200 / 1054**2,
        },
        threshold_method="overall",
    )
    GmGM(mudata, **gmgm_options)

# Mean time in seconds over 10 repetitions of a single run each.
np.mean(timeit.repeat(to_time, repeat=10, number=1))

0.9827490333000007

## w/ nonpara

In [9]:
# Fit GmGM on both modalities with use_nonparanormal_skeptic=True; the other
# settings (to_keep, threshold_method, random_state) match the fit without it.
GmGM(
    mudata,
    verbose=True,
    use_nonparanormal_skeptic=True,
    #nonparanormal_evec_backend="COCA",
    #n_comps=50,
    to_keep={
        "metabolomics-var": 1183 / 1183**2,
        "metagenomics_shotgun-var": 564 / 564**2,
        "obs": 1200 / 1054**2
    },
    threshold_method="overall",
    random_state=1
)

Centering...
Calculating eigenvectors...
	by calculating gram matrices and then eigendecomposing...
Calculating eigenvectors for axis='obs'
Calculating eigenvectors for axis='metagenomics_shotgun-var'
Calculating eigenvectors for axis='metabolomics-var'
Calculating eigenvalues...
@0: -1835146.444140518 (-1835152.198621132 + 5.754480613893504 + 0) ∆inf
Converged! (@14: -1983139.8097202543)
Recomposing sparse precisions...
Converting back to MuData...


In [10]:
# Assortativity per taxonomic rank after the nonparanormal-skeptic fit on both modalities.
calculate_assortativity(mudata)

{'Phylum': -0.04382048233715373,
 'Class': -0.08540100047093065,
 'Order': -0.1049426423563925,
 'Family': -0.0333123983701039,
 'Genus': -0.0561269868760198}

In [11]:
def to_time() -> None:
    """Run one non-verbose GmGM fit on ``mudata`` with the nonparanormal skeptic.

    The return value is discarded; this function exists only to be timed.
    """
    gmgm_options = dict(
        use_nonparanormal_skeptic=True,
        to_keep={
            "metabolomics-var": 1183 / 1183**2,
            "metagenomics_shotgun-var": 564 / 564**2,
            "obs": 1200 / 1054**2,
        },
        threshold_method="overall",
    )
    GmGM(mudata, **gmgm_options)

# Mean time in seconds over 10 repetitions of a single run each.
np.mean(timeit.repeat(to_time, repeat=10, number=1))

1.1881129832999995

## w/o nonpara, 50pc

In [12]:
# Fit GmGM on both modalities without the nonparanormal skeptic, now with
# n_comps=50 (the verbose log shows a different eigenvector routine is used
# when n_comps is given).
GmGM(
    mudata,
    verbose=True,
    #use_nonparanormal_skeptic=True,
    #nonparanormal_evec_backend="COCA",
    n_comps=50,
    to_keep={
        "metabolomics-var": 1183 / 1183**2,
        "metagenomics_shotgun-var": 564 / 564**2,
        "obs": 1200 / 1054**2
    },
    threshold_method="overall",
    random_state=1
)

Centering...
Calculating eigenvectors...
	by calculating left eigenvectors of concatenated matricizations...
Calculating eigenvalues...
@0: 15500398.362697743 (-3379.8910024504357 + 15503778.253700193 + 0) ∆inf
Converged! (@16: 11423130.265023768)
Recomposing sparse precisions...
Converting back to MuData...


In [13]:
# Assortativity per taxonomic rank after the n_comps=50 fit on both modalities.
calculate_assortativity(mudata)

{'Phylum': 0.2614525267515359,
 'Class': 0.19958371360377242,
 'Order': 0.04892395118395965,
 'Family': 0.0016404972759645417,
 'Genus': 0.04649678763718945}

In [14]:
def to_time() -> None:
    """Run one non-verbose GmGM fit on ``mudata`` with ``n_comps=50``.

    The return value is discarded; this function exists only to be timed.
    """
    gmgm_options = dict(
        n_comps=50,
        to_keep={
            "metabolomics-var": 1183 / 1183**2,
            "metagenomics_shotgun-var": 564 / 564**2,
            "obs": 1200 / 1054**2,
        },
        threshold_method="overall",
    )
    GmGM(mudata, **gmgm_options)

# Mean time in seconds over 10 repetitions of a single run each.
np.mean(timeit.repeat(to_time, repeat=10, number=1))

1.2414448541999987

## w/ nonpara, 50pc

In [15]:
# Fit GmGM on both modalities with the nonparanormal skeptic, the "COCA"
# eigenvector backend and n_comps=50.
GmGM(
    mudata,
    verbose=True,
    use_nonparanormal_skeptic=True,
    nonparanormal_evec_backend="COCA",
    n_comps=50,
    to_keep={
        "metabolomics-var": 1183 / 1183**2,
        "metagenomics_shotgun-var": 564 / 564**2,
        "obs": 1200 / 1054**2
    },
    threshold_method="overall",
    random_state=1
)

Centering...
Calculating eigenvectors...
	by calculating left eigenvectors of concatenated matricizations...
Calculating eigenvalues...
@0: 2300528.906273792 (-3457.1874391253486 + 2303986.0937129175 + 0) ∆inf
Converged! (@25: 1190751.3934725346)
Recomposing sparse precisions...
Converting back to MuData...


In [16]:
# Assortativity per taxonomic rank after the nonparanormal, n_comps=50 fit on both modalities.
calculate_assortativity(mudata)

{'Phylum': 0.19683212057690364,
 'Class': 0.13712360953919286,
 'Order': -0.039131192939536666,
 'Family': -0.05088236648443,
 'Genus': 0.003559361329858003}

In [17]:
def to_time() -> None:
    """Run one non-verbose GmGM fit on ``mudata``: nonparanormal skeptic,
    "COCA" eigenvector backend, ``n_comps=50``.

    The return value is discarded; this function exists only to be timed.
    """
    gmgm_options = dict(
        use_nonparanormal_skeptic=True,
        nonparanormal_evec_backend="COCA",
        n_comps=50,
        to_keep={
            "metabolomics-var": 1183 / 1183**2,
            "metagenomics_shotgun-var": 564 / 564**2,
            "obs": 1200 / 1054**2,
        },
        threshold_method="overall",
    )
    GmGM(mudata, **gmgm_options)

# Mean time in seconds over 10 repetitions of a single run each.
np.mean(timeit.repeat(to_time, repeat=10, number=1))

7.8899044585

## w/o nonpara, w/o metabol

In [18]:
# Fit GmGM on the metagenomics modality alone (an AnnData), without the
# nonparanormal skeptic. With a single AnnData the axes are named "var" and
# "obs", so to_keep uses those keys.
GmGM(
    mudata["metagenomics_shotgun"],
    verbose=True,
    #use_nonparanormal_skeptic=True,
    #nonparanormal_evec_backend="COCA",
    #n_comps=50,
    to_keep={
        "var": 564 / 564**2,
        "obs": 1200 / 1054**2
    },
    threshold_method="overall",
    random_state=1
)

Centering...
Calculating eigenvectors...
	by calculating gram matrices and then eigendecomposing...
Calculating eigenvectors for axis='obs'
Calculating eigenvectors for axis='var'
Calculating eigenvalues...
@0: -566088.016171067 (-566094.2859140849 + 6.269743017843583 + 0) ∆inf
Converged! (@10: -589782.8896450901)
Recomposing sparse precisions...
Converting back to AnnData...


AnnData object with n_obs × n_vars = 1054 × 564
    uns: 'log1p', 'metagenomics_shotgun-var_neighbors_gmgm', 'obs_neighbors_gmgm', 'var_neighbors_gmgm'
    obsp: 'obs_gmgm_connectivities'
    varp: 'metagenomics_shotgun-var_gmgm_connectivities', 'var_gmgm_connectivities'

In [19]:
# Passing the AnnData makes calculate_assortativity read varp["var_gmgm_connectivities"].
calculate_assortativity(mudata["metagenomics_shotgun"])

{'Phylum': -0.021649610719968463,
 'Class': -0.06197070129673916,
 'Order': -0.18903785084844535,
 'Family': -0.01615142857896596,
 'Genus': -0.03081067929473613}

In [20]:
def to_time() -> None:
    """Run one non-verbose GmGM fit on the metagenomics modality alone
    (no nonparanormal skeptic).

    The return value is discarded; this function exists only to be timed.
    """
    metagenomics_only = mudata["metagenomics_shotgun"]
    gmgm_options = dict(
        to_keep={
            "var": 564 / 564**2,
            "obs": 1200 / 1054**2,
        },
        threshold_method="overall",
    )
    GmGM(metagenomics_only, **gmgm_options)

# Mean time in seconds over 10 repetitions of a single run each.
np.mean(timeit.repeat(to_time, repeat=10, number=1))

0.4672064041000027

## w/ nonpara, w/o metabol

In [21]:
# Fit GmGM on the metagenomics modality alone with use_nonparanormal_skeptic=True.
GmGM(
    mudata["metagenomics_shotgun"],
    verbose=True,
    use_nonparanormal_skeptic=True,
    #nonparanormal_evec_backend="COCA",
    #n_comps=50,
    to_keep={
        "var": 564 / 564**2,
        "obs": 1200 / 1054**2
    },
    threshold_method="overall",
    random_state=1
)

Centering...
Calculating eigenvectors...
	by calculating gram matrices and then eigendecomposing...
Calculating eigenvectors for axis='obs'
Calculating eigenvectors for axis='var'
Calculating eigenvalues...
@0: -566118.05331034 (-566124.2453072045 + 6.191996864548244 + 0) ∆inf
Converged! (@11: -590597.1354492947)
Recomposing sparse precisions...
Converting back to AnnData...


AnnData object with n_obs × n_vars = 1054 × 564
    uns: 'log1p', 'metagenomics_shotgun-var_neighbors_gmgm', 'obs_neighbors_gmgm', 'var_neighbors_gmgm'
    obsp: 'obs_gmgm_connectivities'
    varp: 'metagenomics_shotgun-var_gmgm_connectivities', 'var_gmgm_connectivities'

In [22]:
# Assortativity per taxonomic rank for the metagenomics-only, nonparanormal-skeptic fit.
calculate_assortativity(mudata["metagenomics_shotgun"])

{'Phylum': -0.04382048233715373,
 'Class': -0.08540100047093065,
 'Order': -0.1049426423563925,
 'Family': -0.0333123983701039,
 'Genus': -0.0561269868760198}

In [23]:
def to_time() -> None:
    """Run one non-verbose GmGM fit on the metagenomics modality alone with
    the nonparanormal skeptic.

    The return value is discarded; this function exists only to be timed.
    """
    metagenomics_only = mudata["metagenomics_shotgun"]
    gmgm_options = dict(
        use_nonparanormal_skeptic=True,
        to_keep={
            "var": 564 / 564**2,
            "obs": 1200 / 1054**2,
        },
        threshold_method="overall",
    )
    GmGM(metagenomics_only, **gmgm_options)

# Mean time in seconds over 10 repetitions of a single run each.
np.mean(timeit.repeat(to_time, repeat=10, number=1))

0.46189837929999555

## w/o nonpara, w/o metabol, 50pc

In [24]:
# Fit GmGM on the metagenomics modality alone, without the nonparanormal
# skeptic, with n_comps=50.
GmGM(
    mudata["metagenomics_shotgun"],
    verbose=True,
    #use_nonparanormal_skeptic=True,
    #nonparanormal_evec_backend="COCA",
    n_comps=50,
    to_keep={
        "var": 564 / 564**2,
        "obs": 1200 / 1054**2
    },
    threshold_method="overall",
    random_state=1
)

Centering...
Calculating eigenvectors...
	by calculating SVD...
Calculating eigenvalues...
@0: 13946029.171802273 (-1686.31228719868 + 13947715.484089471 + 0) ∆inf
Converged! (@16: 8755477.424389336)
Recomposing sparse precisions...
Converting back to AnnData...


AnnData object with n_obs × n_vars = 1054 × 564
    uns: 'log1p', 'metagenomics_shotgun-var_neighbors_gmgm', 'obs_neighbors_gmgm', 'var_neighbors_gmgm'
    obsp: 'obs_gmgm_connectivities'
    varp: 'metagenomics_shotgun-var_gmgm_connectivities', 'var_gmgm_connectivities'

In [25]:
# Assortativity per taxonomic rank for the metagenomics-only, n_comps=50 fit.
calculate_assortativity(mudata["metagenomics_shotgun"])

{'Phylum': 0.23863888450400902,
 'Class': 0.18863235577589343,
 'Order': 0.06437958593172599,
 'Family': 0.002148914857695508,
 'Genus': 0.06583250586397618}

In [26]:
def to_time() -> None:
    """Run one non-verbose GmGM fit on the metagenomics modality alone with
    ``n_comps=50``.

    The return value is discarded; this function exists only to be timed.
    """
    metagenomics_only = mudata["metagenomics_shotgun"]
    gmgm_options = dict(
        n_comps=50,
        to_keep={
            "var": 564 / 564**2,
            "obs": 1200 / 1054**2,
        },
        threshold_method="overall",
    )
    GmGM(metagenomics_only, **gmgm_options)

# Mean time in seconds over 10 repetitions of a single run each.
np.mean(timeit.repeat(to_time, repeat=10, number=1))

0.25880185830000074

## w/ nonpara, w/o metabol, 50pc

In [27]:
# Fit GmGM on the metagenomics modality alone with the nonparanormal skeptic,
# the "COCA" eigenvector backend and n_comps=50.
GmGM(
    mudata["metagenomics_shotgun"],
    verbose=True,
    use_nonparanormal_skeptic=True,
    nonparanormal_evec_backend="COCA",
    n_comps=50,
    to_keep={
        "var": 564 / 564**2,
        "obs": 1200 / 1054**2
    },
    threshold_method="overall",
    random_state=1
)

Centering...
Calculating eigenvectors...
	by calculating left eigenvectors of concatenated matricizations...
Calculating eigenvalues...
@0: 539021.0291699229 (-1718.0169013216764 + 540739.0460712446 + 0) ∆inf
Converged! (@19: 342571.0558740325)
Recomposing sparse precisions...
Converting back to AnnData...


AnnData object with n_obs × n_vars = 1054 × 564
    uns: 'log1p', 'metagenomics_shotgun-var_neighbors_gmgm', 'obs_neighbors_gmgm', 'var_neighbors_gmgm'
    obsp: 'obs_gmgm_connectivities'
    varp: 'metagenomics_shotgun-var_gmgm_connectivities', 'var_gmgm_connectivities'

In [28]:
# Assortativity per taxonomic rank for the metagenomics-only, nonparanormal, n_comps=50 fit.
calculate_assortativity(mudata["metagenomics_shotgun"])

{'Phylum': 0.19593386742804966,
 'Class': 0.15839392895398652,
 'Order': -0.04244534699603407,
 'Family': -0.05299620892711073,
 'Genus': 0.024867127634415793}

In [29]:
def to_time() -> None:
    """Run one non-verbose GmGM fit on the metagenomics modality alone:
    nonparanormal skeptic, "COCA" eigenvector backend, ``n_comps=50``.

    The return value is discarded; this function exists only to be timed.
    """
    metagenomics_only = mudata["metagenomics_shotgun"]
    gmgm_options = dict(
        use_nonparanormal_skeptic=True,
        nonparanormal_evec_backend="COCA",
        n_comps=50,
        to_keep={
            "var": 564 / 564**2,
            "obs": 1200 / 1054**2,
        },
        threshold_method="overall",
    )
    GmGM(metagenomics_only, **gmgm_options)

# Mean time in seconds over 10 repetitions of a single run each.
np.mean(timeit.repeat(to_time, repeat=10, number=1))

2.300499075100004

# TeraLasso

## w/o nonpara, w/o metabol

In [30]:
# Run TeraLasso on the metagenomics modality alone and convert the result
# back to an AnnData for inspection.
ds = Dataset.from_AnnData(mudata["metagenomics_shotgun"])
# Tuning notes for the second argument (94.23 here); the counts are the number
# of stored elements in the resulting var matrix:
#   94.235 yields 4074 (a big difference!), yet 94.24 yields 2774 and
#   94.23 yields 3014, so the count is very non-monotonic in this value.
#   A value of 10 takes 70 minutes to run.
# NOTE(review): presumably this argument is the regularization strength - confirm
# against the TeraLasso documentation.
TeraLasso(
    ds,
    94.23
    #use_nonparanormal_skeptic=True,
)
test = ds.to_AnnData()
test



AnnData object with n_obs × n_vars = 1054 × 564
    uns: 'log1p', 'metagenomics_shotgun-var_neighbors_gmgm', 'obs_neighbors_gmgm', 'var_neighbors_gmgm'
    obsp: 'obs_gmgm_connectivities'
    varp: 'metagenomics_shotgun-var_gmgm_connectivities', 'var_gmgm_connectivities'

In [31]:
# Scratch check: total number of entries in a 564 x 564 matrix.
# Author's target: want 1200 * 2 + 564 = 2964
# NOTE(review): presumably 2964 is the desired number of stored elements,
# to be compared with the counts quoted in the TeraLasso tuning notes - confirm.
564 * 564

318096

In [32]:
# Display the sparse var-axis matrix; its repr reports the number of stored elements.
test.varp["var_gmgm_connectivities"]

<564x564 sparse array of type '<class 'numpy.float64'>'
	with 3014 stored elements in Compressed Sparse Row format>

In [33]:
# Assortativity per taxonomic rank for the TeraLasso result held in `test`.
calculate_assortativity(test)

{'Phylum': -0.05487204114529791,
 'Class': -0.04148075022533486,
 'Order': 0.027646601890228766,
 'Family': 0.003609222295943036,
 'Genus': 0.0017937565985015316}

In [38]:
def to_time() -> None:
    """Build a fresh Dataset from the metagenomics modality and run TeraLasso
    on it once with the value 94.23.

    Nothing is returned; this function exists only to be timed. The value
    94.23 is the one used for the TeraLasso fit without the nonparanormal
    skeptic in this notebook.
    """
    dataset = Dataset.from_AnnData(mudata["metagenomics_shotgun"])
    TeraLasso(dataset, 94.23)

# Mean time in seconds over 10 repetitions of a single run each.
np.mean(timeit.repeat(to_time, repeat=10, number=1))

32.88117337499996

## w/ nonpara, w/o metabol

In [35]:
# Run TeraLasso with use_nonparanormal_skeptic=True on the metagenomics
# modality alone and convert the result back to an AnnData for inspection.
ds = Dataset.from_AnnData(mudata["metagenomics_shotgun"])
# Tuning notes for the second argument (2e-8 here):
#   it seems to produce no result for 2e-8 and a full result for 1.75e-8;
#   1.9e-8 gives 572, 1.8e-9 gives 576.
# NOTE(review): "1.8e-9" lies outside the 1.75e-8 .. 2e-8 range discussed here
# and is presumably a typo for 1.8e-8 - confirm before relying on it.
TeraLasso(
    ds,
    2e-8,
    use_nonparanormal_skeptic=True,
)
test = ds.to_AnnData()
test



AnnData object with n_obs × n_vars = 1054 × 564
    uns: 'log1p', 'metagenomics_shotgun-var_neighbors_gmgm', 'obs_neighbors_gmgm', 'var_neighbors_gmgm'
    obsp: 'obs_gmgm_connectivities'
    varp: 'metagenomics_shotgun-var_gmgm_connectivities', 'var_gmgm_connectivities'

In [36]:
# Display the sparse var-axis matrix; its repr reports the number of stored elements.
test.varp["var_gmgm_connectivities"]

<564x564 sparse array of type '<class 'numpy.float64'>'
	with 564 stored elements in Compressed Sparse Row format>

In [37]:
# Every rank comes back as NaN for this result.
# NOTE(review): the matrix holds only 564 stored elements for 564 variables,
# presumably just the diagonal, i.e. a graph with no edges - confirm.
calculate_assortativity(test)

{'Phylum': nan, 'Class': nan, 'Order': nan, 'Family': nan, 'Genus': nan}