In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scvelo as scv
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pickle
from eval_utils import cross_boundary_correctness
import matplotlib.pyplot as plt
import pandas as pd
import pyrovelocity
import unitvelo as utv
from os.path import exists
from pyrovelocity.api import train_model
method = 'pyroVelocity_model1'

(Running UniTVelo 0.2.5.1)
2023-05-17 08:54:15


In [2]:
def compute_mean_vector_field(
    pos,
    adata,
    basis="umap",
    n_jobs=1,
    spliced="spliced_pyro",
    raw=False,
):
    scv.pp.neighbors(adata, use_rep="pca")

    adata.var["velocity_genes"] = True

    if spliced == "spliced_pyro":
        if raw:
            ut = pos["ut"]
            st = pos["st"]
            ut = ut / ut.sum(axis=-1, keepdims=True)
            st = st / st.sum(axis=-1, keepdims=True)
        else:
            ut = pos["ut"]
            st = pos["st"]
        adata.layers["spliced_pyro"] = st.mean(0).squeeze()
        # if ('u_scale' in pos) and ('s_scale' in pos): # TODO: two scale for Normal distribution
        if "u_scale" in pos:  # only one scale for Poisson distribution
            adata.layers["velocity_pyro"] = (
                ut * pos["beta"] / pos["u_scale"] - st * pos["gamma"]
            ).mean(0)
        else:
            if "beta_k" in pos:
                adata.layers["velocity_pyro"] = (
                    (ut * pos["beta_k"] - pos["st"] * pos["gamma_k"]).mean(0).squeeze()
                )
            else:
                adata.layers["velocity_pyro"] = (
                    ut * pos["beta"] - pos["st"] * pos["gamma"]
                ).mean(0)
        scv.tl.velocity_graph(
            adata, vkey="velocity_pyro", xkey="spliced_pyro", n_jobs=n_jobs
        )
    elif spliced in ["Ms"]:
        ut = adata.layers["Mu"]
        st = adata.layers["Ms"]
        if ("u_scale" in pos) and ("s_scale" in pos):
            adata.layers["velocity_pyro"] = (
                ut * pos["beta"] / (pos["u_scale"] / pos["s_scale"]) - st * pos["gamma"]
            ).mean(0)
        else:
            adata.layers["velocity_pyro"] = (
                ut * pos["beta"] - pos["st"] * pos["gamma"]
            ).mean(0)
        scv.tl.velocity_graph(adata, vkey="velocity_pyro", xkey="Ms", n_jobs=n_jobs)
    elif spliced in ["spliced"]:
        ut = adata.layers["unspliced"]
        st = adata.layers["spliced"]
        if ("u_scale" in pos) and ("s_scale" in pos):
            adata.layers["velocity_pyro"] = (
                ut * pos["beta"] / (pos["u_scale"] / pos["s_scale"]) - st * pos["gamma"]
            ).mean(0)
        else:
            adata.layers["velocity_pyro"] = (
                ut * pos["beta"] - pos["st"] * pos["gamma"]
            ).mean(0)
        scv.tl.velocity_graph(
            adata, vkey="velocity_pyro", xkey="spliced", n_jobs=n_jobs
        )

    scv.tl.velocity_embedding(adata, vkey="velocity_pyro", basis=basis)

In [3]:
datasets = ['Pancreas_with_cc', 'HumanDevelopingBrain', 'DentateGyrus' , 'MouseBoneMarrow', 'MouseErythroid', 'HumanBoneMarrow']
data_dir = '/nfs/team283/aa16/data/fate_benchmarking/benchmarking_datasets/'
save_dir = '/nfs/team283/aa16/data/fate_benchmarking/benchmarking_results/'

In [None]:
for dataset in datasets:
    print(dataset)
    adata = sc.read_h5ad(data_dir + dataset + '/' + dataset + '_anndata.h5ad')
    adata.layers['raw_spliced']   = adata.layers['spliced']
    adata.layers['raw_unspliced'] = adata.layers['unspliced']
    scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=3000)
    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    adata.obs['u_lib_size_raw'] = adata.layers['raw_unspliced'].toarray().sum(-1)
    adata.obs['s_lib_size_raw'] = adata.layers['raw_spliced'].toarray().sum(-1)
    # Model 1
    num_epochs = 1000 # large data
    adata_model_pos = train_model(adata,
                                   max_epochs=num_epochs, svi_train=True, log_every=100,
                                   patient_init=45,
                                   batch_size=4000, use_gpu=0, cell_state='state_info',
                                   include_prior=True,
                                   offset=False,
                                   library_size=True,
                                   patient_improve=1e-3,
                                   model_type='auto',
                                   guide_type='auto_t0_constraint',
                                   train_size=1.0,
                                   num_samples = 30)
    compute_mean_vector_field(adata_model_pos[1], adata)
    scv.tl.velocity_graph(adata, vkey = 'velocity')
    scv.tl.velocity_embedding(adata, vkey = 'velocity')
    fix, ax = plt.subplots(1, 1, figsize = (8, 6))
    scv.pl.velocity_embedding_stream(adata, basis='umap', save = False, vkey='velocity',
                                     show = False, ax = ax)
    plt.savefig(save_dir + 'UMAPs/' + dataset + '_UMAP_pyroVelocity_model1.svg')
    # Calculate performance metrics:
    file = open(data_dir + dataset + '/' + dataset + '_groundTruth.pickle' ,'rb')
    ground_truth = pickle.load(file)
    metrics = utv.evaluate(adata, ground_truth, 'clusters', 'velocity')
    if exists(save_dir + dataset + '_CBDC_scores.csv'):
        tab = pd.read_csv(save_dir + dataset + '_CBDC_scores.csv', index_col = 0)
    else:
        tab = pd.DataFrame(columns = list(metrics['Cross-Boundary Direction Correctness (A->B)'].keys()) + ['Mean'],
                 index = [method])
    cb_score = [np.mean(metrics['Cross-Boundary Direction Correctness (A->B)'][x])
                for x in metrics['Cross-Boundary Direction Correctness (A->B)'].keys()]
    tab.loc[method,:] = cb_score + [np.mean(cb_score)]
    tab.to_csv(save_dir + dataset + '_CBDC_scores.csv')
    metrics = utv.evaluate(adata, ground_truth, 'clusters', 'velocity')
    if exists(save_dir + dataset + '_ICC_scores.csv'):
        tab = pd.read_csv(save_dir + dataset + '_ICC_scores.csv', index_col = 0)
    else:
        tab = pd.DataFrame(columns = list(np.unique(np.concatenate(ground_truth))) + ['Mean'],
                 index = [method])
    icc_score = [np.mean(metrics['In-cluster Coherence'][x]) for x in np.unique(np.concatenate(ground_truth))]
    tab.loc[method,:] = icc_score + [np.mean(icc_score)]
    tab.to_csv(save_dir + dataset + '_ICC_scores.csv')
    fix, ax = plt.subplots(1, 1, figsize = (8, 6))
    scv.pl.velocity_embedding_stream(adata, basis='umap', save = False, vkey='velocity',
                                     show = False, ax = ax)
    plt.savefig(save_dir + 'UMAPs/' + dataset + '_UMAP_' + method + '.svg')

Pancreas_with_cc
Filtered out 20801 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 3000 highly variable genes.
Logarithmized X.
computing neighbors
    finished (0:00:08) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                                                  
[34mINFO    [0m No label_key inputted, assuming all cells have same label                                                 
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_unspliced"[0m[1m][0m                                                             
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_spliced"[0m[1m][0m                                                         

-----------
auto
auto_t0_constraint
TraceEnum
step    0 loss = 5.35098 patience = 45
step  100 loss = 3.98398 patience = 45
step  200 loss = 3.47209 patience = 45
step  300 loss = 3.12926 patience = 45
step  400 loss = 2.85263 patience = 45
step  500 loss = 2.63958 patience = 44
step  600 loss = 2.48013 patience = 42
step  700 loss = 2.35706 patience = 38
step  800 loss = 2.26031 patience = 42
step  900 loss = 2.18532 patience = 42
computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing velocity graph (using 1/64 cores)


  0%|          | 0/3696 [00:00<?, ?cells/s]

    finished (0:00:22) --> added 
    'velocity_pyro_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:00) --> added
    'velocity_pyro_umap', embedded velocity vectors (adata.obsm)
computing velocities
    finished (0:00:01) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/64 cores)


  0%|          | 0/3696 [00:00<?, ?cells/s]

    finished (0:00:09) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:00) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)
# Cross-Boundary Direction Correctness (A->B)
{('Ngn3 high EP', 'Pre-endocrine'): 0.7565538085839436, ('Pre-endocrine', 'Alpha'): 0.7547919232220048, ('Pre-endocrine', 'Beta'): 0.8045768861420414, ('Pre-endocrine', 'Delta'): 0.5355338545330987, ('Pre-endocrine', 'Epsilon'): -0.0019028248172510622}
Total Mean: 0.5699107295327674
# In-cluster Coherence
{'Alpha': 0.74785024, 'Beta': 0.7475013, 'Delta': 0.8006084, 'Ductal': 0.97006565, 'Epsilon': 0.8516652, 'Ngn3 high EP': 0.954702, 'Ngn3 low EP': 0.9747722, 'Pre-endocrine': 0.8249896}
Total Mean: 0.8590192794799805
# Cross-Boundary Direction Correctness (A->B)
{('Ngn3 high EP', 'Pre-endocrine'): 0.7565538085839436, ('Pre-endocrine', 'Alpha'): 0.7547919232220048, ('Pre-endocrine', 'Beta'): 0.8045768861420414, ('Pre

INFO:scvi.data._anndata:No batch_key inputted, assuming all cells are same batch


[34mINFO    [0m No label_key inputted, assuming all cells have same label                                                 


INFO:scvi.data._anndata:No label_key inputted, assuming all cells have same label


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_unspliced"[0m[1m][0m                                                             


INFO:scvi.data._anndata:Using data from adata.layers["raw_unspliced"]


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_spliced"[0m[1m][0m                                                               


INFO:scvi.data._anndata:Using data from adata.layers["raw_spliced"]


[34mINFO    [0m Successfully registered anndata object containing [1;36m9443[0m cells, [1;36m3000[0m vars, [1;36m1[0m batches, [1;36m1[0m labels, and [1;36m0[0m       
         proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra continuous covariates.               


INFO:scvi.data._anndata:Successfully registered anndata object containing 9443 cells, 3000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates.


[34mINFO    [0m Please do not further modify adata until model is trained.                                                


INFO:scvi.data._anndata:Please do not further modify adata until model is trained.


-----------
auto
auto_t0_constraint
step    0 loss = 2.37333e+08 patience = 45
step  100 loss = 1.64495e+08 patience = 45
step  200 loss = 1.46978e+08 patience = 44
step  300 loss = 1.37114e+08 patience = 45
step  400 loss = 1.32361e+08 patience = 45
step  500 loss = 1.30253e+08 patience = 43
step  600 loss = 1.29309e+08 patience = 45
step  700 loss = 1.28843e+08 patience = 37
step  800 loss = 1.28565e+08 patience = 34
step  900 loss = 1.28286e+08 patience = 41
computing neighbors
    finished (0:00:02) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing velocity graph (using 1/64 cores)


  0%|          | 0/9443 [00:00<?, ?cells/s]

    finished (0:01:10) --> added 
    'velocity_pyro_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:01) --> added
    'velocity_pyro_umap', embedded velocity vectors (adata.obsm)
computing velocities
    finished (0:00:02) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/64 cores)


  0%|          | 0/9443 [00:00<?, ?cells/s]

    finished (0:00:12) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:01) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)
# Cross-Boundary Direction Correctness (A->B)
{('Radial Glia', 'Intermediate \n Progenitor'): 0.28143659827798806, ('Intermediate \n Progenitor', 'Newborn \n Excitatory Neuron'): 0.5685454784439549, ('Newborn \n Excitatory Neuron', 'Immature \n Excitatory Neuron'): 0.13248667544783266, ('Immature \n Excitatory Neuron', 'Mature \n Excitatory Neuron'): 0.6534153862776445}
Total Mean: 0.408971034611855
# In-cluster Coherence
{'Immature \n Excitatory Neuron': 0.7528147, 'Intermediate \n Progenitor': 0.8138593, 'Mature \n Excitatory Neuron': 0.76348907, 'Newborn \n Excitatory Neuron': 0.8205157, 'Radial Glia': 0.83167267}
Total Mean: 0.7964702844619751
# Cross-Boundary Direction Correctness (A->B)
{('Radial Glia', 'Intermediate \n Progenitor'): 0.28143659827798806, 

INFO:scvi.data._anndata:No batch_key inputted, assuming all cells are same batch


[34mINFO    [0m No label_key inputted, assuming all cells have same label                                                 


INFO:scvi.data._anndata:No label_key inputted, assuming all cells have same label


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_unspliced"[0m[1m][0m                                                             


INFO:scvi.data._anndata:Using data from adata.layers["raw_unspliced"]


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_spliced"[0m[1m][0m                                                               


INFO:scvi.data._anndata:Using data from adata.layers["raw_spliced"]


[34mINFO    [0m Successfully registered anndata object containing [1;36m2930[0m cells, [1;36m3000[0m vars, [1;36m1[0m batches, [1;36m1[0m labels, and [1;36m0[0m       
         proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra continuous covariates.               


INFO:scvi.data._anndata:Successfully registered anndata object containing 2930 cells, 3000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates.


[34mINFO    [0m Please do not further modify adata until model is trained.                                                


INFO:scvi.data._anndata:Please do not further modify adata until model is trained.


-----------
auto
auto_t0_constraint
TraceEnum
step    0 loss = 2.84561 patience = 45
step  100 loss = 2.07316 patience = 45
step  200 loss = 1.80606 patience = 45
step  300 loss = 1.64885 patience = 45
step  400 loss = 1.53706 patience = 45
step  500 loss = 1.44726 patience = 44
step  600 loss = 1.37741 patience = 42
step  700 loss = 1.32048 patience = 44
step  800 loss = 1.27551 patience = 42
step  900 loss = 1.23807 patience = 43
computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing velocity graph (using 1/64 cores)


  0%|          | 0/2930 [00:00<?, ?cells/s]

    finished (0:00:19) --> added 
    'velocity_pyro_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:00) --> added
    'velocity_pyro_umap', embedded velocity vectors (adata.obsm)
computing velocities
    finished (0:00:00) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/64 cores)


  0%|          | 0/2930 [00:00<?, ?cells/s]

    finished (0:00:04) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:00) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)
# Cross-Boundary Direction Correctness (A->B)
{('nIPC', 'Neuroblast'): 0.8801604394518381, ('Neuroblast', 'Granule immature'): 0.4348942569371744, ('Granule immature', 'Granule mature'): 0.11226860338655911, ('Radial Glia-like', 'Astrocytes'): -0.7080742423773644, ('OPC', 'OL'): -0.9715197899676468}
Total Mean: -0.05045414651388791
# In-cluster Coherence
{'Astrocytes': 0.85087585, 'Cajal Retzius': 0.9856572, 'Cck-Tox': 0.9375297, 'Endothelial': 0.9448638, 'GABA': 0.910822, 'Granule immature': 0.83902454, 'Granule mature': 0.79759127, 'Microglia': 0.97058135, 'Mossy': 0.90160036, 'Neuroblast': 0.92450315, 'OL': 0.9506559, 'OPC': 0.9175724, 'Radial Glia-like': 0.8933756, 'nIPC': 0.9384544}
Total Mean: 0.911650538444519
# Cross-Boundary Direction Correctness (A->B

INFO:scvi.data._anndata:No batch_key inputted, assuming all cells are same batch


[34mINFO    [0m No label_key inputted, assuming all cells have same label                                                 


INFO:scvi.data._anndata:No label_key inputted, assuming all cells have same label


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_unspliced"[0m[1m][0m                                                             


INFO:scvi.data._anndata:Using data from adata.layers["raw_unspliced"]


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_spliced"[0m[1m][0m                                                               


INFO:scvi.data._anndata:Using data from adata.layers["raw_spliced"]


[34mINFO    [0m Successfully registered anndata object containing [1;36m2600[0m cells, [1;36m1252[0m vars, [1;36m1[0m batches, [1;36m1[0m labels, and [1;36m0[0m       
         proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra continuous covariates.               


INFO:scvi.data._anndata:Successfully registered anndata object containing 2600 cells, 1252 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates.


[34mINFO    [0m Please do not further modify adata until model is trained.                                                


INFO:scvi.data._anndata:Please do not further modify adata until model is trained.


-----------
auto
auto_t0_constraint
TraceEnum
step    0 loss = 4.36631 patience = 45
step  100 loss = 3.07904 patience = 45
step  200 loss = 2.55102 patience = 45
step  300 loss = 2.24966 patience = 45
step  400 loss = 2.03977 patience = 45
step  500 loss = 1.88323 patience = 44
step  600 loss = 1.76618 patience = 44
step  700 loss = 1.67175 patience = 45
step  800 loss = 1.60103 patience = 43
step  900 loss = 1.54598 patience = 43
computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing velocity graph (using 1/64 cores)


  0%|          | 0/2600 [00:00<?, ?cells/s]

    finished (0:00:04) --> added 
    'velocity_pyro_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:00) --> added
    'velocity_pyro_umap', embedded velocity vectors (adata.obsm)
computing velocities
    finished (0:00:00) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/64 cores)


  0%|          | 0/2600 [00:00<?, ?cells/s]

    finished (0:00:02) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:00) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)
# Cross-Boundary Direction Correctness (A->B)
{('dividing', 'progenitors'): 0.8855158587699344, ('progenitors', 'activating'): 0.8400714529314554}
Total Mean: 0.8627936558506949
# In-cluster Coherence
{'B cell lineage': 0.78261846, 'activating': 0.6939748, 'dividing': 0.7032249, 'macrophages': 0.70824033, 'progenitors': 0.6754519}
Total Mean: 0.7127020955085754
# Cross-Boundary Direction Correctness (A->B)
{('dividing', 'progenitors'): 0.8855158587699344, ('progenitors', 'activating'): 0.8400714529314554}
Total Mean: 0.8627936558506949
# In-cluster Coherence
{'B cell lineage': 0.78261846, 'activating': 0.6939748, 'dividing': 0.7032249, 'macrophages': 0.70824033, 'progenitors': 0.6754519}
Total Mean: 0.7127020955085754
MouseErythroid
Filtered out 47456 genes tha

INFO:scvi.data._anndata:No batch_key inputted, assuming all cells are same batch


[34mINFO    [0m No label_key inputted, assuming all cells have same label                                                 


INFO:scvi.data._anndata:No label_key inputted, assuming all cells have same label


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_unspliced"[0m[1m][0m                                                             


INFO:scvi.data._anndata:Using data from adata.layers["raw_unspliced"]


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_spliced"[0m[1m][0m                                                               


INFO:scvi.data._anndata:Using data from adata.layers["raw_spliced"]


[34mINFO    [0m Successfully registered anndata object containing [1;36m9815[0m cells, [1;36m3000[0m vars, [1;36m1[0m batches, [1;36m1[0m labels, and [1;36m0[0m       
         proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra continuous covariates.               


INFO:scvi.data._anndata:Successfully registered anndata object containing 9815 cells, 3000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates.


[34mINFO    [0m Please do not further modify adata until model is trained.                                                


INFO:scvi.data._anndata:Please do not further modify adata until model is trained.


-----------
auto
auto_t0_constraint
step    0 loss = 7.15496e+08 patience = 45
step  100 loss = 4.69386e+08 patience = 45
step  200 loss = 4.22965e+08 patience = 44
step  300 loss = 4.00591e+08 patience = 44
step  400 loss = 3.89931e+08 patience = 45
step  500 loss = 3.84521e+08 patience = 35
step  600 loss = 3.82089e+08 patience = 45
step  700 loss = 3.81035e+08 patience = 43
step  800 loss = 3.80579e+08 patience = 45
step  900 loss = 3.80265e+08 patience = 41



KeyboardInterrupt



Error in callback <function _draw_all_if_interactive at 0x1490745f61f0> (for post_execute):
