In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scvelo as scv
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pickle
from eval_utils import cross_boundary_correctness
import matplotlib.pyplot as plt
import pandas as pd
import pyrovelocity
import unitvelo as utv
from os.path import exists
from pyrovelocity.api import train_model
method = 'pyroVelocity_model2'

(Running UniTVelo 0.2.5.1)
2023-05-17 09:22:59


In [2]:
def compute_mean_vector_field(
    pos,
    adata,
    basis="umap",
    n_jobs=1,
    spliced="spliced_pyro",
    raw=False,
):
    scv.pp.neighbors(adata, use_rep="pca")

    adata.var["velocity_genes"] = True

    if spliced == "spliced_pyro":
        if raw:
            ut = pos["ut"]
            st = pos["st"]
            ut = ut / ut.sum(axis=-1, keepdims=True)
            st = st / st.sum(axis=-1, keepdims=True)
        else:
            ut = pos["ut"]
            st = pos["st"]
        adata.layers["spliced_pyro"] = st.mean(0).squeeze()
        # if ('u_scale' in pos) and ('s_scale' in pos): # TODO: two scale for Normal distribution
        if "u_scale" in pos:  # only one scale for Poisson distribution
            adata.layers["velocity_pyro"] = (
                ut * pos["beta"] / pos["u_scale"] - st * pos["gamma"]
            ).mean(0)
        else:
            if "beta_k" in pos:
                adata.layers["velocity_pyro"] = (
                    (ut * pos["beta_k"] - pos["st"] * pos["gamma_k"]).mean(0).squeeze()
                )
            else:
                adata.layers["velocity_pyro"] = (
                    ut * pos["beta"] - pos["st"] * pos["gamma"]
                ).mean(0)
        scv.tl.velocity_graph(
            adata, vkey="velocity_pyro", xkey="spliced_pyro", n_jobs=n_jobs
        )
    elif spliced in ["Ms"]:
        ut = adata.layers["Mu"]
        st = adata.layers["Ms"]
        if ("u_scale" in pos) and ("s_scale" in pos):
            adata.layers["velocity_pyro"] = (
                ut * pos["beta"] / (pos["u_scale"] / pos["s_scale"]) - st * pos["gamma"]
            ).mean(0)
        else:
            adata.layers["velocity_pyro"] = (
                ut * pos["beta"] - pos["st"] * pos["gamma"]
            ).mean(0)
        scv.tl.velocity_graph(adata, vkey="velocity_pyro", xkey="Ms", n_jobs=n_jobs)
    elif spliced in ["spliced"]:
        ut = adata.layers["unspliced"]
        st = adata.layers["spliced"]
        if ("u_scale" in pos) and ("s_scale" in pos):
            adata.layers["velocity_pyro"] = (
                ut * pos["beta"] / (pos["u_scale"] / pos["s_scale"]) - st * pos["gamma"]
            ).mean(0)
        else:
            adata.layers["velocity_pyro"] = (
                ut * pos["beta"] - pos["st"] * pos["gamma"]
            ).mean(0)
        scv.tl.velocity_graph(
            adata, vkey="velocity_pyro", xkey="spliced", n_jobs=n_jobs
        )

    scv.tl.velocity_embedding(adata, vkey="velocity_pyro", basis=basis)

In [3]:
datasets = ['Pancreas_with_cc', 'HumanDevelopingBrain', 'DentateGyrus' , 'MouseBoneMarrow', 'MouseErythroid', 'HumanBoneMarrow']
data_dir = '/nfs/team283/aa16/data/fate_benchmarking/benchmarking_datasets/'
save_dir = '/nfs/team283/aa16/data/fate_benchmarking/benchmarking_results/'

In [None]:
for dataset in datasets:
    print(dataset)
    adata = sc.read_h5ad(data_dir + dataset + '/' + dataset + '_anndata.h5ad')
    adata.layers['raw_spliced']   = adata.layers['spliced']
    adata.layers['raw_unspliced'] = adata.layers['unspliced']
    scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=3000)
    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    adata.obs['u_lib_size_raw'] = adata.layers['raw_unspliced'].toarray().sum(-1)
    adata.obs['s_lib_size_raw'] = adata.layers['raw_spliced'].toarray().sum(-1)
    # Model 1
    num_epochs = 1000 # large data
    adata_model_pos = train_model(adata,
                                   max_epochs=num_epochs, svi_train=True, log_every=100,
                                   patient_init=45,
                                   batch_size=4000, use_gpu=0, cell_state='state_info',
                                   include_prior=True,
                                   offset=True,
                                   library_size=True,
                                   patient_improve=1e-3,
                                   model_type='auto',
                                   guide_type='auto',
                                   train_size=1.0)
    compute_mean_vector_field(adata_model_pos[1], adata)
    scv.tl.velocity_graph(adata, vkey = 'velocity')
    scv.tl.velocity_embedding(adata, vkey = 'velocity')
    fix, ax = plt.subplots(1, 1, figsize = (8, 6))
    scv.pl.velocity_embedding_stream(adata, basis='umap', save = False, vkey='velocity',
                                     show = False, ax = ax)
    plt.savefig(save_dir + 'UMAPs/' + dataset + '_UMAP_pyroVelocity_model2.svg')
    # Calculate performance metrics:
    file = open(data_dir + dataset + '/' + dataset + '_groundTruth.pickle' ,'rb')
    ground_truth = pickle.load(file)
    metrics = utv.evaluate(adata, ground_truth, 'clusters', 'velocity')
    if exists(save_dir + dataset + '_CBDC_scores.csv'):
        tab = pd.read_csv(save_dir + dataset + '_CBDC_scores.csv', index_col = 0)
    else:
        tab = pd.DataFrame(columns = list(metrics['Cross-Boundary Direction Correctness (A->B)'].keys()) + ['Mean'],
                 index = [method])
    cb_score = [np.mean(metrics['Cross-Boundary Direction Correctness (A->B)'][x])
                for x in metrics['Cross-Boundary Direction Correctness (A->B)'].keys()]
    tab.loc[method,:] = cb_score + [np.mean(cb_score)]
    tab.to_csv(save_dir + dataset + '_CBDC_scores.csv')
    metrics = utv.evaluate(adata, ground_truth, 'clusters', 'velocity')
    if exists(save_dir + dataset + '_ICC_scores.csv'):
        tab = pd.read_csv(save_dir + dataset + '_ICC_scores.csv', index_col = 0)
    else:
        tab = pd.DataFrame(columns = list(np.unique(np.concatenate(ground_truth))) + ['Mean'],
                 index = [method])
    icc_score = [np.mean(metrics['In-cluster Coherence'][x]) for x in np.unique(np.concatenate(ground_truth))]
    tab.loc[method,:] = icc_score + [np.mean(icc_score)]
    tab.to_csv(save_dir + dataset + '_ICC_scores.csv')
    fix, ax = plt.subplots(1, 1, figsize = (8, 6))
    scv.pl.velocity_embedding_stream(adata, basis='umap', save = False, vkey='velocity',
                                     show = False, ax = ax)
    plt.savefig(save_dir + 'UMAPs/' + dataset + '_UMAP_' + method + '.svg')

Pancreas_with_cc
Filtered out 20801 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 3000 highly variable genes.
Logarithmized X.
computing neighbors
    finished (0:00:05) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                                                  
[34mINFO    [0m No label_key inputted, assuming all cells have same label                                                 
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_unspliced"[0m[1m][0m                                                             
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_spliced"[0m[1m][0m                                                         

-----------
auto
auto
TraceEnum
step    0 loss = 4.81905 patience = 45
step  100 loss = 3.59446 patience = 45
step  200 loss = 3.13795 patience = 45
step  300 loss = 2.80305 patience = 44
step  400 loss = 2.54211 patience = 45
step  500 loss = 2.33755 patience = 44
step  600 loss = 2.17655 patience = 41
step  700 loss = 2.05078 patience = 39
step  800 loss = 1.9542 patience = 44
step  900 loss = 1.87587 patience = 34
computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing velocity graph (using 1/64 cores)


  0%|          | 0/3696 [00:00<?, ?cells/s]

    finished (0:00:22) --> added 
    'velocity_pyro_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:00) --> added
    'velocity_pyro_umap', embedded velocity vectors (adata.obsm)
computing velocities
    finished (0:00:01) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/64 cores)


  0%|          | 0/3696 [00:00<?, ?cells/s]

    finished (0:00:09) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:00) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)
# Cross-Boundary Direction Correctness (A->B)
{('Ngn3 high EP', 'Pre-endocrine'): 0.757007429838491, ('Pre-endocrine', 'Alpha'): 0.7275974142910995, ('Pre-endocrine', 'Beta'): 0.8027938271018766, ('Pre-endocrine', 'Delta'): 0.5254597874024217, ('Pre-endocrine', 'Epsilon'): 0.06583665122574334}
Total Mean: 0.5757390219719264
# In-cluster Coherence
{'Alpha': 0.74785024, 'Beta': 0.7475013, 'Delta': 0.8006084, 'Ductal': 0.97006565, 'Epsilon': 0.8516652, 'Ngn3 high EP': 0.954702, 'Ngn3 low EP': 0.9747722, 'Pre-endocrine': 0.8249896}
Total Mean: 0.8590192794799805
# Cross-Boundary Direction Correctness (A->B)
{('Ngn3 high EP', 'Pre-endocrine'): 0.757007429838491, ('Pre-endocrine', 'Alpha'): 0.7275974142910995, ('Pre-endocrine', 'Beta'): 0.8027938271018766, ('Pre-endo

INFO:scvi.data._anndata:No batch_key inputted, assuming all cells are same batch


[34mINFO    [0m No label_key inputted, assuming all cells have same label                                                 


INFO:scvi.data._anndata:No label_key inputted, assuming all cells have same label


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_unspliced"[0m[1m][0m                                                             


INFO:scvi.data._anndata:Using data from adata.layers["raw_unspliced"]


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_spliced"[0m[1m][0m                                                               


INFO:scvi.data._anndata:Using data from adata.layers["raw_spliced"]


[34mINFO    [0m Successfully registered anndata object containing [1;36m9443[0m cells, [1;36m3000[0m vars, [1;36m1[0m batches, [1;36m1[0m labels, and [1;36m0[0m       
         proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra continuous covariates.               


INFO:scvi.data._anndata:Successfully registered anndata object containing 9443 cells, 3000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates.


[34mINFO    [0m Please do not further modify adata until model is trained.                                                


INFO:scvi.data._anndata:Please do not further modify adata until model is trained.


-----------
auto
auto
step    0 loss = 2.06738e+08 patience = 45
step  100 loss = 1.42553e+08 patience = 45
step  200 loss = 1.25188e+08 patience = 42
step  300 loss = 1.16894e+08 patience = 17
step  400 loss = 1.12789e+08 patience = 30
computing neighbors
    finished (0:00:02) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing velocity graph (using 1/64 cores)


  0%|          | 0/9443 [00:00<?, ?cells/s]

    finished (0:01:11) --> added 
    'velocity_pyro_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:01) --> added
    'velocity_pyro_umap', embedded velocity vectors (adata.obsm)
computing velocities
    finished (0:00:02) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/64 cores)


  0%|          | 0/9443 [00:00<?, ?cells/s]

    finished (0:00:11) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:01) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)
# Cross-Boundary Direction Correctness (A->B)
{('Radial Glia', 'Intermediate \n Progenitor'): -0.2045504988325818, ('Intermediate \n Progenitor', 'Newborn \n Excitatory Neuron'): -0.1543549649243018, ('Newborn \n Excitatory Neuron', 'Immature \n Excitatory Neuron'): 0.47560031566025945, ('Immature \n Excitatory Neuron', 'Mature \n Excitatory Neuron'): 0.8203731205800825}
Total Mean: 0.23426699312086458
# In-cluster Coherence
{'Immature \n Excitatory Neuron': 0.7528147, 'Intermediate \n Progenitor': 0.8138593, 'Mature \n Excitatory Neuron': 0.76348907, 'Newborn \n Excitatory Neuron': 0.8205157, 'Radial Glia': 0.83167267}
Total Mean: 0.7964702844619751
# Cross-Boundary Direction Correctness (A->B)
{('Radial Glia', 'Intermediate \n Progenitor'): -0.204550498832581

INFO:scvi.data._anndata:No batch_key inputted, assuming all cells are same batch


[34mINFO    [0m No label_key inputted, assuming all cells have same label                                                 


INFO:scvi.data._anndata:No label_key inputted, assuming all cells have same label


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_unspliced"[0m[1m][0m                                                             


INFO:scvi.data._anndata:Using data from adata.layers["raw_unspliced"]


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_spliced"[0m[1m][0m                                                               


INFO:scvi.data._anndata:Using data from adata.layers["raw_spliced"]


[34mINFO    [0m Successfully registered anndata object containing [1;36m2930[0m cells, [1;36m3000[0m vars, [1;36m1[0m batches, [1;36m1[0m labels, and [1;36m0[0m       
         proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra continuous covariates.               


INFO:scvi.data._anndata:Successfully registered anndata object containing 2930 cells, 3000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates.


[34mINFO    [0m Please do not further modify adata until model is trained.                                                


INFO:scvi.data._anndata:Please do not further modify adata until model is trained.


-----------
auto
auto
TraceEnum
step    0 loss = 2.33752 patience = 45
step  100 loss = 1.8051 patience = 45
step  200 loss = 1.58678 patience = 45
step  300 loss = 1.42085 patience = 44
step  400 loss = 1.29134 patience = 45
step  500 loss = 1.1887 patience = 44
step  600 loss = 1.10866 patience = 45
step  700 loss = 1.04631 patience = 38
step  800 loss = 0.998181 patience = 44
step  900 loss = 0.960345 patience = 37
computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing velocity graph (using 1/64 cores)


  0%|          | 0/2930 [00:00<?, ?cells/s]

    finished (0:00:19) --> added 
    'velocity_pyro_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:00) --> added
    'velocity_pyro_umap', embedded velocity vectors (adata.obsm)
computing velocities
    finished (0:00:00) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/64 cores)


  0%|          | 0/2930 [00:00<?, ?cells/s]