In [1]:
import anndata
import numpy as np
import scvelo as scv
import scanpy as sc
import sys
import torch
import os.path
import celldancer as cd
import pickle5 as pickle
import matplotlib.pyplot as plt
import pandas as pd
import unitvelo as utv
from os.path import exists
import celldancer.cdplt as cdplt
from celldancer.cdplt import colormap
# Label of the benchmarked velocity method; used as the row index of the
# per-dataset score tables and as part of the output file names.
method = 'CellDancer'
import time
from celldancer.utilities import export_velocity_to_dynamo
import celldancer.utilities as cdutil

(Running UniTVelo 0.2.5.2)
2024-02-27 14:57:05


In [2]:
# Benchmark datasets to process. Each name is used both as a sub-directory of
# `data_dir` and as the prefix of the files read from that sub-directory.
datasets = [
    'MouseErythroid',
    'Pancreas_with_cc',
    'DentateGyrus',
    'MouseBoneMarrow',
    'HumanBoneMarrow',
    'HumanDevelopingBrain',
]

# Root directory of the inputs (AnnData files and ground-truth pickles).
data_dir = '/nfs/team283/aa16/data/fate_benchmarking/benchmarking_datasets/'

# Root directory of the outputs (score tables and UMAP figures).
save_dir = '/nfs/team283/aa16/data/fate_benchmarking/benchmarking_results_revision/'

In [None]:
# Benchmark the velocity method named by `method` on every dataset:
#   1. load the AnnData object and preprocess it with scVelo,
#   2. estimate gene and cell velocities with cellDancer (timed),
#   3. build the scVelo velocity graph/embedding and save a UMAP stream plot,
#   4. score the velocities against the ground truth with UniTVelo and
#      add/overwrite this method's row in the per-dataset score table,
#   5. write the annotated AnnData object to disk.
for dataset in datasets:
    print(dataset)
    dataset_prefix = data_dir + dataset + '/' + dataset
    scores_path = save_dir + dataset + '_CBDC_scores.csv'
    umap_dir = save_dir + 'UMAPs/'

    adata = sc.read_h5ad(dataset_prefix + '_anndata.h5ad')

    # ---- Timed section: preprocessing + cellDancer velocity estimation ----
    start = time.time()
    scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=3000)
    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    # NOTE(review): `save_path` is a fixed relative file name, so it is
    # presumably written to the working directory and overwritten for every
    # dataset -- confirm this is intended.
    input_data = cdutil.adata_to_df_with_embed(adata,
                                  us_para=['Mu','Ms'],
                                  cell_type_para='clusters',
                                  embed_para='X_umap',
                                  save_path='cell_type_u_s.csv')
    loss_df, cellDancer_df = cd.velocity(input_data,
                                         gene_list=np.array(input_data['gene_name']),
                                         permutation_ratio=0.125,
                                         n_jobs=1)
    # Compute cell velocity from the per-gene estimates.
    cellDancer_df = cd.compute_cell_velocity(cellDancer_df=cellDancer_df,
                                             projection_neighbor_choice='gene',
                                             expression_scale='power10',
                                             projection_neighbor_size=10,
                                             speed_up=(100,100))
    adata = export_velocity_to_dynamo(cellDancer_df, adata)
    end = time.time()
    # ---- End of timed section ----

    # Store the exported velocities as a dense array under the layer name
    # that the scVelo calls below are told to use (vkey='velocity').
    adata.layers['velocity'] = np.array(adata.layers['velocity_S'].todense())
    del adata.layers['velocity_S']
    scv.pp.neighbors(adata)
    scv.tl.velocity_graph(adata, vkey = 'velocity')
    scv.tl.velocity_embedding(adata, vkey = 'velocity')

    # Save the UMAP stream plot once. The original cell drew and saved this
    # figure twice to the same path and never closed either figure, so open
    # figures accumulated across datasets.
    os.makedirs(umap_dir, exist_ok=True)
    fig, ax = plt.subplots(1, 1, figsize = (8, 6))
    scv.pl.velocity_embedding_stream(adata, basis='umap', save = False, vkey='velocity',
                                     show = False, ax = ax)
    fig.savefig(umap_dir + dataset + '_UMAP_' + method + '.svg')
    plt.close(fig)

    # Calculate performance metrics.
    # SECURITY: pickle.load executes arbitrary code from the file; only use it
    # on ground-truth files from a trusted source.
    with open(dataset_prefix + '_groundTruth.pickle', 'rb') as file:
        ground_truth = pickle.load(file)
    metrics = utv.evaluate(adata, ground_truth, 'clusters', 'velocity')
    cbdc = metrics['Cross-Boundary Direction Correctness (A->B)']

    # Reuse an existing score table so that rows of other methods are kept;
    # otherwise start a new one with a single row for this method.
    if exists(scores_path):
        tab = pd.read_csv(scores_path, index_col = 0)
    else:
        tab = pd.DataFrame(columns = list(cbdc.keys()) + ['Mean', 'Time'],
                           index = [method])
    # One mean score per entry of the metric, then the overall mean and the
    # wall-clock seconds of the timed section.
    cb_score = [np.mean(scores) for scores in cbdc.values()]
    tab.loc[method,:] = cb_score + [np.mean(cb_score), end - start]
    tab.to_csv(scores_path)

    # NOTE(review): the file name is built without separators (e.g.
    # 'CellDancerMouseErythroidAnnDataForCellRank.h5ad'); kept unchanged
    # because other code may read exactly this path.
    adata.write_h5ad('/nfs/team283/aa16/data/fate_benchmarking/' + method + dataset + 'AnnDataForCellRank.h5ad')

MouseErythroid
Filtered out 47456 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 3000 highly variable genes.
Logarithmized X.
computing neighbors
