In [1]:
import scvi
def robust_optimization(mod, save_dir, max_epochs = 1000, lr = 0.01, **model_kwargs):
    """Train the model three times with different seeds and return the best run.

    The first run trains the model passed in as ``mod``. The second and third
    runs build a fresh ``c2f.Cell2fate_DynamicalModel`` on the same AnnData
    object (same ``n_modules``, plus ``model_kwargs``) and train it under a
    different global scvi seed. Every run is saved to disk. The run with the
    lowest mean ``elbo_train`` over its last 50 history entries is returned.

    This function uses the notebook-level names ``scvi``, ``c2f``, ``np`` and
    ``sc``; they must be imported before it is *called*.

    Parameters
    ----------
    mod
        An already constructed model. It must expose ``module.model.n_modules``,
        ``adata``, ``train``, ``history`` and ``save``.
    save_dir
        Path prefix for the saved models and AnnData files. File names are
        appended by plain string concatenation, so the prefix should end with
        a path separator.
    max_epochs
        Passed to ``train`` for every run.
    lr
        Passed to ``train`` for every run.
    **model_kwargs
        Extra keyword arguments for the models built in the second and third
        run. The first model is used exactly as passed in.

    Returns
    -------
    The model of the best run. If the third run wins, the in-memory model is
    returned; otherwise the winning model is reloaded from ``save_dir``.
    """
    # (global seed, suffix of the saved model name, progress message) per run.
    runs = ((9, '', 'First optimization run.'),
            (99, '1', 'Second optimization run.'),
            (999, '2', 'Third optimization run.'))
    last_run = len(runs) - 1

    n_modules = mod.module.model.n_modules
    adata = mod.adata
    model_cls = c2f.Cell2fate_DynamicalModel

    model = mod
    del mod
    final_losses = []
    for run_idx, (seed, suffix, message) in enumerate(runs):
        print(message)
        if run_idx > 0:
            # Later runs start from a freshly initialised model.
            model_cls.setup_anndata(adata, spliced_label='spliced', unspliced_label='unspliced')
            model = model_cls(adata, n_modules = n_modules,
                              **model_kwargs)
        # The seed is set after construction and right before training,
        # matching the order of the original three hand-written runs.
        scvi.settings.seed = seed
        model.train(use_gpu=True, max_epochs = max_epochs, lr = lr)
        final_losses.append(np.mean(np.array(model.history['elbo_train'][-50:])))
        model.save(save_dir + 'c2f_model' + suffix, overwrite=True)
        model.adata.write(save_dir + 'c2f_model' + suffix + '_anndata.h5ad')
        if run_idx < last_run:
            # Drop our reference; only the last model is kept in memory.
            model = None

    # np.argmin returns the position of a NaN entry, which would select a
    # diverged run. Map NaN to +inf so a finite run always wins; if every run
    # is NaN the first run is still returned, as before.
    losses = np.asarray(final_losses, dtype=float)
    losses = np.where(np.isnan(losses), np.inf, losses)
    best = int(np.argmin(losses))

    if best == last_run:
        return model
    del model
    best_suffix = runs[best][1]
    adata = sc.read_h5ad(save_dir + 'c2f_model' + best_suffix + '_anndata.h5ad')
    return model_cls.load(save_dir + 'c2f_model' + best_suffix, adata)

Global seed set to 0


In [None]:
import os
os.chdir('..')
os.chdir('..')
import scvelo as scv
import scanpy as sc
import cell2fate as c2f
import pickle as pickle
from eval_utils import cross_boundary_correctness
from datetime import datetime
import pandas as pd
import numpy as np
from os.path import exists
import matplotlib.pyplot as plt
import torch
import unitvelo as utv
# Benchmark configuration: every combination of gene count, minimum shared
# counts and dataset is trained once and scored with the mean Cross-Boundary
# Direction Correctness (CBDC) reported by UniTVelo.
method = 'Cell2fateDynamicalModel_VeryRobustOptimization2'
data_dir = '/nfs/team283/aa16/data/fate_benchmarking/benchmarking_datasets/'
save_dir = '/nfs/team283/aa16/data/fate_benchmarking/benchmarking_results/'
datasets = ['Pancreas_with_cc',  'DentateGyrus' , 'MouseBoneMarrow', 'MouseErythroid', 'HumanBoneMarrow']
n_genes_list = np.array((2000, 3000))
n_counts_list = np.array((10, 20))

# save_name ends with '_' and the suffix starts with '_', so the file name
# contains a double underscore. Kept as is so existing result files are found.
save_name = method + '_'
results_path = save_dir + save_name + '_CBDC_fullBenchmark.csv'

for i, n_genes in enumerate(n_genes_list):
    for j, min_counts in enumerate(n_counts_list):
        for k, dataset in enumerate(datasets):
            print(i)
            print(j)
            print(k)
            model_index = str(i) + '-' + str(j) + '-' + str(k)
            # Skip configurations already present in the results table so an
            # interrupted benchmark can be resumed.
            if exists(results_path):
                tab = pd.read_csv(results_path, index_col = 0)
                if model_index in tab.index:
                    continue
            adata = sc.read_h5ad(data_dir + dataset + '/' + dataset + '_anndata.h5ad')
            adata = c2f.utils.get_training_data(adata, cells_per_cluster = 10**6, cluster_column = 'clusters',
                                            remove_clusters = [], min_shared_counts = min_counts, n_var_genes= n_genes)
            c2f.Cell2fate_DynamicalModel.setup_anndata(adata, spliced_label='spliced', unspliced_label='unspliced')
            n_modules = c2f.utils.get_max_modules(adata)
            mod = c2f.Cell2fate_DynamicalModel(adata,
                                               n_modules = n_modules)
            # NOTE(review): every dataset saves its intermediate models under
            # this one directory, overwriting the previous ones - confirm this
            # is intended.
            mod = robust_optimization(mod, save_dir = '/nfs/team283/aa16/data/cell2fate/ErythroidMaturation/')
            sample_kwarg = {"num_samples": 10, "batch_size" : adata.n_obs,
                 "use_gpu" : True, 'return_samples': False}
            adata = mod.export_posterior(adata, sample_kwargs = sample_kwarg)
            mod.compute_and_plot_total_velocity_scvelo(adata, save = False, delete = False)
            # Calculate performance metrics:
            # pickle.load can execute arbitrary code; only use it on trusted
            # ground-truth files. The context manager closes the file handle.
            with open(data_dir + dataset + '/' + dataset + '_groundTruth.pickle' ,'rb') as file:
                ground_truth = pickle.load(file)
            metrics = utv.evaluate(adata, ground_truth, 'clusters', 'velocity')
            cb_score = [np.mean(metrics['Cross-Boundary Direction Correctness (A->B)'][x])
                        for x in metrics['Cross-Boundary Direction Correctness (A->B)'].keys()]
            # Re-read the table right before writing so the newest rows are kept.
            if exists(results_path):
                tab = pd.read_csv(results_path, index_col = 0)
            else:
                c_names = ['CBDC']
                tab = pd.DataFrame(columns = c_names)
            tab.loc[model_index, 'CBDC'] = np.mean(cb_score)
            tab.to_csv(results_path)
tab = pd.read_csv(results_path, index_col = 0)
# Leave out an 'AVERAGE' row written by an earlier run, so a stale average is
# not counted as if it were a benchmark result.
tab.loc['AVERAGE', 'CBDC'] = np.mean(tab.drop(index='AVERAGE', errors='ignore')['CBDC'])
tab.to_csv(results_path)

(Running UniTVelo 0.2.5)
2023-10-31 16:25:15
0
0
0
Keeping at most 1000000 cells per cluster
Filtered out 19641 genes that are detected 10 counts (shared).
Extracted 2000 highly variable genes.
Leiden clustering ...


Global seed set to 9
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Number of Leiden Clusters: 13
Maximal Number of Modules: 14
First optimization run.
Epoch 1000/1000: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [17:36<00:00,  1.06s/it, v_num=1, elbo_train=6.87e+6]


Global seed set to 99
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Second optimization run.
Epoch 1000/1000: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [17:30<00:00,  1.05s/it, v_num=1, elbo_train=6.85e+6]


Global seed set to 999
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Third optimization run.
Epoch 1000/1000: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [17:32<00:00,  1.05s/it, v_num=1, elbo_train=6.83e+6]
Sampling local variables, batch: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.60s/it]
Sampling global variables, sample: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:02<00:00,  3.63it/s]
Computing total RNAvelocity ...


  0%|          | 0/3696 [00:00<?, ?cells/s]

# Cross-Boundary Direction Correctness (A->B)
{('Ngn3 high EP', 'Pre-endocrine'): -0.7452714060630122, ('Pre-endocrine', 'Alpha'): -0.3694149021101703, ('Pre-endocrine', 'Beta'): -0.8209475883750265, ('Pre-endocrine', 'Delta'): -0.5904017616914687, ('Pre-endocrine', 'Epsilon'): -0.4991425937808836}
Total Mean: -0.6050356504041123
# In-cluster Coherence
{'Alpha': 0.831604, 'Beta': 0.7181901, 'Delta': 0.6307401, 'Ductal': 0.7922202, 'Epsilon': 0.7515854, 'Ngn3 high EP': 0.78765243, 'Ngn3 low EP': 0.7949522, 'Pre-endocrine': 0.8712419}
Total Mean: 0.7722733020782471
0
0
1
Keeping at most 1000000 cells per cluster
Filtered out 9174 genes that are detected 10 counts (shared).
Extracted 2000 highly variable genes.
Leiden clustering ...
         Falling back to preprocessing with `sc.pp.pca` and default params.


Global seed set to 9
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Number of Leiden Clusters: 13
Maximal Number of Modules: 14
First optimization run.
Epoch 692/1000:  69%|███████████████████████████████████████████████████████████████████████▊                                | 691/1000 [09:42<04:17,  1.20it/s, v_num=1, elbo_train=3.83e+6]