In [1]:
import scvelo as scv
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import io
from scipy.sparse import coo_matrix, csr_matrix
import torch
from velovi import preprocess_data, VELOVI

## AnnData object construction

In [7]:
# Build the AnnData object from exported files: a Matrix Market count matrix,
# a per-cell metadata table, a list of gene names and a PCA table.
# NOTE(review): absolute, user-specific working directory. Every relative path
# in this notebook (the inputs below and any file written later) resolves
# against it -- consider moving it to a configurable constant.
os.chdir('/home/eegorov/scripts/')

# The matrix is transposed before it becomes adata.X, so the file is
# presumably stored genes x cells -- confirm against the export script.
X = io.mmread("counts.mtx")
adata = anndata.AnnData(X=X.transpose().tocsr())

# Per-cell metadata, indexed by the 'Barcode' column (the column itself is kept).
cell_meta = pd.read_csv("metadata.csv")
adata.obs = cell_meta.set_index('Barcode', drop=False)

# One gene name per line, in the same order as the columns of adata.X.
with open("gene_names.csv", 'r') as f:
    gene_names = f.read().splitlines()
adata.var.index = gene_names

# Precomputed embeddings. Assigning the obs names to the PCA table fails fast
# if the number of rows differs from the number of cells; rows are assumed to
# be in the same order as the metadata -- TODO confirm.
pca = pd.read_csv("pca.csv")
pca.index = adata.obs.index
adata.obsm['X_pca'] = pca.to_numpy()

# UMAP coordinates are taken from the metadata columns, giving shape (n_cells, 2).
adata.obsm['X_umap'] = adata.obs[['UMAP_1', 'UMAP_2']].to_numpy()

# Load the loom file and merge it into adata.
patient_5 = scv.read('/home/eegorov/kasatskaya/DL004.loom')

# Rewrite the loom obs names: keep the text after the first ':' without its
# last character, then append the sample suffix.
# NOTE(review): presumably the loom names look like "<sample>:<barcode>x" and
# the suffix matches the 'Barcode' values in the metadata -- confirm.
barcodes = [
    bc.split(':')[1][:-1] + '-1-kasatskaya_D05_rep1'
    for bc in patient_5.obs.index.tolist()
]
patient_5.obs.index = barcodes
patient_5.var_names_make_unique()

# Remember the original obs names without densifying the count matrix.
index_to_save = adata.obs_names.copy()

# clean_obs_names rewrites obs names in place on both objects; keep a lookup
# from each cleaned name back to the original one.
scv.utils.clean_obs_names(adata)
cleaned_to_original = dict(zip(adata.obs_names, index_to_save))
scv.utils.clean_obs_names(patient_5)

adata = scv.utils.merge(adata, patient_5)

# Restore the original names through the lookup instead of assigning the saved
# index positionally: a positional assignment breaks (length mismatch) or
# mislabels cells whenever the merge drops or reorders cells.
adata.obs_names = pd.Index(
    [cleaned_to_original.get(name, name) for name in adata.obs_names],
    name=index_to_save.name,
)

n_dropped = len(index_to_save) - adata.n_obs
if n_dropped:
    print(f"{n_dropped} cells were not kept by the merge with the loom file")


## VeloVI usage and spliced-aware dataset construction

In [8]:
# Preprocessing and gene selection for velocity estimation.
# NOTE: this cell rebinds `adata`; re-running it without re-running the
# construction cell preprocesses already-preprocessed data.
scv.pp.filter_and_normalize(adata, min_shared_counts=1, n_top_genes=2000)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
adata = preprocess_data(adata)

# Seed the torch and NumPy global RNGs before the stochastic steps below
# (model training and posterior sampling).
# NOTE(review): scvi-tools may use its own seed setting for the train/validation
# split -- confirm whether that needs to be fixed separately.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Training the model to estimate the velocity for the selected genes
VELOVI.setup_anndata(adata, spliced_layer="Ms", unspliced_layer="Mu")
vae = VELOVI(adata)
vae.train()

# Adding velocity to the AnnData object
latent_time = vae.get_latent_time(n_samples=25)
velocities = vae.get_velocity(n_samples=25, velo_statistic="mean")

# Velocities are divided by 20 / max(latent_time) (max taken along axis 0)
# before being stored as the "velocity" layer.
# NOTE(review): the constant 20 is not explained in this notebook -- confirm its origin.
scaling = 20 / latent_time.max(0)
adata.layers["velocity"] = velocities / scaling

# Velocity * expression metric for each gene flagged in adata.var['velocity_genes']
genes_filtered_scvelo = adata.var.index[adata.var['velocity_genes'].astype(bool)].tolist()
velocity = adata.to_df(layer='velocity')[genes_filtered_scvelo]
expression = adata.to_df()[genes_filtered_scvelo]

# Element-wise product (cells x genes); missing products are treated as 0.
multiplicate = (velocity * expression.values).fillna(0)

# Constructing the spliced-aware "count" matrix.
# Each gene yields two non-negative columns:
#   <gene>_unspliced holds the product where it is positive, 0 elsewhere;
#   <gene>_spliced   holds the negated product where it is negative, 0 elsewhere.
# NOTE(review): the sign-to-suffix assignment is kept exactly as in the original
# loop -- confirm that it matches the intended interpretation.
# Building both halves with `where` keeps a float dtype throughout, instead of
# writing floats into integer-initialised columns one gene at a time.
spliced_genes = [gene + '_spliced' for gene in multiplicate.columns]
unspliced_genes = [gene + '_unspliced' for gene in multiplicate.columns]

df_spliced = (-multiplicate).where(multiplicate < 0, 0.0)
df_spliced.columns = spliced_genes
df_unspliced = multiplicate.where(multiplicate > 0, 0.0)
df_unspliced.columns = unspliced_genes

# Column order: all *_spliced columns first, then all *_unspliced columns.
summing_up = pd.concat([df_spliced, df_unspliced], axis=1)

# Saving (relative path: written to the current working directory)
summing_up.to_csv('spliced_aware_count_matrix.csv', index=True)

Filtered out 11235 genes that are detected 1 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 2000 highly variable genes.
computing neighbors
    finished (0:00:01) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:01) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
computing velocities
    finished (0:00:00) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 227/500:  45%|████▌     | 227/500 [03:15<03:54,  1.16it/s, v_num=1, train_loss_step=-418, train_loss_epoch=-2.04e+3]    
Monitored metric elbo_validation did not improve in the last 45 records. Best score: -1928.422. Signaling Trainer to stop.
