In [None]:
import matplotlib
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import scvelo as scv
import scipy
import json
import os

# Load the project-level path configuration; only 'top_dir' is used here.
with open('../../configs/config.json') as f:
    input_paths = json.load(f)
top_dir = input_paths['top_dir']

# os.path.join (rather than os.path.sep.join) is the standard way to build
# paths and does not double the separator when top_dir already ends with one.
# kbdir: directory holding the spliced/unspliced matrices, gene lists and barcodes.
kbdir = os.path.join(top_dir, "results", "kb", "mouse_pancreas", "kb_out", "counts_unfiltered")
# e2n_path: text file used to translate gene IDs into gene names.
e2n_path = os.path.join(top_dir, "refs", "refdata-cellranger-mm10-2.1.0", "geneid_to_name.txt")

# Output directory for the .h5ad file written by this notebook.
os.makedirs("anndata", exist_ok=True)

## Read in row names and column names of both the spliced matrix and the unspliced matrix

In [None]:
def _read_lines(path):
    """Return the lines of the text file at `path`, trailing whitespace stripped.

    The file is opened in a context manager so the handle is closed
    deterministically instead of being left to the garbage collector.
    """
    with open(path) as handle:
        return [line.rstrip() for line in handle]


def _to_gene_names(gene_ids, id_to_name):
    """Translate gene IDs to gene names via `id_to_name`.

    Only the part of each ID before the first '.' is used for the lookup.
    Raises KeyError if an ID is missing from the mapping.
    """
    return [id_to_name[gene_id.split(".")[0]] for gene_id in gene_ids]


# Gene ID -> gene name lookup. Every non-blank line must split into exactly two
# whitespace-separated fields; blank lines are skipped rather than crashing dict().
e2n = dict(line.split() for line in _read_lines(e2n_path) if line)

# Column (gene) labels for each matrix, converted from IDs to names.
spliced_var_names = _to_gene_names(_read_lines(os.path.join(kbdir, "spliced.genes.txt")), e2n)
unspliced_var_names = _to_gene_names(_read_lines(os.path.join(kbdir, "unspliced.genes.txt")), e2n)

# Row (barcode) labels for each matrix, used as-is.
spliced_obs_names = _read_lines(os.path.join(kbdir, "spliced.barcodes.txt"))
unspliced_obs_names = _read_lines(os.path.join(kbdir, "unspliced.barcodes.txt"))

## Read in count matrices and intersect with example dataset

In [None]:
def _load_matched_counts(mtx_name, gene_names, barcodes, reference):
    """Read one count matrix from `kbdir`, label it, and subset it to `reference`.

    Parameters
    ----------
    mtx_name : file name of the .mtx file inside `kbdir`.
    gene_names : labels assigned to `var_names` (made unique afterwards).
    barcodes : labels assigned to `obs_names`.
    reference : AnnData whose `obs_names` / `var_names` select the rows and
        columns to keep.

    Returns
    -------
    The labelled matrix indexed by the reference's observation and variable names.
    """
    counts = sc.read_mtx(os.path.sep.join([kbdir, mtx_name]))
    counts.var_names = gene_names
    counts.obs_names = barcodes
    # Duplicate gene names must be disambiguated before indexing by name.
    counts.var_names_make_unique()
    return counts[reference.obs_names, reference.var_names]


# The scVelo pancreas example dataset defines which cells and genes are kept.
example_adata = scv.datasets.pancreas()

spliced = _load_matched_counts(
    "spliced.mtx", spliced_var_names, spliced_obs_names, example_adata
)
unspliced = _load_matched_counts(
    "unspliced.mtx", unspliced_var_names, unspliced_obs_names, example_adata
)

## Define adata and write it

In [None]:
# Build the combined object from the spliced matrix.
# The original cell assigned to `adata` but then used an undefined name
# `subset_adata`, which raised NameError on a fresh kernel. `subset_adata` is
# now defined here. `.copy()` gives an independent AnnData, so `spliced` is not
# mutated and the cell can be re-run safely.
subset_adata = spliced.copy()
adata = subset_adata  # keep the original `adata` name bound to the same object

# Store both count matrices as layers; X is the spliced counts.
subset_adata.layers["spliced"] = subset_adata.X
subset_adata.layers["unspliced"] = unspliced.X

# Carry over the example dataset's cell annotations and UMAP coordinates.
# Copy obs so the two objects do not share one DataFrame.
subset_adata.obs = example_adata.obs.copy()
subset_adata.obsm['X_umap'] = example_adata.obsm['X_umap']

# Persist for the scVelo run further down.
subset_adata.write('anndata/pancreas_kb_trimmed.h5ad', compression='gzip')


## Get some statistics

In [None]:
# Pull the two count layers out of the combined object.
# Note: this rebinds `spliced` and `unspliced` from AnnData objects to the
# layer matrices themselves.
spliced, unspliced = (
    subset_adata.layers[layer_key] for layer_key in ('spliced', 'unspliced')
)

In [None]:
# Fraction of all counts (spliced + unspliced) that are spliced.
spliced_total = spliced.sum().sum()
unspliced_total = unspliced.sum().sum()
spliced_total / (spliced_total + unspliced_total)

In [None]:
# Fraction of all counts (spliced + unspliced) that are unspliced.
spliced_total = spliced.sum().sum()
unspliced_total = unspliced.sum().sum()
unspliced_total / (spliced_total + unspliced_total)

## Run scVelo

In [None]:
# Reload the object written to disk earlier in this notebook, so the scVelo run
# starts from the saved file.
trimmed_h5ad = "anndata/pancreas_kb_trimmed.h5ad"
adata = scv.read(trimmed_h5ad)


In [None]:
# --- Embedding prerequisites (scanpy defaults) ---
# NOTE(review): these run before filter_and_normalize below, i.e. on the data
# as loaded from disk. Confirm that computing PCA / neighbors / t-SNE before
# normalisation is intended.
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.tsne(adata)

# --- Plot housekeeping ---
# Switch to the Agg backend and apply the scVelo figure style; every figure
# below is written to disk via `save=`.
matplotlib.use('AGG')
scv.settings.set_figure_params('scvelo')

# --- Spliced / unspliced proportions ---
scv.utils.show_proportions(adata)

# --- Filter cells and genes, then normalise expression values ---
filter_opts = dict(min_shared_counts=20, n_top_genes=2000, enforce=True)
scv.pp.filter_and_normalize(adata, **filter_opts)

# --- scVelo pipeline (dynamical mode) ---
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.tl.recover_dynamics(adata, n_jobs=11)
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)

# Stream plots on both embeddings, UMAP first and then t-SNE.
stream_targets = (
    ("umap", "umap_pancreas_kb.pdf"),
    ("tsne", "tsne_pancreas_kb.pdf"),
)
for basis, outfile in stream_targets:
    scv.pl.velocity_embedding_stream(adata, basis=basis, save=outfile)

# Latent time, then a scatter plot coloured by it.
scv.tl.latent_time(adata)
scv.pl.scatter(
    adata,
    color='latent_time',
    color_map='gnuplot',
    size=80,
    save="latent_time_pancreas_kb.png",
)
