In [None]:
import matplotlib
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import scvelo as scv
import scipy
import json
import os
# Reference pancreas dataset shipped with scVelo. The cells below use its
# obs/var names to subset the STARsolo matrices and copy its annotations/UMAP.
example_adata = scv.datasets.pancreas()


# Project configuration (parsed JSON).
with open('../../configs/config.json') as f:
    input_paths = json.load(f)

# FIXME(review): `top_dir` is not assigned in any visible cell, so this line
# raises NameError on a fresh kernel. Presumably it should be read from
# `input_paths` -- confirm the key name against configs/config.json.
stardir = os.path.join(top_dir, "results", "star_solo", "mouse_pancreas", "star_solo", "Solo.out", "Velocyto", "filtered")
os.makedirs("anndata", exist_ok=True)

# Feature names are the second whitespace-separated field of each features.tsv
# line; barcodes are the stripped lines of barcodes.tsv. The files are read
# inside `with` blocks so the handles are closed deterministically.
with open(os.path.join(stardir, "features.tsv")) as features_file:
    var_names = [line.rstrip().split()[1] for line in features_file]
with open(os.path.join(stardir, "barcodes.tsv")) as barcodes_file:
    obs_names = [line.rstrip() for line in barcodes_file]

## Read in the count matrices

In [None]:
def read_velocyto_counts(name):
    """Load one Velocyto count matrix from `stardir` as a dense DataFrame.

    Parameters
    ----------
    name : str
        Base name of the Matrix Market file inside `stardir`
        (e.g. "spliced" for "spliced.mtx").

    Returns
    -------
    pandas.DataFrame
        Dense table indexed by `example_adata.obs_names` with columns
        `example_adata.var_names`.

    Notes
    -----
    Relies on the notebook-level names `stardir`, `var_names`, `obs_names`
    and `example_adata` defined in the setup cell.
    """
    # The .mtx file is transposed after reading so that rows can be labelled
    # with the barcodes and columns with the feature names.
    counts = sc.read_mtx(os.path.join(stardir, name + ".mtx")).T
    counts.var_names = var_names
    counts.obs_names = obs_names
    # Duplicate feature names would make the label-based subsetting ambiguous.
    counts.var_names_make_unique()
    # Keep only the cells and genes present in the scVelo reference dataset,
    # in the reference order.
    counts = counts[example_adata.obs_names, example_adata.var_names]
    return pd.DataFrame.sparse.from_spmatrix(
        counts.X, columns=counts.var_names, index=counts.obs_names
    ).sparse.to_dense()


# One dense table per Velocyto category; each intermediate AnnData is released
# as soon as its table has been built.
spliced = read_velocyto_counts("spliced")
unspliced = read_velocyto_counts("unspliced")
ambiguous = read_velocyto_counts("ambiguous")



In [None]:
# Share of all counts (spliced + unspliced + ambiguous) that are spliced.
spliced_total = spliced.sum().sum()
spliced_total / (spliced_total + unspliced.sum().sum() + ambiguous.sum().sum())

In [None]:
# Share of all counts (spliced + unspliced + ambiguous) that are unspliced.
unspliced_total = unspliced.sum().sum()
unspliced_total / (spliced.sum().sum() + unspliced_total + ambiguous.sum().sum())

In [None]:
# Share of all counts (spliced + unspliced + ambiguous) that are ambiguous.
ambiguous_total = ambiguous.sum().sum()
ambiguous_total / (spliced.sum().sum() + unspliced.sum().sum() + ambiguous_total)

## A discard

In [None]:
# "A discard": ambiguous counts are dropped; the object holds only the
# spliced matrix (also used as X) and the unspliced matrix.
adata = anndata.AnnData(
    X=spliced,
    layers={"spliced": spliced, "unspliced": unspliced},
)
adata.var_names_make_unique()

# Copy cell annotations and UMAP coordinates from the scVelo reference dataset.
adata.obs = example_adata.obs
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

adata.write('anndata/pancreas_star_trimmed_A_discard.h5ad', compression='gzip')
scv.utils.show_proportions(adata)

# Release the object before the next scenario is built.
del adata

## A to S

In [None]:
# "A to S": every ambiguous count is added to the spliced matrix.
spliced_with_ambiguous = spliced + ambiguous

adata = anndata.AnnData(
    X=spliced_with_ambiguous,
    layers={"spliced": spliced_with_ambiguous, "unspliced": unspliced},
)
adata.var_names_make_unique()

# Copy cell annotations and UMAP coordinates from the scVelo reference dataset.
adata.obs = example_adata.obs
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

adata.write('anndata/pancreas_star_trimmed_A_S.h5ad', compression='gzip')
scv.utils.show_proportions(adata)

# Release the dense intermediate and the object before the next scenario.
del spliced_with_ambiguous, adata


## A to U

In [None]:
# "A to U": every ambiguous count is added to the unspliced matrix.
unspliced_with_ambiguous = unspliced + ambiguous

adata = anndata.AnnData(
    X=spliced,
    layers={"spliced": spliced, "unspliced": unspliced_with_ambiguous},
)

# Copy cell annotations and UMAP coordinates from the scVelo reference dataset.
adata.obs = example_adata.obs
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

adata.write('anndata/pancreas_star_trimmed_A_U.h5ad', compression='gzip')
scv.utils.show_proportions(adata)

# Release the dense intermediate and the object before the next scenario.
del unspliced_with_ambiguous, adata


## A to S:U

In [None]:
# "A to S:U": each ambiguous count is split between spliced and unspliced
# according to that entry's own spliced / (spliced + unspliced) ratio.
s_ratio = spliced/(spliced+unspliced)
# Entries with neither spliced nor unspliced counts give 0/0 = NaN; those are
# split evenly instead.
s_ratio = s_ratio.fillna(0.5)
new_spliced = spliced + s_ratio * ambiguous
new_unspliced = unspliced + (1-s_ratio)* ambiguous

adata = anndata.AnnData(X = new_spliced,
                        layers = dict(spliced = new_spliced,
                                    unspliced = new_unspliced))
adata.obs = example_adata.obs
# Store the reference UMAP in the saved file, as every other preparation cell
# does; this cell previously wrote the .h5ad without it.
adata.obsm['X_umap'] = example_adata.obsm['X_umap']
adata.write('anndata/pancreas_star_trimmed_A_S2U.h5ad', compression='gzip')
scv.utils.show_proportions(adata)

# Release the dense intermediates before the next scenario is built.
del(s_ratio, new_spliced, new_unspliced, adata)


## A to S+A:U 

In [None]:
# "A to S+A:U": each ambiguous count is split using the ratio
# (spliced + ambiguous) / (spliced + ambiguous + unspliced) of that entry.
spliced_and_ambiguous = spliced + ambiguous
# Entries where all three counts are zero give NaN and fall back to 0.5.
spliced_share = spliced_and_ambiguous.div(spliced_and_ambiguous + unspliced).fillna(0.5)

spliced_adjusted = spliced + spliced_share * ambiguous
unspliced_adjusted = unspliced + (1 - spliced_share) * ambiguous

adata = anndata.AnnData(
    X=spliced_adjusted,
    layers={"spliced": spliced_adjusted, "unspliced": unspliced_adjusted},
)

# Copy cell annotations and UMAP coordinates from the scVelo reference dataset.
adata.obs = example_adata.obs
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

adata.write('anndata/pancreas_star_trimmed_A_S+A2U.h5ad', compression='gzip')
scv.utils.show_proportions(adata)

# Release the dense intermediates before the next scenario is built.
del spliced_and_ambiguous, spliced_share, spliced_adjusted, unspliced_adjusted, adata


## A to S:U+A 

In [None]:
# "A to S:U+A": each ambiguous count is split using the ratio
# spliced / (spliced + ambiguous + unspliced) of that entry.
all_counts = spliced + ambiguous + unspliced
# Entries where all three counts are zero give NaN and fall back to 0.5.
spliced_share = spliced.div(all_counts).fillna(0.5)

spliced_adjusted = spliced + spliced_share * ambiguous
unspliced_adjusted = unspliced + (1 - spliced_share) * ambiguous

adata = anndata.AnnData(
    X=spliced_adjusted,
    layers={"spliced": spliced_adjusted, "unspliced": unspliced_adjusted},
)

# Copy cell annotations and UMAP coordinates from the scVelo reference dataset.
adata.obs = example_adata.obs
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

adata.write('anndata/pancreas_star_trimmed_A_S2U+A.h5ad', compression='gzip')
scv.utils.show_proportions(adata)

# Release the dense intermediates before the next scenario is built.
del all_counts, spliced_share, spliced_adjusted, unspliced_adjusted, adata


## A uniform

In [None]:
# "A uniform": every ambiguous count is split evenly, half to spliced and
# half to unspliced.
spliced_share = 0.5
spliced_adjusted = spliced + spliced_share * ambiguous
unspliced_adjusted = unspliced + (1 - spliced_share) * ambiguous

adata = anndata.AnnData(
    X=spliced_adjusted,
    layers={"spliced": spliced_adjusted, "unspliced": unspliced_adjusted},
)
adata.var_names_make_unique()

# Copy cell annotations and UMAP coordinates from the scVelo reference dataset.
adata.obs = example_adata.obs
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

adata.write('anndata/pancreas_star_trimmed_A_unif.h5ad', compression='gzip')
scv.utils.show_proportions(adata)

# Release the dense intermediates now that the file has been written.
del spliced_share, spliced_adjusted, unspliced_adjusted, adata


# The following code was run on a PC

In [None]:
import matplotlib
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import scvelo as scv
import scipy
import json
import os
# Reference pancreas dataset shipped with scVelo; the analysis cells below copy
# its UMAP coordinates into each loaded object.
example_adata = scv.datasets.pancreas()


In [None]:
# Sanity check: reload the "A discard" file and display its cell names.
discard_h5ad = "anndata/pancreas_star_trimmed_A_discard.h5ad"
adata = scv.read(discard_h5ad)
adata.obs_names

## discard A


In [None]:
# Reload the counts written by the "A discard" preparation cell.
adata = scv.read("anndata/pancreas_star_trimmed_A_discard.h5ad")

# Embeddings: PCA, neighbour graph and t-SNE are computed here; the UMAP is
# copied from the scVelo reference dataset rather than recomputed.
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.tsne(adata)
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

# Switch matplotlib to the Agg backend and apply scVelo's figure style.
matplotlib.use('AGG')
scv.settings.set_figure_params('scvelo')

# Report the spliced/unspliced proportions before any filtering.
scv.utils.show_proportions(adata)

# Gene filtering and normalisation.
scv.pp.filter_and_normalize(
    adata,
    min_shared_counts=20,
    n_top_genes=2000,
    enforce=True,
)

# scVelo pipeline with the dynamical model.
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.tl.recover_dynamics(adata, n_jobs=11)
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)

# Velocity stream plots on both embeddings, saved to file.
for basis, figure_name in (
    ('umap', "umap_pancreas_star_A_discard.png"),
    ('tsne', "tsne_pancreas_star_A_discard.png"),
):
    scv.pl.velocity_embedding_stream(adata, basis=basis, save=figure_name)

# Latent time, plotted as a scatter coloured by its value.
scv.tl.latent_time(adata)
scv.pl.scatter(
    adata,
    color='latent_time',
    color_map='gnuplot',
    size=80,
    save="latent_time_pancreas_star_A_discard.png",
)


## A to S

In [None]:
# Reload the counts written by the "A to S" preparation cell.
adata = scv.read("anndata/pancreas_star_trimmed_A_S.h5ad")

# Embeddings: PCA, neighbour graph and t-SNE are computed here; the UMAP is
# copied from the scVelo reference dataset rather than recomputed.
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.tsne(adata)
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

# Switch matplotlib to the Agg backend and apply scVelo's figure style.
matplotlib.use('AGG')
scv.settings.set_figure_params('scvelo')

# Report the spliced/unspliced proportions before any filtering.
scv.utils.show_proportions(adata)

# Gene filtering and normalisation.
scv.pp.filter_and_normalize(
    adata,
    min_shared_counts=20,
    n_top_genes=2000,
    enforce=True,
)

# scVelo pipeline with the dynamical model.
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.tl.recover_dynamics(adata, n_jobs=11)
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)

# Velocity stream plots on both embeddings, saved to file.
for basis, figure_name in (
    ('umap', "umap_pancreas_star_A_S.png"),
    ('tsne', "tsne_pancreas_star_A_S.png"),
):
    scv.pl.velocity_embedding_stream(adata, basis=basis, save=figure_name)

# Latent time, plotted as a scatter coloured by its value.
scv.tl.latent_time(adata)
scv.pl.scatter(
    adata,
    color='latent_time',
    color_map='gnuplot',
    size=80,
    save="latent_time_pancreas_star_A_S.png",
)


## A to U

In [None]:
# Reload the counts written by the "A to U" preparation cell.
adata = scv.read("anndata/pancreas_star_trimmed_A_U.h5ad")

# Embeddings: PCA, neighbour graph and t-SNE are computed here; the UMAP is
# copied from the scVelo reference dataset rather than recomputed.
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.tsne(adata)
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

# Switch matplotlib to the Agg backend and apply scVelo's figure style.
matplotlib.use('AGG')
scv.settings.set_figure_params('scvelo')

# Report the spliced/unspliced proportions before any filtering.
scv.utils.show_proportions(adata)

# Gene filtering and normalisation.
scv.pp.filter_and_normalize(
    adata,
    min_shared_counts=20,
    n_top_genes=2000,
    enforce=True,
)

# scVelo pipeline with the dynamical model.
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.tl.recover_dynamics(adata, n_jobs=11)
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)

# Velocity stream plots on both embeddings, saved to file.
for basis, figure_name in (
    ('umap', "umap_pancreas_star_A_U.png"),
    ('tsne', "tsne_pancreas_star_A_U.png"),
):
    scv.pl.velocity_embedding_stream(adata, basis=basis, save=figure_name)

# Latent time, plotted as a scatter coloured by its value.
scv.tl.latent_time(adata)
scv.pl.scatter(
    adata,
    color='latent_time',
    color_map='gnuplot',
    size=80,
    save="latent_time_pancreas_star_A_U.png",
)


## A to S:U


In [None]:
# Reload the counts written by the "A to S:U" preparation cell.
adata = scv.read("anndata/pancreas_star_trimmed_A_S2U.h5ad")

# Embeddings: PCA, neighbour graph and t-SNE are computed here; the UMAP is
# copied from the scVelo reference dataset rather than recomputed.
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.tsne(adata)
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

# Switch matplotlib to the Agg backend and apply scVelo's figure style.
matplotlib.use('AGG')
scv.settings.set_figure_params('scvelo')

# Report the spliced/unspliced proportions before any filtering.
scv.utils.show_proportions(adata)

# Gene filtering and normalisation.
scv.pp.filter_and_normalize(
    adata,
    min_shared_counts=20,
    n_top_genes=2000,
    enforce=True,
)

# scVelo pipeline with the dynamical model.
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.tl.recover_dynamics(adata, n_jobs=11)
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)

# Velocity stream plots on both embeddings, saved to file.
for basis, figure_name in (
    ('umap', "umap_pancreas_star_A_S2U.png"),
    ('tsne', "tsne_pancreas_star_A_S2U.png"),
):
    scv.pl.velocity_embedding_stream(adata, basis=basis, save=figure_name)

# Latent time, plotted as a scatter coloured by its value.
scv.tl.latent_time(adata)
scv.pl.scatter(
    adata,
    color='latent_time',
    color_map='gnuplot',
    size=80,
    save="latent_time_pancreas_star_A_S2U.png",
)


## A to S+A:U


In [None]:
# Reload the counts written by the "A to S+A:U" preparation cell.
adata = scv.read("anndata/pancreas_star_trimmed_A_S+A2U.h5ad")

# Embeddings: PCA, neighbour graph and t-SNE are computed here; the UMAP is
# copied from the scVelo reference dataset rather than recomputed.
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.tsne(adata)
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

# Switch matplotlib to the Agg backend and apply scVelo's figure style.
matplotlib.use('AGG')
scv.settings.set_figure_params('scvelo')

# Report the spliced/unspliced proportions before any filtering.
scv.utils.show_proportions(adata)

# Gene filtering and normalisation.
scv.pp.filter_and_normalize(
    adata,
    min_shared_counts=20,
    n_top_genes=2000,
    enforce=True,
)

# scVelo pipeline with the dynamical model.
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.tl.recover_dynamics(adata, n_jobs=11)
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)

# Velocity stream plots on both embeddings, saved to file.
for basis, figure_name in (
    ('umap', "umap_pancreas_star_A_S+A2U.png"),
    ('tsne', "tsne_pancreas_star_A_S+A2U.png"),
):
    scv.pl.velocity_embedding_stream(adata, basis=basis, save=figure_name)

# Latent time, plotted as a scatter coloured by its value.
scv.tl.latent_time(adata)
scv.pl.scatter(
    adata,
    color='latent_time',
    color_map='gnuplot',
    size=80,
    save="latent_time_pancreas_star_A_S+A2U.png",
)


## A to S:U+A


In [None]:
# Reload the counts written by the "A to S:U+A" preparation cell.
adata = scv.read("anndata/pancreas_star_trimmed_A_S2U+A.h5ad")

# Embeddings: PCA, neighbour graph and t-SNE are computed here; the UMAP is
# copied from the scVelo reference dataset rather than recomputed.
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.tsne(adata)
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

# Switch matplotlib to the Agg backend and apply scVelo's figure style.
matplotlib.use('AGG')
scv.settings.set_figure_params('scvelo')

# Report the spliced/unspliced proportions before any filtering.
scv.utils.show_proportions(adata)

# Gene filtering and normalisation.
scv.pp.filter_and_normalize(
    adata,
    min_shared_counts=20,
    n_top_genes=2000,
    enforce=True,
)

# scVelo pipeline with the dynamical model.
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.tl.recover_dynamics(adata, n_jobs=11)
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)

# Velocity stream plots on both embeddings, saved to file.
for basis, figure_name in (
    ('umap', "umap_pancreas_star_A_S2U+A.png"),
    ('tsne', "tsne_pancreas_star_A_S2U+A.png"),
):
    scv.pl.velocity_embedding_stream(adata, basis=basis, save=figure_name)

# Latent time, plotted as a scatter coloured by its value.
scv.tl.latent_time(adata)
scv.pl.scatter(
    adata,
    color='latent_time',
    color_map='gnuplot',
    size=80,
    save="latent_time_pancreas_star_A_S2U+A.png",
)


## A to uniform


In [None]:
# Reload the counts written by the "A uniform" preparation cell.
adata = scv.read("anndata/pancreas_star_trimmed_A_unif.h5ad")

# Embeddings: PCA, neighbour graph and t-SNE are computed here; the UMAP is
# copied from the scVelo reference dataset rather than recomputed.
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.tsne(adata)
adata.obsm['X_umap'] = example_adata.obsm['X_umap']

# Switch matplotlib to the Agg backend and apply scVelo's figure style.
matplotlib.use('AGG')
scv.settings.set_figure_params('scvelo')

# Report the spliced/unspliced proportions before any filtering.
scv.utils.show_proportions(adata)

# Gene filtering and normalisation.
scv.pp.filter_and_normalize(
    adata,
    min_shared_counts=20,
    n_top_genes=2000,
    enforce=True,
)

# scVelo pipeline with the dynamical model.
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.tl.recover_dynamics(adata, n_jobs=11)
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)

# Velocity stream plots on both embeddings, saved to file.
for basis, figure_name in (
    ('umap', "umap_pancreas_star_A_unif.png"),
    ('tsne', "tsne_pancreas_star_A_unif.png"),
):
    scv.pl.velocity_embedding_stream(adata, basis=basis, save=figure_name)

# Latent time, plotted as a scatter coloured by its value.
scv.tl.latent_time(adata)
scv.pl.scatter(
    adata,
    color='latent_time',
    color_map='gnuplot',
    size=80,
    save="latent_time_pancreas_star_A_unif.png",
)
