In [None]:
# Import dependencies
import os
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import scvelo as scv
import seaborn as sns

import matplotlib.pyplot as plt

# Initialize random seeds.
# Both Python's `random` and NumPy's global generator are seeded so that
# stochastic steps drawing from either source are repeatable.
import random
random.seed(111)
np.random.seed(111)

# Print date and time (provenance of this run):
import datetime
e = datetime.datetime.now()
print ("Current date and time = %s" % e)

# set a working directory
# The default is machine-specific; set the TENDON_ANALYSIS_WDIR environment
# variable to run the notebook elsewhere without editing this cell.
#wdir = "/ceph/project/tendonhca/akurjan/analysis/"
wdir = os.environ.get(
    "TENDON_ANALYSIS_WDIR",
    "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks/",
)
os.chdir( wdir )

# folder structures (relative to the working directory)
NORMALIZATION_FOLDERNAME = "foetal/results/Normalisation/"
HARMONY_FOLDERNAME = "foetal/results/Harmony/"
RESULTS_FOLDERNAME = "foetal/results/Velocity/"
FIGURES_FOLDERNAME = "foetal/figures/Velocity/"

# Create the output folders if needed; exist_ok avoids the check-then-create race.
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)
os.makedirs(FIGURES_FOLDERNAME, exist_ok=True)

# Set folder for saving figures into
# scanpy and scvelo are both pointed at the same figures folder.
sc.settings.figdir = FIGURES_FOLDERNAME
scv.settings.figdir = FIGURES_FOLDERNAME
    
def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Tighten the layout of `fig` and write it to `folder`/`fname` as an SVG.

    fname:  file name of the image (joined onto `folder`).
    fig:    figure object exposing `tight_layout()` and `savefig()`.
    folder: destination directory; defaults to FIGURES_FOLDERNAME.
    """
    fig.tight_layout()
    target_path = os.path.join(folder, fname)
    fig.savefig(target_path, format='svg')

# Set other settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print package versions into the notebook output for provenance
sc.logging.print_versions()
# Apply the 'scvelo' figure parameter preset
scv.set_figure_params('scvelo')

In [None]:
# Load the tendon fibroblast object from the Harmony results folder
adata = scv.read(
    os.path.join(HARMONY_FOLDERNAME, 'allages_tendonfibro2.h5ad'),
    cache=False,
)
adata

In [None]:
# Overwrite the 'C_scANVI' labels with the values stored in 'C_scANVI_orig'
adata.obs['C_scANVI'] = adata.obs['C_scANVI_orig']

In [None]:
# Put a copy of the 'counts' layer into X for the scvelo preprocessing below
adata.X = adata.layers['counts'].copy()

In [None]:
# Filter genes in place using a minimum total count of 20
sc.pp.filter_genes(
    adata,
    min_counts=20,
    inplace=True,
)

In [None]:
# Load the normalised object; it supplies the spliced/unspliced layers used below
fulldata = sc.read_h5ad(
    os.path.join(NORMALIZATION_FOLDERNAME, 'dev_adata_normalized.h5ad')
)
fulldata

In [None]:
# Drop the layers, obsm and uns entries that are not used below;
# the 'spliced' and 'unspliced' layers are kept because they are copied into adata later.
del fulldata.layers['counts'], fulldata.layers['log1p_norm'], fulldata.layers['scaled'], fulldata.obsm, fulldata.uns

In [None]:
# Make gene names unique before indexing fulldata by name
fulldata.var_names_make_unique()

In [None]:
# Subset fulldata to exactly the cells and genes of adata, in adata's order
adata_cells = adata.obs_names
adata_genes = adata.var_names
fulldata = fulldata[adata_cells, adata_genes].copy()
fulldata

In [None]:
# Gene order must match exactly: the spliced/unspliced layers are copied
# across positionally, so fail loudly instead of only printing the check.
var_names_match = np.array_equal(fulldata.var_names, adata.var_names)
print(var_names_match)
assert var_names_match, "fulldata and adata gene names/order differ"

In [None]:
# Cell order must match exactly: the spliced/unspliced layers are copied
# across positionally, so fail loudly instead of only printing the check.
obs_names_match = np.array_equal(fulldata.obs_names, adata.obs_names)
print(obs_names_match)
assert obs_names_match, "fulldata and adata cell names/order differ"

In [None]:
# Copy the spliced/unspliced layers from fulldata (already subset to adata's cells and genes)
adata.layers['spliced'] = fulldata.layers['spliced'].copy()
adata.layers['unspliced'] = fulldata.layers['unspliced'].copy()

In [None]:
# Eyeball a small slice of X and of the two velocity layers
print(adata.X[1:5, 1:5])
print(adata.layers['spliced'][1:5, 1:5])
print(adata.layers['unspliced'][1:5, 1:5])

In [None]:
# fulldata is no longer needed once its layers have been copied; remove the reference
del fulldata

In [None]:
# Spliced/unspliced proportions, grouped by 'sampletype'
scv.pl.proportions(adata, groupby='sampletype')

In [None]:
# dtypes before the cast in the next cell
print(adata.X.dtype)
print(adata.layers['spliced'].dtype)
print(adata.layers['unspliced'].dtype)

In [None]:
# Cast both velocity layers to float32
adata.layers['spliced'] = adata.layers['spliced'].astype('float32')
adata.layers['unspliced'] = adata.layers['unspliced'].astype('float32')

In [None]:
# dtypes after the cast (the two layers should now report float32)
print(adata.X.dtype)
print(adata.layers['spliced'].dtype)
print(adata.layers['unspliced'].dtype)

In [None]:
#adata.X = adata.layers['counts'].copy()
#print(adata.X[1:5, 1:5])
#print(adata.layers['spliced'][1:5, 1:5])
#print(adata.layers['unspliced'][1:5, 1:5])

In [None]:
# scvelo preprocessing: shared-count filter, top 2000 genes, log transform;
# the object is subset to the highly variable genes.
scv.pp.filter_and_normalize(
    adata,
    min_shared_counts=20,
    n_top_genes=2000,
    log=True,
    subset_highly_variable=True,
)

In [None]:
# Inspect the object after filtering
adata

In [None]:
# Neighbour graph on the 'X_diff' representation with 80 neighbours.
# NOTE(review): 'X_diff' must already exist in adata.obsm; later cells refer to 'msdiff' / 'X_msdiff' — confirm the key.
sc.pp.neighbors(adata,n_neighbors=80,use_rep="X_diff")
# n_pcs=None / n_neighbors=None: presumably lets scvelo reuse the graph computed above — TODO confirm against the scvelo docs.
scv.pp.moments(adata,n_pcs=None,n_neighbors=None)

In [None]:
# Checkpoint: preprocessed object
adata.write(
    os.path.join(RESULTS_FOLDERNAME, 'allages_tendonfibro_prepped2.h5ad')
)

# RNA Velocity

## Dynamic Modelling

In [None]:
%%time
# Fit the dynamical model (timed by the %%time cell magic above)
scv.tl.recover_dynamics(adata, n_jobs=None)

In [None]:
# Checkpoint: object with the recovered dynamics
adata.write(
    os.path.join(RESULTS_FOLDERNAME, 'allages_tendonfibro_velocity2.h5ad')
)

In [None]:
# Reload the checkpoint written above
adata = scv.read(
    os.path.join(RESULTS_FOLDERNAME, 'allages_tendonfibro_velocity2.h5ad'),
    cache=False,
)
adata

In [None]:
# Velocities from the dynamical model, then the velocity graph
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata, n_jobs=None)

In [None]:
# Checkpoint: overwrite the velocity file, now including velocities and the graph
adata.write(
    os.path.join(RESULTS_FOLDERNAME, 'allages_tendonfibro_velocity2.h5ad')
)

From https://scvelo.readthedocs.io/VelocityBasics/:

"The black line corresponds to the estimated ‘steady-state’ ratio, i.e. the ratio of unspliced to spliced mRNA abundance which is in a constant transcriptional state. RNA velocity for a particular gene is determined as the residual, i.e. how much an observation deviates from that steady-state line. Positive velocity indicates that a gene is up-regulated, which occurs for cells that show higher abundance of unspliced mRNA for that gene than expected in steady state. Conversely, negative velocity indicates that a gene is down-regulated."

## Differential Kinetic Test
Distinct cell types and lineages may exhibit different kinetic regimes, as these can be governed by different network structures. Even if cell types or lineages are related, their kinetics can differ due to alternative splicing, alternative polyadenylation and modulations in degradation.

The dynamical model allows us to address this issue with a likelihood ratio test for differential kinetics, which detects clusters/lineages whose kinetic behavior cannot be sufficiently explained by a single model for the overall dynamics. For each cell type, we test whether an independent fit yields a significantly improved likelihood.

The likelihood ratio, following an asymptotic chi-squared distribution, can be tested for significance. Note that for efficiency reasons, by default an orthogonal regression is used instead of a full phase trajectory to test whether a cluster is well explained by the overall kinetic or exhibits a different kinetic.

In [None]:
# Test the 100 genes with the highest fit likelihood for differential kinetics per C_scANVI group
top_genes = (
    adata.var['fit_likelihood']
    .sort_values(ascending=False)
    .index[:100]
)
scv.tl.differential_kinetic_test(
    adata,
    var_names=top_genes,
    groupby='C_scANVI',
)

In [None]:
# Phase portraits for top genes 1-15, outlined by the 'fit_diff_kinetics' result
kwargs = dict(linewidth=2, add_linfit=True, frameon=False)
scv.pl.scatter(
    adata,
    basis=top_genes[:15],
    ncols=5,
    add_outline='fit_diff_kinetics',
    **kwargs,
)

In [None]:
# Phase portraits for top genes 16-30 (reuses `kwargs` from the previous cell)
scv.pl.scatter(
    adata,
    basis=top_genes[15:30],
    ncols=5,
    add_outline='fit_diff_kinetics',
    **kwargs,
)

In [None]:
# Recomputing velocity, this time with diff_kinetics=True so the differential
# kinetic test results are taken into account, then rebuilding the velocity graph
scv.tl.velocity(adata, mode='dynamical', diff_kinetics=True)
scv.tl.velocity_graph(adata, n_jobs=None)

In [None]:
# Checkpoint: overwrite the velocity file with the diff-kinetics velocities
adata.write(
    os.path.join(RESULTS_FOLDERNAME, 'allages_tendonfibro_velocity2.h5ad')
)

In [None]:
# Ensure gene and cell names are unique
adata.var_names_make_unique()
adata.obs_names_make_unique()

In [None]:
# Rank velocity genes per C_scANVI group
scv.tl.rank_velocity_genes(adata, groupby='C_scANVI')

# Phase marker genes, each list re-ordered by its 'spearmans_score' column
s_genes, g2m_genes = scv.utils.get_phase_marker_genes(adata)
s_genes = scv.get_df(adata[:, s_genes], 'spearmans_score', sort_values=True).index
g2m_genes = scv.get_df(adata[:, g2m_genes], 'spearmans_score', sort_values=True).index

# Plot the first three genes of each sorted list
kwargs = dict(frameon=False, ylabel='cell cycle genes')
scv.pl.scatter(adata, list(s_genes[:3]) + list(g2m_genes[:3]), **kwargs)

In [None]:
# List the available embeddings (candidate `basis=` values for the plots below)
adata.obsm

In [None]:
# Velocity stream plot on the 'tsne' basis, coloured by C_scANVI
scv.pl.velocity_embedding_stream(
    adata, basis='tsne', color='C_scANVI',
    size=5, density=2.5, min_mass=0, arrow_size=0.8,
    legend_loc='none', legend_fontsize=6, fontsize=9,
    figsize=(5,5), dpi=300,
    save='allages_tendonfibro_streamMap_tsne2.svg',
)

In [None]:
# Velocity stream plot on the 'tsne' basis, coloured by age
scv.pl.velocity_embedding_stream(
    adata, basis='tsne', color='age',
    size=5, density=2.5, min_mass=0, arrow_size=0.8,
    legend_loc='none', legend_fontsize=6, fontsize=9,
    figsize=(5,5), dpi=300,
    save='allages_tendonfibro_streamMap_age_tsne2.svg',
)

In [None]:
# Velocity stream plot on the 'umap_orig' basis, coloured by C_scANVI
scv.pl.velocity_embedding_stream(
    adata, basis='umap_orig', color='C_scANVI',
    size=5, density=2.5, min_mass=0, arrow_size=0.8,
    legend_loc='none', legend_fontsize=6, fontsize=9,
    figsize=(5,5), dpi=300,
    save='allages_tendonfibro_streamMap_umap4.svg',
)

In [None]:
# Velocity stream plot on the 'umap_orig' basis, coloured by age
scv.pl.velocity_embedding_stream(
    adata, basis='umap_orig', color='age',
    size=5, density=2.5, min_mass=0, arrow_size=0.8,
    legend_loc='none', legend_fontsize=6, fontsize=9,
    figsize=(5,5), dpi=300,
    save='allages_tendonfibro_streamMap_age_umap4.svg',
)

In [None]:
# Velocity stream plot on the 'umap' basis, coloured by C_scANVI
scv.pl.velocity_embedding_stream(
    adata, basis='umap', color='C_scANVI',
    size=5, density=2.5, min_mass=0, arrow_size=0.8,
    legend_loc='none', legend_fontsize=6, fontsize=9,
    figsize=(5,5), dpi=300,
    save='allages_tendonfibro_streamMap_umap2.svg',
)

In [None]:
# Velocity stream plot on the 'umap' basis, coloured by age
scv.pl.velocity_embedding_stream(
    adata, basis='umap', color='age',
    size=5, density=2.5, min_mass=0, arrow_size=0.8,
    legend_loc='none', legend_fontsize=6, fontsize=9,
    figsize=(5,5), dpi=300,
    save='allages_tendonfibro_streamMap_age_umap2.svg',
)

In [None]:
# Per-gene velocity panels for a fixed list of marker genes
scv.pl.velocity(
    adata,
    ['TNMD', 'MKX', 'SCX', 'FMOD', 'EGR1', 'COL1A1', 'COL3A1'],
    basis='umap',
    color='C_scANVI',
    add_outline=True,  # optional: cmap='coolwarm'
)

In [None]:
# Rank dynamical genes per C_scANVI group; results are read from adata.uns['rank_dynamical_genes'] below
scv.tl.rank_dynamical_genes(adata, groupby='C_scANVI')

In [None]:
# Flatten the ranking into one table with '<group>_names' and '<group>_scores' columns
result = adata.uns['rank_dynamical_genes']
groups = result['names'].dtype.names
columns = {}
for group in groups:
    for key in ('names', 'scores'):
        columns[group + '_' + key] = result[key][group]
df = pd.DataFrame(columns)
#df.to_csv(os.path.join(RESULTS_FOLDERNAME, '20w_tendonfibro_highvelocity_genes_diffkinetics.csv'))
df.head(5)

In [None]:
# Replace `df` with the gene-name table (indexed by group name in the next cell)
df = scv.get_df(adata, 'rank_dynamical_genes/names')
df.head(5)

In [None]:
# For every group, plot its five top-ranked dynamical genes
for celltype in groups:
    scv.pl.scatter(adata, df[celltype][:5], ylabel=celltype, frameon=False, 
                   color='C_scANVI', dpi=150, add_outline=True)

## Differentiation Speed

The speed or rate of differentiation is given by the length of the velocity vector. The coherence of the vector field (i.e., how a velocity vector correlates with its neighboring velocities) provides a measure of confidence.

In [None]:
# Compute velocity confidence; 'velocity_length' and 'velocity_confidence' are plotted below
scv.tl.velocity_confidence(adata)

In [None]:
# Velocity length and confidence on the 'umap' basis, colour range clipped to the 5th-95th percentile
keys = ('velocity_length', 'velocity_confidence')
scv.pl.scatter(
    adata, basis='umap', c=keys, perc=[5,95], cmap='coolwarm',
    save='allages_magnitude_and_confidence2.svg', figsize=(5,4),
)

In [None]:
# Velocity length and confidence on the 'umap_orig' basis
keys = ('velocity_length', 'velocity_confidence')
scv.pl.scatter(
    adata, basis='umap_orig', c=keys, perc=[5,95], cmap='coolwarm',
    save='allages_magnitude_and_confidence_umap2.svg', figsize=(5,4),
)

In [None]:
# Velocity length and confidence on the 'draw_graph_fa' basis
keys = ('velocity_length', 'velocity_confidence')
scv.pl.scatter(
    adata, basis='draw_graph_fa', c=keys, perc=[5,95], cmap='coolwarm',
    save='allages_magnitude_and_confidence_fagraph2.svg', figsize=(5,4),
)

In [None]:
# Mean velocity length and confidence per C_scANVI group (rows = metric, columns = group).
# `keys` is a tuple; groupby column selection needs a list, because pandas >= 2.0
# rejects a tuple of several keys.
df = adata.obs.groupby('C_scANVI')[list(keys)].mean().T
df.style.background_gradient(cmap='coolwarm', axis=1)

In [None]:
# velocity graph showing all velocity-inferred cell-to-cell connections/transitions.
# Drawn with threshold=.3 and coloured by C_scANVI.
scv.pl.velocity_graph(adata, threshold=.3, color='C_scANVI', legend_loc='right margin')

In [None]:
# Keep genes with fit likelihood > 0.1 that are flagged as velocity genes.
# The comparison is parenthesised explicitly: `&` binds tighter than `==`, so
# without the parentheses the expression is parsed as `(likelihood & flag) == True`.
df = adata.var
df = df[(df['fit_likelihood'] > .1) & (df['velocity_genes'] == True)]

# Histograms of the fitted rates on a log x-axis
kwargs = dict(xscale='log', fontsize=16)
with scv.GridSpec(ncols=3) as pl:
    pl.hist(df['fit_alpha'], xlabel='transcription rate', **kwargs)
    pl.hist(df['fit_beta'] * df['fit_scaling'], xlabel='splicing rate', xticks=[.1, .4, 1], **kwargs)
    pl.hist(df['fit_gamma'], xlabel='degradation rate', xticks=[.1, .4, 1], **kwargs)

# First rows of the fit* parameter table
scv.get_df(adata, 'fit*', dropna=True).head()

## Latent Time

In [None]:
# Compute latent time; 'latent_time' is plotted and used for sorting below
scv.tl.latent_time(adata)

In [None]:
# Latent time on the 'umap' basis
scv.pl.scatter(
    adata,
    basis='umap',
    color='latent_time',
    color_map='gnuplot',
    size=30,
    save="allages_tendonfibro_scvelo_latent_time2.svg",
    show=True,
)

In [None]:
# The 300 genes with the highest fit likelihood (replaces the earlier 100-gene `top_genes`)
top_genes = adata.var['fit_likelihood'].sort_values(ascending=False).index[:300]
top_genes

In [None]:
# Heatmap of the top genes with cells sorted by 'latent_time'
scv.pl.heatmap(
    adata, var_names=top_genes, sortby='latent_time',
    color_map='magma', col_color='C_scANVI',
    n_convolve=100, figsize=(10,5),
    save='allages_latent_heatmap2.png',
)

In [None]:
# Heatmap of the top genes with cells sorted by 'velocity_pseudotime'
# (the column must already be present in adata.obs at this point)
scv.pl.heatmap(
    adata, var_names=top_genes, sortby='velocity_pseudotime',
    color_map='magma', col_color='C_scANVI',
    n_convolve=100, figsize=(10,5),
    save='allages_velocity_pseudotime_heatmap2.png',
)

In [None]:
# Heatmap of the top genes with cells sorted by 'palantir_pseudotime'
# (not computed in this notebook; presumably stored in the loaded object — confirm)
scv.pl.heatmap(
    adata, var_names=top_genes, sortby='palantir_pseudotime',
    color_map='magma', col_color='C_scANVI',
    n_convolve=100, figsize=(10,5),
    save='allages_palantir_pseudotime_heatmap2.png',
)

In [None]:
# Top five likelihood genes: default scatter, then expression against latent time
scv.pl.scatter(adata, top_genes[:5], frameon=False, color='C_scANVI')
scv.pl.scatter(adata, x='latent_time', y=top_genes[:5], frameon=False, color='C_scANVI')

In [None]:
# (Re)compute velocity pseudotime and plot it on the 'umap' basis
scv.tl.velocity_pseudotime(adata)
scv.pl.scatter(adata, basis='umap', color='velocity_pseudotime', save="velo_pseudotime2.svg", cmap='gnuplot')
#scv.pl.scatter(adata, basis='draw_graph_fa', color='velocity_pseudotime', save="velo_pseudotime_FA.svg", cmap='gnuplot')

In [None]:
# Checkpoint: final velocity object (read back at the start of the CellRank section)
adata.write(
    os.path.join(RESULTS_FOLDERNAME, 'allages_tendonfibro_velocity2.h5ad')
)

# CellRank

## Identifying Probable Terminal and Initial States

In [None]:
# Loads the Harmony-folder object again.
# NOTE: `adata` is reassigned by the very next cell (velocity results file), so
# this load only serves as a quick look at the pre-velocity object.
adata = scv.read(os.path.join(HARMONY_FOLDERNAME, 'allages_tendonfibro2.h5ad'), cache=False)
adata

In [None]:
# Load the velocity results written at the end of the RNA velocity section
adata = scv.read(
    os.path.join(RESULTS_FOLDERNAME, 'allages_tendonfibro_velocity2.h5ad'),
    cache=True,
)
adata.var_names_make_unique()
adata

In [None]:
# Recompute the neighbour graph (15 PCs, 30 neighbours, fixed random_state) and
# the diffusion map read in the next cell.
# NOTE(review): this replaces the 80-neighbour 'X_diff' graph used for the velocity steps — confirm that is intended.
sc.pp.neighbors(adata, n_pcs=15, n_neighbors=30, random_state=0)
sc.tl.diffmap(adata)

In [None]:
# Diffusion-map coordinates as a table indexed by cell name, for inspection
diffmapdf = pd.DataFrame(adata.obsm['X_diffmap'], index=adata.obs_names)
diffmapdf

In [None]:
# Root-cell selection. Alternatives that were tried (extreme diffusion component,
# or other hand-picked cells) are kept commented out for reference.
#cellid = adata.obsm['X_diffmap'][:, 8].argmax()
#cellid = adata.obsm['X_msdiff'][:, 4].argmax()
#cell = adata.obs.index[cellid]


# cell = 'AACCCAAAGCCAAGTG.DEV16134_Quad' # Highest original SCX expression:
cell = 'ACTTAGGGTTGTCCCT.DEV16127_Ach'# Highest MKX
# cell = 'CGGTCAGGTACCTGTA.DEV16569_Ach' # Highest NES
# cell = 'AGTCTCCCAGGTTTAC.DEV15985_Quad' # Highest TNMD

#cell = 'TCCCACACACAATGAA.DEV16135DEV16171_Quad' # highest TNP and division marker combined score
# Integer position of the chosen cell in adata.obs (raises KeyError if the barcode is absent)
cellid = adata.obs.index.get_loc(cell)
print(adata.obs[adata.obs.index == cell][['C_scANVI', 'phase', 'age']])

# Show the root cell next to age and cell type on the 'msdiff' basis
root_ixs = cellid 
scv.pl.scatter(
    adata,
    basis="msdiff",
    c=["age", "C_scANVI", root_ixs],
    legend_loc="right",
    components=["2, 3"],
)

# Store the root index under 'iroot' (sc.tl.dpt is run in a later cell)
adata.uns["iroot"] = root_ixs

In [None]:
# Same view as above, on components 1 and 2 of the 'msdiff' basis
scv.pl.scatter(
    adata, basis="msdiff", components=["1,2"],
    c=["age", "C_scANVI", root_ixs], legend_loc="right",
)

In [None]:
# Same view as above, on components 3 and 4 of the 'msdiff' basis
scv.pl.scatter(
    adata, basis="msdiff", components=["3,4"],
    c=["age", "C_scANVI", root_ixs], legend_loc="right",
)

In [None]:
# UMAP coloured by Palantir pseudotime and by cell type
sc.pl.umap(
    adata, ncols=2, frameon=False, color_map="gnuplot2",
    color=["palantir_pseudotime", "C_scANVI"],
)

In [None]:
# Store a copy of the current object in adata.raw
adata.raw = adata.copy()

In [None]:
# Palantir pseudotime distribution per cell type
sc.pl.violin(
    adata,
    keys="palantir_pseudotime",
    groupby="C_scANVI",
    rotation=90,
)

In [None]:
# Diffusion pseudotime; 'dpt_pseudotime' is plotted below (adata.uns['iroot'] was set in the root-selection cell)
sc.tl.dpt(adata)

In [None]:
# The three pseudotimes side by side with the cell-type labels
sc.pl.umap(
    adata, ncols=2, frameon=False, color_map="gnuplot2",
    color=["dpt_pseudotime", "velocity_pseudotime", "palantir_pseudotime", "C_scANVI"],
    save='allages_tendonfibros_pseudotimes2.svg',
)

In [None]:
# Distribution of each pseudotime per cell type
sc.pl.violin(
    adata,
    keys=["dpt_pseudotime", "velocity_pseudotime", "palantir_pseudotime"],
    groupby="C_scANVI",
    rotation=90,
)

In [None]:
# Candidate trajectories, each an ordered list of C_scANVI cluster labels
abi3_trajectory = ["COL6A6 FNDC1 Fibroblasts", "ABI3BP GAS2 Fibroblasts 2", "ABI3BP GAS2 Fibroblasts 1"]
col3_trajectory = ["COL6A6 FNDC1 Fibroblasts", "COL3A1 PI16 Fibroblasts",]
fgf14_trajectory = ["FGF14 THBS4 Fibroblasts", "ABI3BP GAS2 Fibroblasts 1"]

# One set of pseudotime violins per trajectory, restricted to that trajectory's
# clusters and drawn in trajectory order.
for trajectory in (abi3_trajectory, col3_trajectory, fgf14_trajectory):
    # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0
    mask = np.isin(adata.obs["C_scANVI"], trajectory)
    sc.pl.violin(
        adata[mask],
        keys=["dpt_pseudotime", "velocity_pseudotime", "palantir_pseudotime"],
        groupby="C_scANVI",
        rotation=-90,
        order=trajectory,
    )

### Selecting Kernels

In [None]:
import cellrank as cr
from cellrank.tl.kernels import VelocityKernel
from cellrank.tl.kernels import PseudotimeKernel
from cellrank.tl.kernels import ConnectivityKernel

#vk = VelocityKernel(adata).compute_transition_matrix()
#ck = ConnectivityKernel(adata).compute_transition_matrix()
#combined_kernel = 0.8 * vk + 0.2 * ck
#print(combined_kernel)

In [None]:
# Pseudotime kernel driven by 'palantir_pseudotime', with its transition matrix computed
pk = PseudotimeKernel(adata, time_key="palantir_pseudotime").compute_transition_matrix()

In [None]:
# Simulate 300 random walks on the pseudotime kernel, starting from one cluster.
# NOTE(review): the start label "ABI3BP GAS2 Fibroblasts" has no numeric suffix,
# whereas the trajectory lists elsewhere in this notebook use "ABI3BP GAS2 Fibroblasts 1"
# and "... 2", and the output file is named 'ABI_1_...'. Confirm the label exists
# in adata.obs['C_scANVI'].
pk.plot_random_walks(n_sims=300, start_ixs={"C_scANVI": "ABI3BP GAS2 Fibroblasts"}, 
                     max_iter=1000, seed=0, basis='umap', s=100,
                     figsize=(7,7), save='ABI_1_random_walks_palantirpseudo.svg')

In [None]:
#vk.plot_random_walks(n_sims=300, start_ixs={"C_scANVI": "ABI3BP GAS2 Fibroblasts 2"}, 
#                     max_iter=1000, seed=0, basis='umap', s=100,
#                    figsize=(7,7))

### Estimating (Palantir Pseudotime Kernel)

In [None]:
from cellrank.tl.estimators import GPCCA

# GPCCA estimator on the pseudotime kernel: Schur decomposition with 20 components,
# then the spectrum plot used to choose the number of macrostates
g2 = GPCCA(pk)
g2.compute_schur(n_components=20)
g2.plot_spectrum(real_only=True, show_eigengap=True)

In [None]:
# Eight macrostates, named after the C_scANVI clusters, shown as discrete labels
g2.compute_macrostates(n_states=8, cluster_key="C_scANVI")
g2.plot_macrostates(
    discrete=True, basis='umap', s=100,
    legend_fontsize=9, legend_loc='right margin',
    #save='allages_tendonfibro_macrostates_discrete.svg',
    figsize=(5,4),
)

In [None]:
# Coarse-grained transition matrix between macrostates
g2.plot_coarse_T()

In [None]:
# One panel per macrostate
g2.plot_macrostates(
    same_plot=False,
    basis='umap',
    ncols=3,
    #save='allages_tendonfibro_macrostates_separated.svg',
)

In [None]:
# Reference UMAPs: age, cell-cycle phase and cell type
sc.pl.umap(
    adata,
    color=['age', 'phase', 'C_scANVI'],
    frameon=False,
)

In [None]:
# NOTE(review): 'fibro_louvain02' and 'fibro_louvain04' are not computed anywhere in
# the visible notebook (the louvain call below is commented out); presumably they
# are already stored in the loaded .h5ad — confirm.
#sc.tl.louvain(adata, resolution=0.2, key_added='fibro_louvain02')
sc.pl.umap(adata, color=['fibro_louvain02', 'fibro_louvain04'], frameon=False, legend_loc='on data')

In [None]:
# The same Louvain labels on the 'umap_orig' basis
sc.pl.embedding(
    adata,
    basis='umap_orig',
    color=['fibro_louvain02', 'fibro_louvain04'],
    frameon=False,
    legend_loc='on data',
)

In [None]:
# Wilcoxon marker genes per 'fibro_louvain02' cluster, computed on the 'log1p_norm' layer
sc.tl.rank_genes_groups(
    adata,
    groupby='fibro_louvain02',
    method='wilcoxon',
    key_added='wilcoxon_fiblou02',
    use_raw=False,
    layer='log1p_norm',
)
sc.pl.rank_genes_groups(
    adata,
    n_genes=25,
    sharey=False,
    key='wilcoxon_fiblou02',
)

In [None]:
# Expression of selected genes on the UMAP ('log1p_norm' layer, colour scale capped at the 99th percentile)
sc.pl.embedding(
    adata,
    basis='umap',
    color=['PRG4', 'CREB5', 'MKX', 'EGR1', 'PTCH2', 'EBF2', 'COL4A1', 'POSTN', 'NEGR1',
           'COL6A6', 'PDGFRA', 'BMP5', 'TSHZ2', 'FSTL5'],
    vmin=0, vmax="p99", sort_order=False,
    cmap="Reds", use_raw=False, layer='log1p_norm', frameon=False,
)

In [None]:
# Dynamical gene ranking per 'fibro_louvain02' cluster.
# This overwrites adata.uns['rank_dynamical_genes'] from the earlier C_scANVI ranking.
scv.tl.rank_dynamical_genes(adata, groupby='fibro_louvain02')
result = adata.uns['rank_dynamical_genes']
groups = result['names'].dtype.names
df = scv.get_df(adata, 'rank_dynamical_genes/names')
# Five top-ranked genes per cluster
for celltype in groups:
    scv.pl.scatter(adata, df[celltype][:5], ylabel=celltype, frameon=False, 
                   color='fibro_louvain02', dpi=150, add_outline=True)

In [None]:
# Let the estimator choose terminal states automatically; the manual selection
# that was tried is kept commented out below.
g2.compute_terminal_states()
#g2.set_terminal_states_from_macrostates([
#                                         'COL3A1 PI16 Fibroblasts_1', 'COL3A1 PI16 Fibroblasts_2',
#                                         'COL3A1 PI16 Fibroblasts_3', 'COL3A1 PI16 Fibroblasts_4',
#                                         'FGF14 THBS4 Fibroblasts', 'ABI3BP GAS2 Fibroblasts_3', 
#                                       ])
# Number of cells assigned to each terminal state
adata.obs['terminal_states'].value_counts()

In [None]:
# Absorption probabilities towards the terminal states, then a circular projection by age and cell type
g2.compute_absorption_probabilities()
cr.pl.circular_projection(
    adata,
    keys=['age', 'C_scANVI'],
    legend_loc="right",
    s=10,
    alpha=0.9,
    save='allages_tendonfibro_circular_projection_pseudokernel2.svg',
    figsize=(25,25),
)

In [None]:
# Circular projection by Louvain cluster and cell type (not saved)
cr.pl.circular_projection(
    adata,
    keys=['fibro_louvain02', 'C_scANVI'],
    legend_loc="right",
    s=10,
    alpha=0.9,
    figsize=(25,25),
)

In [None]:
# Circular projection by cell-cycle phase and cell type.
# Saved under its own file name: the previous name was identical to the one used
# for the age projection, so this figure overwrote that one.
cr.pl.circular_projection(adata, keys=['phase', 'C_scANVI'],
                          legend_loc="right", s=5,
                          save='allages_tendonfibro_circular_projection_phase_pseudokernel2.svg',
                          figsize=(25,25))

In [None]:
# Lineage drivers for all lineages, exported as one CSV
lin_drivers2 = g2.compute_lineage_drivers()
lin_drivers2.to_csv(os.path.join(RESULTS_FOLDERNAME, 'allages_tendonfibro_pseudokernel_lineagedrivers.csv'))

In [None]:
# Plot eight driver genes per terminal state; one SVG per lineage, with the
# lineage name embedded in the file name (names may contain spaces)
lineages = list(adata.obs['terminal_states'].cat.categories)
for i in lineages:
    g2.plot_lineage_drivers(i, n_genes=8, basis='umap', vmax=10,
                   cmap='plasma', save=f'allages_fibro_palantirlind_{i}.svg'
                   )
    print(i)

In [None]:
# Driver table per terminal state, kept in a dict keyed by state name and
# written to one CSV per state
lin_drivers_dict={}
for i in g2.terminal_states.cat.categories:
    lin_drivers_dict[i] = g2.compute_lineage_drivers(lineages=i,return_drivers=True)
    lin_drivers_dict[i].to_csv(os.path.join(RESULTS_FOLDERNAME, f'allages_tendonfibro_pseudokernel_lineagedrivers_{i}.csv'))

In [None]:
# Recompute drivers for all lineages after the per-lineage calls above
# (presumably so the correlation plots below can see every lineage — confirm)
g2.compute_lineage_drivers()

In [None]:
# define set of genes to annotate: the first 15 drivers listed for each of the two lineages
genes_oi = {
    "COL3A1 PI16 Fibroblasts_2": list(lin_drivers_dict['COL3A1 PI16 Fibroblasts_2'].index[:15]),
    "COL3A1 PI16 Fibroblasts_1": list(lin_drivers_dict['COL3A1 PI16 Fibroblasts_1'].index[:15])
}

# make sure all of these exist in AnnData
# (all(...) is required: asserting the bare list passes whenever it is non-empty)
assert all(
    gene in adata.var_names for genes in genes_oi.values() for gene in genes
), "Did not find all genes"

# compute mean gene expression across all cells
adata.var["mean expression"] = adata.X.A.mean(axis=0)

# visualize in a scatter plot
g2.plot_lineage_drivers_correlation(
    lineage_x="COL3A1 PI16 Fibroblasts_2",
    lineage_y="COL3A1 PI16 Fibroblasts_1",
    adjust_text=True,
    gene_sets=genes_oi,
    color="mean expression",
    legend_loc="none",
    figsize=(5, 5),
    dpi=150,
    fontsize=9,
    size=50,
    save='COL3_1_vs_COL3_2.svg'
)

In [None]:
# define set of genes to annotate: the first 15 drivers listed for each of the two lineages
genes_oi = {
    "COL3A1 PI16 Fibroblasts_3": list(lin_drivers_dict['COL3A1 PI16 Fibroblasts_3'].index[:15]),
    "COL3A1 PI16 Fibroblasts_2": list(lin_drivers_dict['COL3A1 PI16 Fibroblasts_2'].index[:15])
}

# make sure all of these exist in AnnData
# (all(...) is required: asserting the bare list passes whenever it is non-empty)
assert all(
    gene in adata.var_names for genes in genes_oi.values() for gene in genes
), "Did not find all genes"

# compute mean gene expression across all cells
adata.var["mean expression"] = adata.X.A.mean(axis=0)

# visualize in a scatter plot
g2.plot_lineage_drivers_correlation(
    lineage_x="COL3A1 PI16 Fibroblasts_3",
    lineage_y="COL3A1 PI16 Fibroblasts_2",
    adjust_text=True,
    gene_sets=genes_oi,
    color="mean expression",
    legend_loc="none",
    figsize=(5, 5),
    dpi=150,
    fontsize=9,
    size=50,
    save='COL3_3_vs_COL3_2.svg'
)

In [None]:
# define set of genes to annotate: the first 15 drivers listed for each of the two lineages
genes_oi = {
    "COL3A1 PI16 Fibroblasts_1": list(lin_drivers_dict['COL3A1 PI16 Fibroblasts_1'].index[:15]),
    "COL3A1 PI16 Fibroblasts_3": list(lin_drivers_dict['COL3A1 PI16 Fibroblasts_3'].index[:15])
}

# make sure all of these exist in AnnData
# (all(...) is required: asserting the bare list passes whenever it is non-empty)
assert all(
    gene in adata.var_names for genes in genes_oi.values() for gene in genes
), "Did not find all genes"

# compute mean gene expression across all cells
adata.var["mean expression"] = adata.X.A.mean(axis=0)

# visualize in a scatter plot
g2.plot_lineage_drivers_correlation(
    lineage_x="COL3A1 PI16 Fibroblasts_1",
    lineage_y="COL3A1 PI16 Fibroblasts_3",
    adjust_text=True,
    gene_sets=genes_oi,
    color="mean expression",
    legend_loc="none",
    figsize=(5, 5),
    dpi=150,
    fontsize=9,
    size=50,
    save='COL3_1_vs_COL3_3.svg'
)

In [None]:
# define set of genes to annotate: the first 15 drivers listed for each of the two lineages
genes_oi = {
    "ABI3BP GAS2 Fibroblasts 1_1": list(lin_drivers_dict['ABI3BP GAS2 Fibroblasts 1_1'].index[:15]),
    "ABI3BP GAS2 Fibroblasts 1_2": list(lin_drivers_dict['ABI3BP GAS2 Fibroblasts 1_2'].index[:15])
}

# make sure all of these exist in AnnData
# (all(...) is required: asserting the bare list passes whenever it is non-empty)
assert all(
    gene in adata.var_names for genes in genes_oi.values() for gene in genes
), "Did not find all genes"

# compute mean gene expression across all cells
adata.var["mean expression"] = adata.X.A.mean(axis=0)

# visualize in a scatter plot
g2.plot_lineage_drivers_correlation(
    lineage_x="ABI3BP GAS2 Fibroblasts 1_1",
    lineage_y="ABI3BP GAS2 Fibroblasts 1_2",
    adjust_text=True,
    gene_sets=genes_oi,
    color="mean expression",
    legend_loc="none",
    figsize=(5, 5),
    dpi=150,
    fontsize=9,
    size=50,
    save='ABI3_1_1_vs_ABI3_1_2.svg'
)

In [None]:
# define set of genes to annotate: the first 15 drivers listed for each of the two lineages
genes_oi = {
    "COL6A6 FNDC1 Fibroblasts": list(lin_drivers_dict['COL6A6 FNDC1 Fibroblasts'].index[:15]),
    "FGF14 THBS4 Fibroblasts": list(lin_drivers_dict['FGF14 THBS4 Fibroblasts'].index[:15])
}

# make sure all of these exist in AnnData
# (all(...) is required: asserting the bare list passes whenever it is non-empty)
assert all(
    gene in adata.var_names for genes in genes_oi.values() for gene in genes
), "Did not find all genes"

# compute mean gene expression across all cells
adata.var["mean expression"] = adata.X.A.mean(axis=0)

# visualize in a scatter plot
g2.plot_lineage_drivers_correlation(
    lineage_x="COL6A6 FNDC1 Fibroblasts",
    lineage_y="FGF14 THBS4 Fibroblasts",
    adjust_text=True,
    gene_sets=genes_oi,
    color="mean expression",
    legend_loc="none",
    figsize=(5, 5),
    dpi=150,
    fontsize=9,
    size=50,
    save='COL6_vs_FGF14.svg'
)

In [None]:
# Backward pseudotime kernel (backward=True), used below to derive the initial states;
# Schur decomposition and spectrum as for the forward estimator
pk_bk = PseudotimeKernel(adata, time_key="palantir_pseudotime", backward=True).compute_transition_matrix()
g_bk2 = GPCCA(pk_bk)
g_bk2.compute_schur(n_components=20)
g_bk2.plot_spectrum(real_only=True, show_eigengap=True)

In [None]:
# A single macrostate on the backward kernel, shown as a discrete label
g_bk2.compute_macrostates(n_states=1, cluster_key="C_scANVI")
g_bk2.plot_macrostates(
    legend_fontsize=9,
    basis='umap',
    discrete=True,
    save='allages_tendonfibro_macrostates_initial2.svg',
    figsize=(5,4),
)

In [None]:
# The same macrostate drawn with same_plot=False
g_bk2.plot_macrostates(
    same_plot=False,
    legend_fontsize=9,
    basis='umap',
    save='allages_tendonfibro_macrostates_initial_discrete2.svg',
    figsize=(5,4),
)

In [None]:
# On the backward estimator the "terminal" states are the initial states;
# the result is read from adata.obs['initial_states']
g_bk2.compute_terminal_states()
g_bk2.compute_absorption_probabilities()
adata.obs['initial_states'].value_counts()

In [None]:
# Latent time anchored on the CellRank initial/terminal state probabilities
scv.tl.recover_latent_time(
    adata,
    root_key="initial_states_probabilities",
    end_key="terminal_states_probabilities",
)
# Directed PAGA over C_scANVI clusters, with Palantir pseudotime as the time prior
scv.tl.paga(
    adata, groups="C_scANVI", threshold_root_end_prior=0.9,
    root_key="initial_states_probabilities",
    end_key="terminal_states_probabilities",
    use_time_prior="palantir_pseudotime",
)

In [None]:
# Fate probabilities per C_scANVI cluster as bar charts
cr.pl.cluster_fates(
    adata, mode="bar", cluster_key="C_scANVI", backward=False,
    ncols=5, figsize=(15,3),
    save='allages_tendonfibro_palantirpseudotime_directedPAGA_bar2.svg',
)

In [None]:
# Directed PAGA with fate pie charts on the 'umap' basis (C_scANVI clusters)
cr.pl.cluster_fates(
    adata, mode="paga_pie", cluster_key="C_scANVI", backward=False,
    basis="umap",
    legend_kwargs={"loc": "top right out"}, legend_loc="top left out",
    node_size_scale=7, edge_width_scale=10, max_edge_width=10,
    figsize=(7,7), title="directed PAGA",
    save='allages_tendonfibro_palantir_pseudotime_directedPAGA_umap2.svg',
)

In [None]:
# Directed PAGA with fate pie charts on the 'draw_graph_fa' basis (C_scANVI clusters)
cr.pl.cluster_fates(
    adata, mode="paga_pie", cluster_key="C_scANVI", backward=False,
    basis="draw_graph_fa",
    legend_kwargs={"loc": "top right out"}, legend_loc="top left out",
    node_size_scale=7, edge_width_scale=10, max_edge_width=10,
    figsize=(7,7), title="directed PAGA",
    save='allages_tendonfibro_palantir_pseudotime_directedPAGA_fa2.svg',
)

In [None]:
# Directed PAGA over the Louvain clusters, with Palantir pseudotime as the time prior
# (latent time is not recomputed here)
#scv.tl.recover_latent_time(
#    adata, root_key="initial_states_probabilities", end_key="terminal_states_probabilities"
#)
scv.tl.paga(
    adata, groups="fibro_louvain02", threshold_root_end_prior=0.9,
    root_key="initial_states_probabilities",
    end_key="terminal_states_probabilities",
    use_time_prior="palantir_pseudotime",
)

In [None]:
# Fate probabilities per Louvain cluster as bar charts (not saved)
cr.pl.cluster_fates(
    adata, mode="bar", cluster_key="fibro_louvain02", backward=False,
    ncols=5, figsize=(15,3),
    #save='allages_tendonfibro_velocitypseudotime_directedPAGA_bar.svg',
)

In [None]:
# Directed PAGA with fate pie charts on the 'umap' basis (Louvain clusters, not saved)
cr.pl.cluster_fates(
    adata, mode="paga_pie", cluster_key="fibro_louvain02", backward=False,
    basis="umap",
    legend_kwargs={"loc": "top right out"}, legend_loc="top left out",
    node_size_scale=7, edge_width_scale=10, max_edge_width=10,
    figsize=(7,7), title="directed PAGA",
    #save='allages_tendonfibro_palantir_pseudotime_directedPAGA_umap2.svg',
)

In [None]:
# Directed PAGA over C_scANVI clusters, now with velocity pseudotime as the time prior
scv.tl.paga(
    adata, groups="C_scANVI", threshold_root_end_prior=0.9,
    root_key="initial_states_probabilities",
    end_key="terminal_states_probabilities",
    use_time_prior="velocity_pseudotime",
)

# Fate probabilities per C_scANVI cluster as bar charts
cr.pl.cluster_fates(
    adata, mode="bar", cluster_key="C_scANVI", backward=False,
    ncols=5, figsize=(15,3),
    save='allages_tendonfibro_velocitypseudotime_directedPAGA_bar.svg',
)

In [None]:
# Directed PAGA with fate pie charts on the 'umap' basis (velocity-pseudotime prior, C_scANVI clusters)
cr.pl.cluster_fates(
    adata, mode="paga_pie", cluster_key="C_scANVI", backward=False,
    basis="umap",
    legend_kwargs={"loc": "top right out"}, legend_loc="top left out",
    node_size_scale=7, edge_width_scale=10, max_edge_width=10,
    figsize=(7,7), title="directed PAGA",
    save='allages_tendonfibro_velocitypseudotime_directedPAGA_umap.svg',
)

In [None]:
# Directed PAGA over the Louvain clusters, with velocity pseudotime as the time prior
scv.tl.paga(
    adata, groups="fibro_louvain02", threshold_root_end_prior=0.9,
    root_key="initial_states_probabilities",
    end_key="terminal_states_probabilities",
    use_time_prior="velocity_pseudotime",
)

In [None]:
# Fate probabilities per Louvain cluster as bar charts (not saved)
cr.pl.cluster_fates(
    adata, mode="bar", cluster_key="fibro_louvain02", backward=False,
    ncols=5, figsize=(15,15),
    #save='allages_tendonfibro_velocitypseudotime_directedPAGA_bar.svg',
)

In [None]:
# Directed PAGA with fate pie charts on the 'umap' basis (velocity-pseudotime prior, Louvain clusters, not saved)
cr.pl.cluster_fates(
    adata, mode="paga_pie", cluster_key="fibro_louvain02", backward=False,
    basis="umap",
    legend_kwargs={"loc": "top right out"}, legend_loc="top left out",
    node_size_scale=7, edge_width_scale=10, max_edge_width=10,
    figsize=(7,7), title="directed PAGA",
    #save='allages_tendonfibro_velocitypseudotime_directedPAGA_umap.svg',
)

In [None]:
# NOTE(review): this filter is global — every warning raised by any later cell
# in this kernel is silenced as well, not only those from the model fitting below.
import warnings
warnings.filterwarnings("ignore")

# GAM with 6 knots; the same `model` object is reused by the heatmap cell below.
model = cr.models.GAM(adata, n_knots=6)

# Expression trends of the tenocyte gene panel along Palantir pseudotime,
# read from the "MAGIC_imputed_data" key and drawn without the individual
# cells; written to the figure directory as SVG.
cr.pl.gene_trends(
    adata,
    model=model,
    genes=["MKX", "TNMD", "FMOD", "COL1A1", "COL3A1", "KERA"],
    data_key="MAGIC_imputed_data",
    time_key="palantir_pseudotime",
    weight_threshold=(1e-3, 1e-3),
    same_plot=True,
    hide_cells=True,
    ncols=2,
    save='allages_tendonfibro_palantir_pseudotime_genetrendstenocyte.svg',
)

In [None]:
# Lineage driver table for every terminal state of the `g2` estimator,
# keyed by terminal-state name.
lin_drivers_dict = {
    lineage: g2.compute_lineage_drivers(lineages=lineage, return_drivers=True)
    for lineage in g2.terminal_states.cat.categories
}

# One heatmap per terminal state, showing the first 40 rows of its driver
# table along Palantir pseudotime; each figure is saved as
# "<lineage>_lineage_heatmap.svg".
for lineage in g2.terminal_states.cat.categories:
    top_drivers = lin_drivers_dict[lineage].head(40).index
    cr.pl.heatmap(
        adata,
        model=model,  # GAM model created in an earlier cell
        lineages=lineage,
        genes=top_drivers,
        cluster_key="C_scANVI",
        data_key="MAGIC_imputed_data",
        time_key="palantir_pseudotime",
        weight_threshold=(1e-3, 1e-3),
        show_all_genes=True,
        figsize=(12, 10),
        save=f'{lineage}_lineage_heatmap.svg'
    )

### Estimating (Velocity + Connectivities Kernel)

In [None]:
# Build a GPCCA estimator on `combined_kernel` (defined before this section;
# note that the same name is rebound in the "Identifying Initial States" section).
g = GPCCA(combined_kernel)
# Schur decomposition with 20 components, then plot the real part of the
# spectrum with the eigengap marked to help choose the number of macrostates.
g.compute_schur(n_components=20)
g.plot_spectrum(real_only=True, show_eigengap=True)

Given that the eigengap is at component 2 and that components 0, 1, and 2 have relatively high real parts (Re), I specify three macrostates. This choice captures the clear separation of the first two macrostates and allows for a third state to capture finer-scale transitions. ABI3BP GAS2 Fibroblasts 1 and COL3A1 PI16 Fibroblasts are the two dominant macrostates.

In [None]:
# NOTE(review): the markdown cell above argues for three macrostates, but six
# are requested here — confirm which is intended.
g.compute_macrostates(cluster_key="C_scANVI", n_states=6)

# Discrete macrostate assignment on the UMAP. Saving is disabled.
g.plot_macrostates(
    basis='umap',
    discrete=True,
    s=100,
    legend_fontsize=9,
    figsize=(5, 4),
    # save='20w_macrostates_discrete.svg',
)

In [None]:
# Same discrete macrostate plot as in the previous cell, kept with the
# "allages" file name for optional export. Saving is disabled.
g.plot_macrostates(
    basis='umap',
    discrete=True,
    s=100,
    legend_fontsize=9,
    figsize=(5, 4),
    # save='allages_tendonfibro_macrostates_velocity_umap.svg',
)

In [None]:
# Plot the macrostates on the UMAP with same_plot=False (one panel per
# macrostate rather than a single combined plot). Saving is disabled.
g.plot_macrostates(same_plot=False, basis='umap',
                   #save='allages_tendonfibro_macrostates_separated_velocity_umap.svg',
                  )

In [None]:
#g.predict()

In [None]:
# Let the estimator `g` pick the terminal states automatically.
g.compute_terminal_states()
# Manual alternative, kept for reference: choose terminal states by macrostate name.
#g.set_terminal_states_from_macrostates(['COL3A1 PI16 Fibroblasts_1', 'ABI3BP GAS2 Fibroblasts 1_2'])
# Show the number of cells per category in adata.obs['terminal_states'].
adata.obs['terminal_states'].value_counts()

In [None]:
# Absorption probabilities for the estimator `g`.
g.compute_absorption_probabilities()

# Circular projection, coloured by cell-cycle phase and by C_scANVI cluster.
# Saving is disabled.
cr.pl.circular_projection(
    adata,
    keys=['phase', 'C_scANVI'],
    s=100,
    legend_loc="right",
    figsize=(25, 25),
    # save='20w_circular_projection.svg',
)

In [None]:
# Same circular projection, coloured by age and by C_scANVI cluster.
# Saving is disabled.
cr.pl.circular_projection(
    adata,
    keys=['age', 'C_scANVI'],
    s=100,
    legend_loc="right",
    figsize=(25, 25),
    # save='20w_circular_projection.svg',
)

### Computing lineage drivers
We can compute the driver genes for all or just a subset of lineages. We can also restrict this to some subset of clusters by specifying clusters=... (not shown below). In the resulting dataframe, we also see the p-value, the corrected p-value (q-value) and the 95% confidence interval for the correlation statistic.

In [None]:
# Driver genes for all lineages of the estimator `g`.
# NOTE(review): `lin_drivers` is not referenced again in the cells below; they
# use `lin_drivers_abi3`, `lin_drivers_fgf14` and `lin_drivers_col3`, which are
# not defined in this part of the notebook — confirm where those come from.
lin_drivers = g.compute_lineage_drivers()

In [None]:
# For each terminal state, plot 8 driver genes on the UMAP and save the figure
# as "allages_fibro_velocitylind_<terminal state>.svg".
lineages = list(adata.obs['terminal_states'].cat.categories)
for i in lineages:
    g.plot_lineage_drivers(
        i,
        basis='umap',
        n_genes=8,
        cmap='magma',
        save=f'allages_fibro_velocitylind_{i}.svg',
    )

In [None]:
# Genes to annotate in the scatter plot: the first 15 index entries of each
# lineage's driver table.
# NOTE(review): `lin_drivers_abi3` / `lin_drivers_fgf14` are not defined in the
# visible part of the notebook — confirm they exist before this cell runs.
genes_oi = {
    "ABI3BP GAS2 Fibroblasts 1": list(lin_drivers_abi3.index[:15]),
    "FGF14 THBS4 Fibroblasts": list(lin_drivers_fgf14.index[:15])
}

# Make sure all of these exist in AnnData. The membership flags must go through
# all(): asserting on the bare list always passes, because a non-empty list is
# truthy regardless of its contents.
assert all(
    gene in adata.var_names for genes in genes_oi.values() for gene in genes
), "Did not find all genes"

# Mean expression per gene across all cells. Taking the mean on adata.X
# directly works for both sparse and dense matrices and avoids materialising a
# dense copy of the whole matrix (which `.A` did).
adata.var["mean expression"] = np.asarray(adata.X.mean(axis=0)).ravel()

# Scatter plot of driver correlations for the two lineages, annotated with
# the genes selected above and coloured by mean expression.
g.plot_lineage_drivers_correlation(
    lineage_x="FGF14 THBS4 Fibroblasts",
    lineage_y="ABI3BP GAS2 Fibroblasts 1",
    adjust_text=True,
    gene_sets=genes_oi,
    color="mean expression",
    legend_loc="none",
    figsize=(5, 5),
    dpi=150,
    fontsize=9,
    size=50,
)

In [None]:
# Genes to annotate in the scatter plot: the first 15 index entries of each
# lineage's driver table.
# NOTE(review): `lin_drivers_abi3` / `lin_drivers_col3` are not defined in the
# visible part of the notebook — confirm they exist before this cell runs.
genes_oi = {
    "ABI3BP GAS2 Fibroblasts 1": list(lin_drivers_abi3.index[:15]),
    "COL3A1 PI16 Fibroblasts": list(lin_drivers_col3.index[:15])
}

# Make sure all of these exist in AnnData. The membership flags must go through
# all(): asserting on the bare list always passes, because a non-empty list is
# truthy regardless of its contents.
assert all(
    gene in adata.var_names for genes in genes_oi.values() for gene in genes
), "Did not find all genes"

# Mean expression per gene across all cells. Taking the mean on adata.X
# directly works for both sparse and dense matrices and avoids materialising a
# dense copy of the whole matrix (which `.A` did).
adata.var["mean expression"] = np.asarray(adata.X.mean(axis=0)).ravel()

# Scatter plot of driver correlations for the two lineages, annotated with
# the genes selected above and coloured by mean expression.
g.plot_lineage_drivers_correlation(
    lineage_x="ABI3BP GAS2 Fibroblasts 1",
    lineage_y="COL3A1 PI16 Fibroblasts",
    adjust_text=True,
    gene_sets=genes_oi,
    color="mean expression",
    legend_loc="none",
    figsize=(5, 5),
    dpi=150,
    fontsize=9,
    size=50,
)

In [None]:
# Genes to annotate in the scatter plot: the first 15 index entries of each
# lineage's driver table.
# NOTE(review): `lin_drivers_col3` / `lin_drivers_fgf14` are not defined in the
# visible part of the notebook — confirm they exist before this cell runs.
genes_oi = {
    "COL3A1 PI16 Fibroblasts": list(lin_drivers_col3.index[:15]),
    "FGF14 THBS4 Fibroblasts": list(lin_drivers_fgf14.index[:15])
}

# Make sure all of these exist in AnnData. The membership flags must go through
# all(): asserting on the bare list always passes, because a non-empty list is
# truthy regardless of its contents.
assert all(
    gene in adata.var_names for genes in genes_oi.values() for gene in genes
), "Did not find all genes"

# Mean expression per gene across all cells. Taking the mean on adata.X
# directly works for both sparse and dense matrices and avoids materialising a
# dense copy of the whole matrix (which `.A` did).
adata.var["mean expression"] = np.asarray(adata.X.mean(axis=0)).ravel()

# Scatter plot of driver correlations for the two lineages, annotated with
# the genes selected above and coloured by mean expression.
g.plot_lineage_drivers_correlation(
    lineage_x="FGF14 THBS4 Fibroblasts",
    lineage_y="COL3A1 PI16 Fibroblasts",
    adjust_text=True,
    gene_sets=genes_oi,
    color="mean expression",
    legend_loc="none",
    figsize=(5, 5),
    dpi=150,
    fontsize=9,
    size=50,
)

### Identifying Initial States

In [None]:
# Backward-direction kernels (backward=True): a velocity kernel and a
# connectivity kernel, each with its transition matrix computed.
vk_bk = VelocityKernel(adata, backward=True).compute_transition_matrix()
ck_bk = ConnectivityKernel(adata, backward=True).compute_transition_matrix()
# Weighted combination: 0.8 velocity + 0.2 connectivity.
# NOTE(review): this rebinds `combined_kernel`, the same name the earlier
# `g = GPCCA(combined_kernel)` cell reads. Re-running that cell after this one
# would silently build `g` on the backward kernel; consider a distinct name.
combined_kernel = 0.8 * vk_bk + 0.2 * ck_bk
print(combined_kernel)

In [None]:
# GPCCA estimator on the backward combined kernel assigned in the previous cell.
g_bk = GPCCA(combined_kernel)
# Schur decomposition with 20 components, then plot the real part of the
# spectrum with the eigengap marked.
g_bk.compute_schur(n_components=20)
g_bk.plot_spectrum(real_only=True, show_eigengap=True)

In [None]:
# A single macrostate is requested on the backward estimator.
g_bk.compute_macrostates(cluster_key="C_scANVI", n_states=1)

# Macrostate plot on the UMAP. Saving is disabled.
g_bk.plot_macrostates(
    basis='umap',
    legend_fontsize=9,
    figsize=(5, 4),
    # save='20w_macrostates_initial.svg',
)

In [None]:
# Discrete view of the backward macrostate on the UMAP.
# `s=100` had been swallowed by the commented-out `save=` line, so this plot
# fell back to the default marker size; it is restored here to match the
# discrete macrostate plots of the forward estimator `g`. Saving stays disabled.
g_bk.plot_macrostates(
    legend_fontsize=9,
    discrete=True,
    basis='umap',
    s=100,
    #save='20w_macrostates_initialdiscrete.svg',
    figsize=(5, 4),
)

In [None]:
# Terminal states and absorption probabilities of the backward estimator.
g_bk.compute_terminal_states()
g_bk.compute_absorption_probabilities()
# Show the number of cells per category in adata.obs['initial_states']
# (presumably where the backward estimator's terminal states are written —
# confirm against the installed CellRank version).
adata.obs['initial_states'].value_counts()

In [None]:
# Aggregate a small tenocyte gene panel into a per-cell score named
# "tenocyte_score".
sc.tl.score_genes(
    adata,
    gene_list=["SCX", "MKX", "TNMD", "FMOD", "COL1A1"],
    score_name="tenocyte_score",
)

# Copy the macrostate labels and their colours from the estimator `g` onto AnnData.
adata.obs["macrostates"] = g.macrostates
adata.uns["macrostates_colors"] = g.macrostates_memberships.colors

# Violin plot of the score, one violin per macrostate.
sc.pl.violin(
    adata,
    keys="tenocyte_score",
    groupby="macrostates",
    rotation=90,
)

In [None]:
# Aggregate COL3A1 and LUM into a per-cell score named "col3lum_score".
sc.tl.score_genes(
    adata,
    gene_list=["COL3A1", "LUM"],
    score_name="col3lum_score",
)

# Copy the macrostate labels and their colours from the estimator `g` onto
# AnnData (repeats the assignment made in the previous cell).
adata.obs["macrostates"] = g.macrostates
adata.uns["macrostates_colors"] = g.macrostates_memberships.colors

# Violin plot of the score, one violin per macrostate.
sc.pl.violin(
    adata,
    keys="col3lum_score",
    groupby="macrostates",
    rotation=90,
)

# PAGA

In [None]:
# Latent time, with the initial/terminal state probability keys as root/end.
scv.tl.recover_latent_time(
    adata,
    root_key="initial_states_probabilities",
    end_key="terminal_states_probabilities",
)

# Directed PAGA over the C_scANVI clusters.
# NOTE(review): the time prior is "velocity_pseudotime", not the latent time
# computed just above — confirm this is intended.
scv.tl.paga(
    adata,
    groups="C_scANVI",
    root_key="initial_states_probabilities",
    end_key="terminal_states_probabilities",
    use_time_prior="velocity_pseudotime",
    threshold_root_end_prior=0.9,
)

In [None]:
# Per-cluster fate bar charts for the C_scANVI clusters (5 panels per row).
# Saving is disabled; uncomment `save` to write the SVG.
cr.pl.cluster_fates(
    adata,
    cluster_key="C_scANVI",
    mode="bar",
    backward=False,
    figsize=(15, 3),
    ncols=5,
    # save='20w_directedPAGA_bar.svg',
)

In [None]:
# Directed PAGA on the UMAP embedding with one fate pie chart per C_scANVI
# cluster. Edge widths are scaled down (2, capped at 3) compared with the
# earlier PAGA plots. Saving is disabled; uncomment `save` to write the SVG.
cr.pl.cluster_fates(
    adata,
    cluster_key="C_scANVI",
    mode="paga_pie",
    basis="umap",
    backward=False,
    title="directed PAGA",
    figsize=(7, 7),
    node_size_scale=7,
    edge_width_scale=2,
    max_edge_width=3,
    legend_loc="top left out",
    legend_kwargs=dict(loc="top right out"),
    # save='20w_directedPAGA_umap.svg',
)

We use pie charts to show cell fates averaged per cluster. Edges between clusters are given by transcriptomic similarity between the clusters, just as in normal PAGA.

In [None]:
# GAM model with 6 knots, used for the gene-trend and heatmap plots below.
# NOTE(review): the next cell re-creates the identical model on its first
# line, so this cell is redundant.
model = cr.models.GAM(adata, n_knots=6)

In [None]:
# GAM with 6 knots; reused by the heatmap cells that follow.
model = cr.models.GAM(adata, n_knots=6)

# Expression trends of the selected genes, read from "X", along
# "dpt_pseudotime" and drawn without the individual cells.
# NOTE(review): the heatmaps below use "velocity_pseudotime" as time key —
# confirm that "dpt_pseudotime" is the intended axis here.
cr.pl.gene_trends(
    adata,
    model=model,
    genes=["SCX", "MKX", "COL1A1", "COL3A1", "FMOD", "KERA", "LUM"],
    data_key="X",
    time_key="dpt_pseudotime",
    weight_threshold=(1e-3, 1e-3),
    same_plot=True,
    hide_cells=True,
    ncols=2,
)

In [None]:
# Heatmap for the "COL3A1 PI16 Fibroblasts" lineage: first 40 rows of
# `lin_drivers_col3`, ordered along velocity pseudotime.
# NOTE(review): `lin_drivers_col3` is not defined in the visible part of the
# notebook — confirm it exists before this cell runs.
cr.pl.heatmap(
    adata,
    model=model,  # GAM model created in an earlier cell
    lineages="COL3A1 PI16 Fibroblasts",
    genes=lin_drivers_col3.head(40).index,
    cluster_key="C_scANVI",
    data_key="X",
    time_key="velocity_pseudotime",
    weight_threshold=(1e-3, 1e-3),
    show_all_genes=True,
    figsize=(12, 10),
)

In [None]:
# Heatmap for the "FGF14 THBS4 Fibroblasts" lineage: first 40 rows of
# `lin_drivers_fgf14`, ordered along velocity pseudotime.
# NOTE(review): `lin_drivers_fgf14` is not defined in the visible part of the
# notebook — confirm it exists before this cell runs.
cr.pl.heatmap(
    adata,
    model=model,  # GAM model created in an earlier cell
    lineages="FGF14 THBS4 Fibroblasts",
    genes=lin_drivers_fgf14.head(40).index,
    cluster_key="C_scANVI",
    data_key="X",
    time_key="velocity_pseudotime",
    weight_threshold=(1e-3, 1e-3),
    show_all_genes=True,
    figsize=(12, 10),
)

In [None]:
# Heatmap for the "ABI3BP GAS2 Fibroblasts 1" lineage: first 40 rows of
# `lin_drivers_abi3`, ordered along velocity pseudotime.
# NOTE(review): `lin_drivers_abi3` is not defined in the visible part of the
# notebook — confirm it exists before this cell runs.
cr.pl.heatmap(
    adata,
    model=model,  # GAM model created in an earlier cell
    lineages="ABI3BP GAS2 Fibroblasts 1",
    genes=lin_drivers_abi3.head(40).index,
    cluster_key="C_scANVI",
    data_key="X",
    time_key="velocity_pseudotime",
    weight_threshold=(1e-3, 1e-3),
    show_all_genes=True,
    figsize=(12, 10),
)

# SCFATES

In [None]:
import scFates as scf

# Fit a tree with the "ppt" method: 200 nodes on the "msdiff" representation,
# run on CPU with a fixed seed.
scf.tl.tree(
    adata,
    method="ppt",
    Nodes=200,
    use_rep="msdiff",
    ppt_lambda=100,
    ppt_sigma=0.025,
    ppt_nsteps=200,
    device="cpu",
    seed=1,
)

In [None]:
# Draw the fitted scFates graph on the UMAP embedding (presumably used to read
# off the node id chosen as root in the next cell — confirm).
scf.pl.graph(adata, basis='umap')

In [None]:
# Set node 89 as the root of the tree.
# NOTE(review): 89 is a hard-coded node id; it is only meaningful for the tree
# fitted above and must be re-checked if the tree parameters or data change.
scf.tl.root(adata,89)

In [None]:
# Compute scFates pseudotime from the rooted tree, using 10 parallel jobs and a
# fixed seed. n_map=1000 is presumably the number of probabilistic
# cell-to-tree mappings — confirm against the scFates documentation.
scf.tl.pseudotime(adata,n_jobs=10,n_map=1000,seed=42)