In [None]:
'''
Goal: Check the Tabula Muris Senis lung dataset for a vessel-size gradient
'''

In [None]:
import scanpy as sc
import scanpy.external as sce
import os 
import pandas as pd 
import numpy as np
import seaborn as sns
from functions import compare_obs_values_within_groups_to_excel
import matplotlib.pyplot as plt
import palantir
# from statannotations.Annotator import Annotator

# Run-level settings: dataset tag, output locations and global plot styling.
adata_name = 'venous_ec'
figures = "data/figures/figures/tabula_muris_senis"
data = "data/single_cell_files/scanpy_files"
size = 15  # marker size reused by the UMAP / trajectory plotting cells

# Make sure the figure folder exists before any plot is saved into it.
os.makedirs(figures, exist_ok=True)

# Scanpy writes its own figures into `figures`.
sc.settings.figdir = figures

# The three styling calls below all touch matplotlib rcParams, so their
# relative order is kept: scanpy defaults, then seaborn style, then the font.
sc.set_figure_params(dpi_save=300, fontsize=10, figsize=(1.5, 1.5))
tick_marks = {'xtick.bottom': True, 'ytick.left': True}
sns.set_style('white', rc=tick_marks)
plt.rcParams["font.family"] = "Arial"

In [None]:
# Load the Tabula Muris Senis lung droplet object and prepare it for analysis.
# The location defaults to the original workstation path but can be overridden
# with the TMS_LUNG_H5AD environment variable so the notebook runs elsewhere.
tms_lung_h5ad = os.environ.get(
    'TMS_LUNG_H5AD',
    '/home/carsten/alvira_bioinformatics/data/external_datasets/tabula_muris_senis/raw_data/tabula_muris_senis_lung_droplet.h5ad',
)
adata_ts = sc.read(tms_lung_h5ad)

# Drop the stored .raw so only the matrix normalised below is carried forward.
del adata_ts.raw

# NOTE(review): this assumes adata_ts.X holds un-normalised counts at this
# point - confirm against the downloaded file, otherwise the data would be
# normalised twice.
sc.pp.normalize_total(adata_ts, target_sum=1e4)
sc.pp.log1p(adata_ts)

# Gene names: the part of `feature_name` before the first underscore,
# de-duplicated so they can serve as the var index.
adata_ts.var_names = adata_ts.var['feature_name'].str.split('_').str[0]
adata_ts.var_names_make_unique()
adata_ts.var

In [None]:
# Subset to the vascular endothelial annotations, relabel them, then run
# HVG selection -> PCA -> Harmony -> neighbours -> Leiden -> UMAP.
vessel_cts = ['Artery','Capillary','Vein']
subtype_names = {'Artery':'Arterial EC','Capillary':'Cap1','Vein':'Venous EC'}
# Insertion order defines the category order, which is what scanpy pairs
# with the `<key>_colors` list stored in .uns.
subtype_colors = {'Arterial EC':'#4A90E2','Cap1':'#9B59B6','Venous EC':'#E35D6A'}

# .copy() materialises the subset: slicing an AnnData returns a view, and the
# .obs/.uns writes below should land on an independent object.
adata_ts_vessel = adata_ts[adata_ts.obs['free_annotation'].isin(vessel_cts)].copy()

# Build the new label column in one assignment instead of a chained
# `obs[col].replace(..., inplace=True)`, which does not write back to the
# frame when pandas Copy-on-Write is active.
adata_ts_vessel.obs['Cell Subtype'] = pd.Categorical(
    adata_ts_vessel.obs['free_annotation'].astype(str).map(subtype_names),
    categories=list(subtype_colors),
)
adata_ts_vessel.uns['Cell Subtype_colors'] = list(subtype_colors.values())

sc.pp.highly_variable_genes(adata_ts_vessel)
sc.pp.pca(adata_ts_vessel, mask_var="highly_variable")
# Batch-correct the PCA space across donors; the neighbour graph is then
# built on the corrected representation.
sce.pp.harmony_integrate(adata_ts_vessel,'donor_id',max_iter_harmony = 20)
sc.pp.neighbors(adata_ts_vessel, use_rep="X_pca_harmony")
sc.tl.leiden(adata_ts_vessel, resolution=0.5)
sc.tl.rank_genes_groups(adata_ts_vessel,'leiden',method='wilcoxon')
sc.tl.umap(adata_ts_vessel, min_dist=1)

# Quick-look UMAPs: metadata first, then cluster markers, then a gene panel.
for color in ['donor_id','free_annotation','age','leiden','Gja5','Slc6a2','Plvap']:
    sc.pl.umap(adata_ts_vessel,color=color)
sc.pl.rank_genes_groups_dotplot(adata_ts_vessel,n_genes=10)
for color in ['free_annotation','leiden','Cdh5','Ptprc','Col1a1','Prox1','Apln','Aplnr','Gja5','Plvap','Col15a1','Tbx2','Kit','Fbln5','Scn7a','Mecom','Mgp','Vwf','Ncam1','Ncam2']:
    sc.pl.umap(adata_ts_vessel,color=color)

In [None]:
# Dot plot of a small gene panel across the relabelled vessel subtypes.
sc.pl.dotplot(
    adata_ts_vessel,
    ['Ackr1', 'Hdac9', 'Emcn', 'Vwf'],
    groupby=['Cell Subtype'],
    title='Vascular endos in Tabula muris senis',
)

In [None]:
import palantir
import cellrank as cr
import scvelo as scv

# Trajectory inference on the vessel subset: Palantir and DPT pseudotime from
# a capillary root towards arterial and venous terminal cells, followed by
# gene trends and a CellRank pseudotime-kernel projection.
root_ct = 'Cap1'
terminal_cts = ['Arterial EC','Venous EC']
celltype='Cell Subtype'

palantir.utils.run_diffusion_maps(adata_ts_vessel,
                                           n_components=5)
fig = palantir.plot.plot_diffusion_components(adata_ts_vessel)[0]
fig.tight_layout()
fig.savefig(f'{figures}/ts_palantir_diffusion_components.png')
plt.close()
palantir.utils.determine_multiscale_space(adata_ts_vessel)

palantir.utils.run_magic_imputation(adata_ts_vessel)

# Root cell: the `root_ct` cell with the largest value in column 1 of X_umap
# (the second UMAP axis).
# NOTE(review): root/terminal picks depend on the orientation of this
# particular UMAP layout - re-check them if the embedding is recomputed.
subset = adata_ts_vessel[adata_ts_vessel.obs[celltype] == root_ct]
root_cell = subset.obs_names[np.argmax(subset.obsm['X_umap'][:, 1])]

# Terminal cells: the arterial cell with the smallest value in UMAP column 1,
# and for every other terminal type the cell with the smallest value in
# UMAP column 0.
terminal_cells = []
for ct in terminal_cts:
    subset = adata_ts_vessel[adata_ts_vessel.obs[celltype] == ct]
    umap_axis = 1 if ct == 'Arterial EC' else 0
    extreme_idx = np.argmin(subset.obsm['X_umap'][:, umap_axis])
    terminal_cells.append(subset.obs_names[extreme_idx])

# Palantir expects terminal states as a Series: cell name -> label.
terminal_states = pd.Series(index=terminal_cells, data=terminal_cts, dtype='object')

# Highlight root + terminal cells. The collection is built as one labelled
# Series; adding a Python list to a Series (`[root_cell] + terminal_states`)
# is an arithmetic operation, not a concatenation of cell names.
highlighted_cells = pd.concat([
    pd.Series({root_cell: f'Root ({root_ct})'}, dtype='object'),
    terminal_states,
])
fig = palantir.plot.highlight_cells_on_umap(adata_ts_vessel, highlighted_cells)[0]
fig.tight_layout()
fig.savefig(f'{figures}/ts_palantir_terminal_cells.png')
plt.close()

palantir.core.run_palantir(
    adata_ts_vessel, root_cell, num_waypoints=500, terminal_states=terminal_states
)

fig = palantir.plot.plot_palantir_results(adata_ts_vessel, s=3)
fig.tight_layout()
fig.savefig(f'{figures}/ts_palantir_results.png')
plt.close()

# Diffusion pseudotime (scanpy) rooted at the same cell; `iroot` is the
# positional index of the root cell in .obs.
iroot = adata_ts_vessel.obs.index.get_loc(root_cell)
adata_ts_vessel.uns["iroot"] = iroot
sc.tl.dpt(adata_ts_vessel)

# Branch selection is best-effort, but a failure is reported instead of being
# silently swallowed: later trajectory plots rely on what this step stores.
try:
    palantir.presults.select_branch_cells(adata_ts_vessel, q=.01, eps=.01,pseudo_time_key='dpt_pseudotime')

    fig = palantir.plot.plot_branch_selection(adata_ts_vessel)
    fig.tight_layout()
    fig.savefig(f'{figures}/ts_palantir_branch_selection.png')
    plt.close()

except Exception as err:
    print(f'Palantir branch selection skipped: {err!r}')

sc.tl.diffmap(adata_ts_vessel)
# Diffusion-map scatter coloured by cell subtype and by the root cell index.
scv.pl.scatter(
    adata_ts_vessel,
    basis="diffmap",
    c=[celltype, iroot],
    legend_loc="right",
    components=["2, 3"],
    show=False,
    save=f'ts_diffmap_{celltype}_root_cell.png'
)


# Side-by-side UMAPs of the two pseudotime estimates.
sc.pl.embedding(
    adata_ts_vessel,
    basis="umap",
    color=["dpt_pseudotime", "palantir_pseudotime"],
    color_map="viridis",
    show=False,
    save='_ts__pseudotimes.png'
)

# Gene trends are fitted on the MAGIC-imputed expression along DPT pseudotime.
palantir.presults.compute_gene_trends(
    adata_ts_vessel,
    expression_key="MAGIC_imputed_data",
    pseudo_time_key='dpt_pseudotime'
)

# CellRank transition matrix biased by Palantir pseudotime, projected on UMAP.
pk = cr.kernels.PseudotimeKernel(adata_ts_vessel, time_key="palantir_pseudotime")
pk.compute_transition_matrix()
pk.plot_projection(basis="umap", color=celltype, recompute=True,legend_loc='right margin',
                         save=f'{figures}/ts_palantir_pseudotime_stream.png')


In [None]:
import pandas as pd
from scipy.stats import spearmanr

def correlate_genes_with_pseudotime(adata, layer=None, method='spearman',pseudotime='dpt_pseudotime'):
    """
    Correlate every gene with a pseudotime column of an AnnData-like object.

    Parameters:
    - adata: object exposing `.X`, `.layers`, `.obs`, `.obs_names` and
      `.var_names` (e.g. an AnnData)
    - layer: (Optional) key in `adata.layers` to use instead of `adata.X`
    - method: Correlation method, either 'spearman' (default) or 'pearson'
    - pseudotime: name of the `adata.obs` column holding the pseudotime
      values (default 'dpt_pseudotime')

    Returns:
    - pandas DataFrame indexed by gene with columns ['correlation', 'pval'],
      sorted by correlation in descending order. For 'pearson' the p-value is
      not computed and `pval` is None.

    Raises:
    - ValueError: if `pseudotime` is not a column of `adata.obs`, or if
      `method` is neither 'spearman' nor 'pearson'.
    """
    if pseudotime not in adata.obs:
        # Name the key that was actually requested, not a fixed placeholder.
        raise ValueError(f"Pseudotime must be stored in adata.obs['{pseudotime}'].")
    # Validate up front so a bad method is rejected even when there are no genes.
    if method not in ('spearman', 'pearson'):
        raise ValueError("Method must be 'spearman' or 'pearson'.")

    # Get expression matrix as a dense cells x genes DataFrame
    X = adata.X if layer is None else adata.layers[layer]
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X.toarray() if hasattr(X, "toarray") else X,
                         index=adata.obs_names, columns=adata.var_names)

    # Extract pseudotime under its own name instead of rebinding the parameter
    time_values = adata.obs[pseudotime]

    # Run correlation gene by gene
    results = []
    for gene in X.columns:
        if method == 'spearman':
            corr, pval = spearmanr(X[gene], time_values)
        else:
            corr, pval = X[gene].corr(time_values), None  # Pearson p-value not computed here
        results.append((gene, corr, pval))

    result_df = pd.DataFrame(results, columns=['gene', 'correlation', 'pval']).set_index('gene')
    return result_df.sort_values('correlation', ascending=False)

In [None]:
# Per-subtype Pearson correlation of every gene with DPT pseudotime.
# Rows whose entries are all missing are dropped from each table.
corr_dfs = {}
for ct in ['Arterial EC', 'Venous EC']:
    is_ct = adata_ts_vessel.obs['Cell Subtype'] == ct
    ct_adata = adata_ts_vessel[is_ct]
    df = correlate_genes_with_pseudotime(
        ct_adata,
        pseudotime='dpt_pseudotime',
        method='pearson',
    )
    corr_dfs[ct] = df.dropna(how='all')

In [None]:
# Display the arterial EC gene/pseudotime correlation table (highest correlation first).
corr_dfs['Arterial EC']

In [None]:
# Display the venous EC gene/pseudotime correlation table (highest correlation first).
corr_dfs['Venous EC']

In [None]:
# Top / bottom `top_n_genes` genes per lineage. The correlation tables are
# sorted in descending order, so the head holds the most positive correlations
# ("large") and the reversed tail the most negative first ("small").
top_n_genes = 50
arterial_corr = corr_dfs['Arterial EC']
venous_corr = corr_dfs['Venous EC']

arterial_large_genes = arterial_corr.head(top_n_genes).index.tolist()
venous_large_genes = venous_corr.head(top_n_genes).index.tolist()

arterial_small_genes = arterial_corr.tail(top_n_genes).index[::-1].tolist()
venous_small_genes = venous_corr.tail(top_n_genes).index[::-1].tolist()

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2,venn3

# Venn diagram: overlap between the arterial and venous gene sets that are
# most positively correlated with pseudotime.
fig, ax = plt.subplots()
venn = venn2([set(arterial_large_genes), set(venous_large_genes)],
             set_labels=('Arterial', 'Venous'),
             set_colors=('#4A90E2', '#E35D6A'),
             alpha=0.7,
             ax=ax)

# Uniform font size; label entries can be None (e.g. an empty region),
# so those are skipped.
for text in [*venn.set_labels, *venn.subset_labels]:
    if text:
        text.set_fontsize(12)

# The title follows `top_n_genes` so it cannot drift from the list size.
ax.set_title(f"Top {top_n_genes} genes positively correlated with pseudotime")
fig.savefig(f'{figures}/ts_venn_diagram_large.png',dpi=300,bbox_inches='tight')
plt.close(fig)

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

# Venn diagram: overlap between the arterial and venous gene sets that are
# most negatively correlated with pseudotime.
fig, ax = plt.subplots()
venn = venn2([set(arterial_small_genes), set(venous_small_genes)],
             set_labels=('Arterial', 'Venous'),
             set_colors=('#4A90E2', '#E35D6A'),
             alpha=0.7,
             ax=ax)

# Uniform font size; label entries can be None (e.g. an empty region),
# so those are skipped.
for text in [*venn.set_labels, *venn.subset_labels]:
    if text:
        text.set_fontsize(12)

# The title follows `top_n_genes` so it cannot drift from the list size.
ax.set_title(f"Top {top_n_genes} genes negatively correlated with pseudotime")
fig.savefig(f'{figures}/ts_venn_diagram_small.png',dpi=300,bbox_inches='tight')
plt.close(fig)

In [None]:
# Shared signatures: genes in the top (large) / bottom (small) correlation
# lists of BOTH the arterial and the venous lineage.
large_genes = [x for x in arterial_large_genes if x in venous_large_genes]
small_genes = [x for x in arterial_small_genes if x in venous_small_genes]
if not large_genes or not small_genes:
    # Fail early with an actionable message instead of inside the scoring call.
    raise ValueError(
        "No genes shared between the arterial and venous lists "
        f"(large: {len(large_genes)}, small: {len(small_genes)}); "
        "increase top_n_genes."
    )

# Vessel size score = large-signature score minus small-signature score.
sc.tl.score_genes(adata_ts_vessel,large_genes,score_name='large_score')
sc.tl.score_genes(adata_ts_vessel,small_genes,score_name='small_score')
adata_ts_vessel.obs['Vessel size score'] = adata_ts_vessel.obs['large_score'] - adata_ts_vessel.obs['small_score']


from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
def normalize_dataframe(df):
    """Return a copy of `df` with every column min-max scaled to [-10, 10].

    The index and column labels of `df` are preserved.
    """
    # Initialize the MinMaxScaler
    scaler = MinMaxScaler(feature_range=(-10, 10))
    # Fit the scaler on the data and transform each column
    df_normalized = pd.DataFrame(scaler.fit_transform(df), index=df.index, columns=df.columns)
    return df_normalized

# Rescale the score with the module-level scaler (MinMaxScaler defaults);
# .ravel() flattens the (n, 1) result so a 1-D array is assigned to the column.
adata_ts_vessel.obs['Vessel size score'] = scaler.fit_transform(adata_ts_vessel.obs[['Vessel size score']]).ravel()
# Four equal-width bins over the rescaled score, labelled from smallest to largest.
adata_ts_vessel.obs['Vessel size category'] = pd.cut(adata_ts_vessel.obs['Vessel size score'], bins=4,labels=['capillary','small','medium','large'])

sc.pl.umap(adata_ts_vessel,color=['Vessel size score'],cmap='Oranges',size=size,frameon=False,save='_ts_vessel_size_score.png')
sc.pl.umap(adata_ts_vessel,color=['Vessel size category'],cmap='viridis',size=size,frameon=False,save='_ts_vessel_size_category.png')
sc.pl.umap(adata_ts_vessel,color=['Cell Subtype'],cmap='viridis',size=size,legend_loc='on data',legend_fontsize=10, legend_fontoutline=1,frameon=False,save='ts_cellsubtype.png')

# One UMAP per gene; file names keep the original `ts_<gene lower-case>.png` pattern.
for gene in ['Mgp', 'Col4a1', 'Col4a2', 'Eln', 'Fbln2']:
    sc.pl.umap(adata_ts_vessel,color=[gene],cmap='viridis',frameon=False,size=size,save=f'ts_{gene.lower()}.png')



In [None]:
# UMAP grid of every gene in the shared large + small signatures.
sc.pl.umap(
    adata_ts_vessel,
    color=large_genes + small_genes,
    cmap='viridis',
    hspace=0.3,
    save='ts_allsize.png',
)

In [None]:

# Palantir gene-trend heatmaps for the combined large + small signature genes.
genes = large_genes + small_genes
heatmap_path = f'{figures}/ts_palantir_heatmap_gene_trends.png'

fig = palantir.plot.plot_gene_trend_heatmaps(
    adata_ts_vessel,
    genes,
    cmap='viridis',
)
fig.tight_layout()
fig.savefig(heatmap_path)
plt.close()

In [None]:
# Arterial branch trajectory drawn on a 2 x 2 figure, cells coloured by DPT pseudotime.
arterial_path_style = dict(color='#4A90E2', lw=2)  # colour / thickness of the path and arrow heads
arterial_arrow_style = {"arrowstyle": "->,head_length=.25,head_width=.25", "lw": 2}
arterial_embedding_style = {"cmap": "viridis", "size": size}  # forwarded to scanpy's embedding plot

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(2, 2))
palantir.plot.plot_trajectory(
    adata_ts_vessel,
    "Arterial EC",                  # branch to plot
    ax=ax,
    cell_color="dpt_pseudotime",    # obs column used to colour the cells
    n_arrows=5,                     # arrow heads along the path
    arrowprops=arterial_arrow_style,
    scanpy_kwargs=arterial_embedding_style,
    **arterial_path_style,
)
fig.tight_layout()

fig.savefig(f'{figures}/ts_palantir_art_trajectory.png')
plt.close()

In [None]:
# Venous branch trajectory drawn on a 2 x 2 figure, cells coloured by DPT pseudotime.
venous_path_style = dict(color='#E35D6A', lw=2)  # colour / thickness of the path and arrow heads
venous_arrow_style = {"arrowstyle": "->,head_length=.25,head_width=.25", "lw": 2}
venous_embedding_style = {"cmap": "viridis", "size": size}  # forwarded to scanpy's embedding plot

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(2, 2))
palantir.plot.plot_trajectory(
    adata_ts_vessel,
    "Venous EC",                    # branch to plot
    ax=ax,
    cell_color="dpt_pseudotime",    # obs column used to colour the cells
    n_arrows=5,                     # arrow heads along the path
    arrowprops=venous_arrow_style,
    scanpy_kwargs=venous_embedding_style,
    **venous_path_style,
)
fig.tight_layout()
fig.savefig(f'{figures}/ts_palantir_ven_trajectory.png')
plt.close()

In [None]:
# Persist the annotated vessel object (gzip-compressed h5ad).
# The `feature_name` column is removed before writing; the membership check
# keeps the cell re-runnable, since a plain `del` raises KeyError once the
# column is already gone.
# NOTE(review): presumably dropped because the var index was derived from this
# column - confirm whether the write fails without it.
if 'feature_name' in adata_ts_vessel.var.columns:
    del adata_ts_vessel.var['feature_name']
adata_ts_vessel.write(f'{figures}/vessel_size.gz.h5ad',compression='gzip')