In [None]:
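'''
Goal: Check human lung endothelial cells for a vessel_size_gradient.
NOTE(review): this header originally named the wang2020 adolescent lung dataset,
but the notebook loads bhattacharya_2024.h5ad - confirm which dataset is intended.
'''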

In [None]:
import scanpy as sc
import scanpy.external as sce
import os 
import pandas as pd 
import numpy as np
import seaborn as sns
from functions import compare_obs_values_within_groups_to_excel
import matplotlib.pyplot as plt
import palantir
# from statannotations.Annotator import Annotator

# --- Notebook configuration -------------------------------------------------
# Label for this analysis; not referenced again in the visible cells.
adata_name = 'venous_ec'
# Output directory for every saved figure (also used for the final .h5ad).
figures = "data/figures/figures/bhattacharya2024"
# Directory of scanpy files; not referenced again in the visible cells.
data = "data/single_cell_files/scanpy_files"

os.makedirs(figures, exist_ok=True)

# Point scanpy's `save=` machinery at the figure directory and use small,
# print-resolution panels.
sc.settings.figdir = figures
sc.set_figure_params(dpi_save=300, fontsize=10, figsize=(1.5, 1.5))

# White seaborn theme, but keep tick marks on the bottom and left axes.
tick_style = {
    'xtick.bottom': True,
    'ytick.left': True,
}
sns.set_style('white', rc=tick_style)
plt.rcParams["font.family"] = "Arial"

# Marker size passed to the UMAP / trajectory plots further down.
size = 15

In [None]:
## Human dataset
# NOTE(review): absolute path on the author's machine; the trailing '/' plus the
# '/' in the f-string below yields a double slash in the final path.
data_fol = '/home/carsten/alvira_bioinformatics/data/external_datasets/'
h5ad_path = f'{data_fol}/bhattacharya_2024.h5ad'
human_adata = sc.read(h5ad_path)

# Index genes by the 'feature_name' column of .var.
# NOTE(review): presumably these are gene symbols and unique - confirm.
human_adata.var_names = human_adata.var['feature_name']

# Drop .raw; every downstream call passes use_raw=False or works on .X.
del human_adata.raw
human_adata

In [None]:
# --- Integrate samples, cluster, and annotate the full dataset ---------------
sc.pp.highly_variable_genes(human_adata, batch_key='Sample')
sc.pp.pca(human_adata)
# adjusted_basis='X_pca' writes Harmony's corrected embedding over the PCA,
# so the default sc.pp.neighbors call below uses the integrated representation.
sce.pp.harmony_integrate(human_adata, 'Sample', adjusted_basis='X_pca')
sc.pp.neighbors(human_adata)
sc.tl.leiden(human_adata)
sc.tl.umap(human_adata)

# The leiden UMAP used to be plotted and saved twice to the same file; once is enough.
sc.pl.umap(human_adata, color='leiden', use_raw=False, save='bhattacharya_leiden.png', show=False)
sc.pl.umap(human_adata, color='cell_type', use_raw=False, save='bhattacharya_old_celltype.png', show=False)

# Manual annotation of the leiden clusters computed above.
# NOTE(review): cluster numbers are only meaningful for the exact clustering
# this mapping was written against - re-check after any change upstream.
leiden_dict = {'0': 'ASM', '1': 'AlF', '2': 'VSM', '3': 'Cap1', '4': 'Per', '5': 'Per', '6': 'Cap2', '7': 'AlM',
               '8': 'AdF', '9': 'Venous EC', '10': 'MyF', '11': 'AEC', '12': 'Nkc',
               '13': 'Arterial EC', '14': 'Tce', '15': 'Bce', '16': 'Lym', '17': 'InM', '18': 'AdF', '19': 'Cil'}

# Fail with an explicit message if the clustering produced a cluster that the
# hand-written mapping does not cover (previously a bare KeyError on one label).
unmapped_clusters = sorted(set(human_adata.obs['leiden']) - set(leiden_dict))
if unmapped_clusters:
    raise KeyError(
        f"leiden clusters without an entry in leiden_dict: {unmapped_clusters}; "
        "the clustering no longer matches the manual annotation."
    )

human_adata.obs['Cell Subtype'] = [leiden_dict[x] for x in human_adata.obs['leiden']]
sc.pl.umap(human_adata, color='Cell Subtype', save='bhattacharya_new_celltype.png', show=False)

In [None]:
# --- Restrict to the endothelial populations of interest and re-embed --------
cts = ['Arterial EC','Cap1','Venous EC']
# .copy() turns the subset into a real AnnData instead of a view, so the
# in-place writes below (uns, HVG, PCA, ...) do not rely on anndata's implicit
# copy-on-modify of a view.
human_adata = human_adata[human_adata.obs['Cell Subtype'].isin(cts)].copy()
# One color per entry of `cts`, in that order.
# NOTE(review): scanpy pairs colors with the column's category order - confirm
# it matches `cts`.
human_adata.uns['Cell Subtype_colors']= ['#4A90E2','#9B59B6','#E35D6A']

sc.pp.highly_variable_genes(human_adata)
sc.pp.pca(human_adata, mask_var="highly_variable")
sc.pp.neighbors(human_adata, use_rep="X_pca")
# Overwrites the 'leiden' column computed on the full dataset.
sc.tl.leiden(human_adata, resolution=0.5)
sc.tl.rank_genes_groups(human_adata,'Cell Subtype',method='wilcoxon',use_raw=False)
sc.tl.umap(human_adata, min_dist=1.5)
sc.pl.rank_genes_groups_dotplot(human_adata,n_genes=10,use_raw=False)

# Visual QC: metadata columns followed by individual marker genes.
for color in ['Cell Subtype','Sample','leiden','PTPRC','COL1A1','PROX1','APLN','APLNR','GJA5','ACKR1','PLVAP','COL15A1','TBX2','KIT','FBLN5','SCN7A','MECOM','MGP','VWF']:
    sc.pl.umap(human_adata,color=color,use_raw=False)

In [None]:
import warnings

import palantir
import cellrank as cr
import scvelo as scv

root_ct = 'Cap1'
terminal_cts = ['Arterial EC','Venous EC']
celltype='Cell Subtype'


def _extreme_umap_cell(adata, obs_key, group, component, use_max=False):
    """Return the obs_name of the cell in `group` with the extreme UMAP coordinate.

    Parameters:
    - adata: AnnData with a UMAP in `adata.obsm['X_umap']`
    - obs_key: `adata.obs` column holding the group labels
    - group: label whose cells are considered
    - component: column of `X_umap` to inspect (0 or 1)
    - use_max: pick the largest coordinate instead of the smallest

    Returns:
    - the obs_name of the selected cell
    """
    subset = adata[adata.obs[obs_key] == group]
    coords = subset.obsm['X_umap'][:, component]
    idx = np.argmax(coords) if use_max else np.argmin(coords)
    return subset.obs_names[idx]


# --- Palantir preprocessing ---------------------------------------------------
palantir.utils.run_diffusion_maps(human_adata,
                                           n_components=5)
fig = palantir.plot.plot_diffusion_components(human_adata)[0]
fig.tight_layout()
fig.savefig(f'{figures}/bhattacharyapalantir_diffusion_components.png')
plt.close()
palantir.utils.determine_multiscale_space(human_adata)

palantir.utils.run_magic_imputation(human_adata)

# --- Root and terminal cells, chosen from extremes of the UMAP -----------------
# Root: the Cap1 cell with the largest first UMAP coordinate.
root_cell = _extreme_umap_cell(human_adata, celltype, root_ct, component=0, use_max=True)

# Terminal cells: the minimum of the *second* UMAP coordinate for the arterial
# branch and of the first coordinate for every other branch. These choices are
# tied to the layout of this particular embedding.
terminal_component = {'Arterial EC': 1}
terminal_cells = [
    _extreme_umap_cell(human_adata, celltype, ct, component=terminal_component.get(ct, 0))
    for ct in terminal_cts
]
terminal_states = pd.Series(index=terminal_cells, data=terminal_cts, dtype='object')

# Build an explicit {cell: label} mapping for the highlight plot. The previous
# `[root_cell] + terminal_states` added a list to a Series element-wise instead
# of concatenating, so the root cell was never part of the highlighted set.
highlighted_cells = {root_cell: f'{root_ct} (root)', **terminal_states.to_dict()}
fig = palantir.plot.highlight_cells_on_umap(human_adata, highlighted_cells)[0]
fig.tight_layout()
fig.savefig(f'{figures}/bhattacharyapalantir_terminal_cells.png')
plt.close()

# --- Palantir pseudotime and fate probabilities --------------------------------
palantir.core.run_palantir(
    human_adata, root_cell, num_waypoints=500, terminal_states=terminal_states
)

fig = palantir.plot.plot_palantir_results(human_adata, s=3)
fig.tight_layout()
fig.savefig(f'{figures}/bhattacharyapalantir_results.png')
plt.close()

# --- Diffusion pseudotime (scanpy) from the same root cell ---------------------
iroot = human_adata.obs.index.get_loc(root_cell)
human_adata.uns["iroot"] = iroot
# Compute the scanpy diffusion map before DPT so that sc.tl.dpt does not have to
# fall back to computing it implicitly (same default parameters either way).
sc.tl.diffmap(human_adata)
sc.tl.dpt(human_adata)

# --- Branch selection -----------------------------------------------------------
# Kept best-effort, but no longer silent: a later cell indexes
# human_adata.obsm['branch_masks'], so a failure here must be visible.
try:
    palantir.presults.select_branch_cells(human_adata, q=.01, eps=.01,pseudo_time_key='dpt_pseudotime')

    fig = palantir.plot.plot_branch_selection(human_adata)
    fig.tight_layout()
    fig.savefig(f'{figures}/bhattacharyapalantir_branch_selection.png')
    plt.close()

except Exception as err:
    warnings.warn(
        f"Palantir branch selection/plotting failed ({err!r}); "
        "steps that need obsm['branch_masks'] may fail."
    )

# Diffusion components 2 vs 3, colored by cell subtype and by the root cell index.
scv.pl.scatter(
    human_adata,
    basis="diffmap",
    c=[celltype, iroot],
    legend_loc="right",
    components=["2, 3"],
    show=False,
    save=f'bhattacharyadiffmap_{celltype}_root_cell.png'
)


# Side-by-side comparison of the two pseudotime estimates on the UMAP.
sc.pl.embedding(
    human_adata,
    basis="umap",
    color=["dpt_pseudotime", "palantir_pseudotime"],
    color_map="viridis",
    show=False,
    save='_bhattacharya_pseudotimes.png'
)

# NOTE(review): gene trends and branch selection use DPT pseudotime, while the
# CellRank kernel below uses Palantir pseudotime - confirm this mix is intended.
palantir.presults.compute_gene_trends(
    human_adata,
    expression_key="MAGIC_imputed_data",
    pseudo_time_key='dpt_pseudotime'
)

# --- CellRank stream plot driven by Palantir pseudotime ------------------------
pk = cr.kernels.PseudotimeKernel(human_adata, time_key="palantir_pseudotime")
pk.compute_transition_matrix()
pk.plot_projection(basis="umap", color=celltype, recompute=True,legend_loc='right margin',
                         save=f'{figures}/bhattacharyapalantir_pseudotime_stream.png')


In [None]:
import pandas as pd
from scipy.stats import spearmanr

def correlate_genes_with_pseudotime(adata, layer=None, method='spearman',pseudotime='dpt_pseudotime'):
    """
    Correlate every gene's expression with a pseudotime column of an AnnData object.

    Parameters:
    - adata: AnnData object; pseudotime is read from `adata.obs[pseudotime]`
    - layer: (Optional) key in `adata.layers` to use instead of `adata.X`
    - method: Correlation method, either 'spearman' (default) or 'pearson'
    - pseudotime: name of the `adata.obs` column that holds the pseudotime values

    Returns:
    - pandas DataFrame with genes as index and columns ['correlation', 'pval'],
      sorted by descending correlation. 'pval' is None for every gene when
      method='pearson' (no p-value is computed for Pearson).

    Raises:
    - ValueError if `pseudotime` is not a column of `adata.obs`, or if `method`
      is neither 'spearman' nor 'pearson'.
    """
    if pseudotime not in adata.obs:
        # Name the key that was actually requested, not a hard-coded 'pseudotime'.
        raise ValueError(f"Pseudotime must be stored in adata.obs['{pseudotime}'].")
    # Validate once up front instead of on the first loop iteration.
    if method not in ('spearman', 'pearson'):
        raise ValueError("Method must be 'spearman' or 'pearson'.")

    # Get expression matrix as a dense cells x genes DataFrame
    X = adata.X if layer is None else adata.layers[layer]
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X.toarray() if hasattr(X, "toarray") else X,
                         index=adata.obs_names, columns=adata.var_names)

    # Extract pseudotime under its own name so the `pseudotime` key is not shadowed
    pseudotime_values = adata.obs[pseudotime]

    # Run correlation gene by gene
    results = []
    for gene, expression in X.items():
        if method == 'spearman':
            corr, pval = spearmanr(expression, pseudotime_values)
        else:
            corr, pval = expression.corr(pseudotime_values), None  # Pearson p-value not computed here
        results.append((gene, corr, pval))

    result_df = pd.DataFrame(results, columns=['gene', 'correlation', 'pval']).set_index('gene')
    return result_df.sort_values('correlation', ascending=False)

In [None]:
# Per-branch Pearson correlation of every gene with Palantir pseudotime,
# restricted to the cells selected for that branch in obsm['branch_masks'].
corr_dfs = {}
for branch in ['Arterial EC','Venous EC']:
    branch_mask = human_adata.obsm['branch_masks'][branch]
    branch_corr = correlate_genes_with_pseudotime(
        human_adata[branch_mask],
        method='pearson',
        pseudotime='palantir_pseudotime',
    )
    # Discard genes for which every result column is missing.
    corr_dfs[branch] = branch_corr.dropna(how='all')

In [None]:
# Number of genes taken from each end of the correlation-sorted tables.
top_n_genes = 50

arterial_corr = corr_dfs['Arterial EC']
venous_corr = corr_dfs['Venous EC']

# "large": the genes at the head of each table (highest correlation first).
arterial_large_genes = arterial_corr.head(top_n_genes).index.tolist()
venous_large_genes = venous_corr.head(top_n_genes).index.tolist()

# "small": the genes at the tail of each table, reversed so the lowest
# correlation comes first.
arterial_small_genes = list(reversed(arterial_corr.tail(top_n_genes).index.tolist()))
venous_small_genes = list(reversed(venous_corr.tail(top_n_genes).index.tolist()))

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2,venn3

# Overlap between the arterial and venous gene lists that are most positively
# correlated with pseudotime.
# Draw on an explicit figure/axes instead of relying on pyplot's current-axes state.
fig, ax = plt.subplots()
venn = venn2([set(arterial_large_genes), set(venous_large_genes)],
             set_labels=('Arterial', 'Venous'),
             set_colors=('#4A90E2', '#E35D6A'),
             alpha=0.7,
             ax=ax)

# Customize font size; subset labels can be None for empty regions
for text in venn.set_labels:
    text.set_fontsize(12)
for text in venn.subset_labels:
    if text:
        text.set_fontsize(12)

# The title follows top_n_genes rather than a hard-coded "50"
ax.set_title(f"Top {top_n_genes} genes positively correlated with pseudotime")
fig.savefig(f'{figures}/bhattacharyavenn_diagram_large.png',dpi=300,bbox_inches='tight')
plt.close(fig)

In [None]:

# Overlap between the arterial and venous gene lists that are most negatively
# correlated with pseudotime.
# Draw on an explicit figure/axes instead of relying on pyplot's current-axes state.
fig, ax = plt.subplots()
venn = venn2([set(arterial_small_genes), set(venous_small_genes)],
             set_labels=('Arterial', 'Venous'),
             set_colors=('#4A90E2', '#E35D6A'),
             alpha=0.7,
             ax=ax)

# Customize font size; subset labels can be None for empty regions
for text in venn.set_labels:
    text.set_fontsize(12)
for text in venn.subset_labels:
    if text:
        text.set_fontsize(12)

# The title follows top_n_genes rather than a hard-coded "50"
ax.set_title(f"Top {top_n_genes} genes negatively correlated with pseudotime")
fig.savefig(f'{figures}/bhattacharyavenn_diagram_small.png',dpi=300,bbox_inches='tight')
plt.close(fig)

In [None]:
# --- Vessel size score -----------------------------------------------------------
# Genes shared by both branches at the positively ("large") and negatively
# ("small") pseudotime-correlated ends; order follows the arterial lists.
venous_large_set = set(venous_large_genes)
venous_small_set = set(venous_small_genes)
large_genes = [x for x in arterial_large_genes if x in venous_large_set]
small_genes = [x for x in arterial_small_genes if x in venous_small_set]

# An empty overlap would otherwise only surface as an error inside score_genes.
for label, genes in (('large', large_genes), ('small', small_genes)):
    if not genes:
        raise ValueError(
            f"No genes are shared between the arterial and venous '{label}' lists; "
            "cannot compute the vessel size score."
        )

sc.tl.score_genes(human_adata,large_genes,score_name='large_score')
sc.tl.score_genes(human_adata,small_genes,score_name='small_score')
human_adata.obs['Vessel size score'] = human_adata.obs['large_score'] - human_adata.obs['small_score']


from sklearn.preprocessing import MinMaxScaler
# Default MinMaxScaler: rescales the score to the [0, 1] range.
scaler = MinMaxScaler()
def normalize_dataframe(df):
    """Return a copy of `df` with every column min-max scaled to [-10, 10].

    Not called in this cell; kept for any other code that uses it. It builds
    its own scaler so it no longer shadows the module-level `scaler`.
    """
    column_scaler = MinMaxScaler(feature_range=(-10, 10))
    # Fit the scaler on the data and transform each column
    df_normalized = pd.DataFrame(column_scaler.fit_transform(df), index=df.index, columns=df.columns)
    return df_normalized

# fit_transform returns an (n_cells, 1) array; flatten it before storing it as a column.
human_adata.obs['Vessel size score'] = scaler.fit_transform(human_adata.obs[['Vessel size score']]).ravel()
# Four equal-width bins over the scaled score, labelled from smallest to largest.
human_adata.obs['Vessel size category'] = pd.cut(human_adata.obs['Vessel size score'], bins=4,labels=['capillary','small','medium','large'])

sc.pl.umap(human_adata,color=['Vessel size score'],cmap='Oranges',size=size,frameon=False,save='_bhattacharyavessel_size_score.png')
sc.pl.umap(human_adata,color=['Vessel size category'],cmap='viridis',size=size,frameon=False,save='_bhattacharyavessel_size_category.png')
sc.pl.umap(human_adata,color=['Cell Subtype'],cmap='viridis',size=size,legend_loc='on data',legend_fontsize=10, legend_fontoutline=1,frameon=False,save='bhattacharyacellsubtype.png')
# Marker genes; file names are 'bhattacharya<gene in lower case>.png' as before.
for gene in ['MGP', 'COL4A1', 'COL4A2', 'ELN']:
    sc.pl.umap(human_adata,color=[gene],cmap='viridis',size=size,frameon=False,save=f'bhattacharya{gene.lower()}.png')


In [None]:
# Arterial branch: Palantir trajectory drawn on a small dedicated axes.
fig, ax = plt.subplots(1, 1, figsize=(2, 2))
palantir.plot.plot_trajectory(
    human_adata,                    # the AnnData to plot
    "Arterial EC",                  # the branch to plot
    ax=ax,
    cell_color="dpt_pseudotime",    # the .obs column used to color the cells
    color='#4A90E2',                # color of the path and arrow heads
    lw=2,                           # thickness of the path
    n_arrows=5,                     # number of arrow heads along the path
    arrowprops={"arrowstyle": "->,head_length=.25,head_width=.25", "lw": 2},  # arrow head appearance
    scanpy_kwargs={"cmap": "viridis", "size": size},  # forwarded to scanpy.pl.embedding
    # pseudotime_interval=(0, .9),  # interval of the pseudotime to cover with the path
)
fig.tight_layout()

fig.savefig(f'{figures}/bhattacharyapalantir_art_trajectory.png')
plt.close()

In [None]:
# Venous branch: Palantir trajectory drawn on a small dedicated axes.
fig, ax = plt.subplots(1, 1, figsize=(2, 2))
palantir.plot.plot_trajectory(
    human_adata,                    # the AnnData to plot
    "Venous EC",                    # the branch to plot
    ax=ax,
    cell_color="dpt_pseudotime",    # the .obs column used to color the cells
    color='#E35D6A',                # color of the path and arrow heads
    lw=2,                           # thickness of the path
    n_arrows=5,                     # number of arrow heads along the path
    arrowprops={"arrowstyle": "->,head_length=.25,head_width=.25", "lw": 2},  # arrow head appearance
    scanpy_kwargs={"cmap": "viridis", "size": size},  # forwarded to scanpy.pl.embedding
    # pseudotime_interval=(0, .9),  # interval of the pseudotime to cover with the path
)
fig.tight_layout()
fig.savefig(f'{figures}/bhattacharyapalantir_ven_trajectory.png')
plt.close()

In [None]:
# Persist the annotated endothelial subset, gzip-compressed.
# NOTE(review): this lands in the figures directory rather than `data` - confirm that is intended.
vessel_size_h5ad = f'{figures}/vessel_size.gz.h5ad'
human_adata.write(vessel_size_h5ad, compression='gzip')