### Goal: Email request from Cristina on various endothelial cell questions
#### Carsten did a great job creating lists of differentially accessible chromatin regions, but is there any utility in also identifying shared regions of accessibility between certain cell types? For example, the veins and the Cap1 seem similar in some ways (including their expression of Peg3, Aplnr, etc.). Would it be meaningful to assess which accessible regions they share that are not shared by other EC?
#### Can we also compare all Peg3 expressing cells versus non-Peg3 expressing cells?
#### Can we look at the proliferating venous EC versus the non-venous proliferating EC?


In [None]:
import numpy as np
import pandas as pd
import os
import scanpy as sc
import scanpy.external as sce
import sys
import muon as mu
import muon.atac as ac
import matplotlib.pyplot as plt
import seaborn as sns
# Root of the project data tree. Every path below is derived from it, so the
# notebook can be pointed at another checkout by editing this single line.
data_root = '/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data'
figures = f'{data_root}/pilot/230720_cristina_endo_questions'  # output directory for this notebook's figures
sc_file = f'{data_root}/single_cell_files/share'  # directory holding the processed .h5ad inputs
atac_dir = f'{data_root}/figures/atac'  # ATAC figure/result directory
rna_dir = f'{data_root}/figures/rna'  # RNA figure/result directory
os.makedirs(figures, exist_ok=True)
# Save scanpy figures as 300-dpi PNGs into `figures`.
sc.set_figure_params(dpi=300, format="png")
sc.settings.figdir = figures


#### Load in multiomic data

In [None]:
# Read the three processed P7 multiome objects (RNA, ATAC, TF) from `sc_file`.
rna_path = f'{sc_file}/p7_multiome_rna_processed.gz.h5ad'
atac_path = f'{sc_file}/p7_multiome_atac_processed.gz.h5ad'
tf_path = f'{sc_file}/p7_multiome_tf_processed.gz.h5ad'
rna, atac, tf = sc.read(rna_path), sc.read(atac_path), sc.read(tf_path)

In [None]:
# Rebuild `atac.X` from the stored 'counts' layer, then scale every cell to a
# total of 1e6 and log10(1 + x)-transform. Starting from the counts layer keeps
# this cell safe to re-run.
counts_per_cell_target = 1e6
log_base = 10
raw_counts = atac.layers['counts']
atac.X = raw_counts.copy()
sc.pp.normalize_total(atac, target_sum=counts_per_cell_target)
sc.pp.log1p(atac, base=log_base)

In [None]:
# Display the per-feature annotation table (`.var`) of the `tf` object.
tf.var

### The veins and the Cap1 seem similar in some ways (including their expression of Peg3, Aplnr, etc.). Would it be meaningful to assess which accessible regions they share that are not shared by other EC?

#### Compare the regions that are DAPs in every comparison for venous and Cap1 vs rest of endos

In [None]:
### Look through each cell-cell comparison and make sure it is above a certain level
score_threshold = 1        # a feature passes one pairwise comparison when its score is beyond +/- this value
number_cts_threshold = 3   # ...and it must pass against at least this many other cell types

mod_dict = {'deg': rna,
            'datf': tf,
            'dap': atac}
# DAP and DATF comparison tables are read from under the ATAC directory, DEGs from under the RNA one.
results_root = {'deg': rna_dir,
                'datf': atac_dir,
                'dap': atac_dir}
peak_annotation_cols = ["annotated_gene", "peak_type", "distance", "tfs"]

# Endothelial cell types used as comparison partners. This is computed once up
# front rather than on every loop iteration. Proliferating EC are left out.
endo_celltypes = [
    ct2
    for ct2 in rna[rna.obs['lineage'] == 'endothelial'].obs['celltype'].cat.categories
    if ct2 != 'Proliferating EC'
]


def _tally_passing_comparisons(comps, ct, other_cts, direction, threshold):
    """Record, per feature, which pairwise comparisons of `ct` it passes.

    Parameters
    ----------
    comps : dict of DataFrame
        Sheets keyed '<ct> v <ct2>', each indexed by feature with a 'scores' column.
    ct : str
        The reference cell type.
    other_cts : iterable of str
        Candidate comparison partners; `ct` itself is skipped.
    direction : {'up', 'down'}
        'up' keeps scores > threshold, anything else keeps scores < -threshold.
    threshold : float
        Score cut-off.

    Returns
    -------
    dict
        {'celltypes': {feature: [ct2, ...]}, 'number_cts': {feature: count}}.

    Raises
    ------
    KeyError
        If a '<ct> v <ct2>' sheet is missing from `comps`.
    """
    celltypes = {}
    for ct2 in other_cts:
        if ct2 == ct:
            continue
        df = comps[f'{ct} v {ct2}']
        if direction == 'up':
            passing = df.index[df['scores'] > threshold]
        else:
            passing = df.index[df['scores'] < -threshold]
        for feature in passing:
            celltypes.setdefault(feature, []).append(ct2)
    number_cts = {feature: len(partners) for feature, partners in celltypes.items()}
    return {'celltypes': celltypes, 'number_cts': number_cts}


def _recurrent_features(number_cts, min_cts):
    """Return the features passing in at least `min_cts` comparisons, most frequent first."""
    # Explicit dtype keeps an empty tally well-defined.
    counts = pd.Series(number_cts, dtype='int64').sort_values(ascending=False)
    return counts.loc[counts >= min_cts]


common_feature_dict = {}
for mod, adata in mod_dict.items():
    cell_type_comparisons = f'{results_root[mod]}/{mod}/endothelial/cell_type_comparisons/normoxia'

    shared_gene_dict = {}
    for ct in ['Cap1', 'Venous EC']:
        # sheet_name=None loads every sheet of the workbook into a dict.
        comps = pd.read_excel(f'{cell_type_comparisons}/{ct}.xlsx',
                              index_col=0,
                              header=0,
                              sheet_name=None)
        shared_gene_dict[ct] = {
            direction: _tally_passing_comparisons(comps, ct, endo_celltypes,
                                                  direction, score_threshold)
            for direction in ['up', 'down']
        }

    cap1_up = _recurrent_features(shared_gene_dict['Cap1']['up']['number_cts'], number_cts_threshold)
    cap1_down = _recurrent_features(shared_gene_dict['Cap1']['down']['number_cts'], number_cts_threshold)
    vec_up = _recurrent_features(shared_gene_dict['Venous EC']['up']['number_cts'], number_cts_threshold)
    vec_down = _recurrent_features(shared_gene_dict['Venous EC']['down']['number_cts'], number_cts_threshold)

    # Features recurrent in both Cap1 and Venous EC, kept in Cap1 order.
    shared_up = [x for x in cap1_up.index if x in vec_up.index]
    shared_down = [x for x in cap1_down.index if x in vec_down.index]
    if mod == 'dap':
        # For peaks, store the annotation rows rather than bare peak names.
        up = atac.var[peak_annotation_cols].loc[shared_up]
        down = atac.var[peak_annotation_cols].loc[shared_down]
    else:
        up = shared_up
        down = shared_down
    plot_features = shared_up[:5] + shared_down[:5]

    common_feature_dict[mod] = {'up': up,
                                'down': down}
    adata_norm = adata[(adata.obs['treatment'] == 'Normoxia') &
                       (adata.obs['lineage'] == 'endothelial')]
    if plot_features:
        sc.pl.dotplot(adata_norm,
                      plot_features,
                      standard_scale='var',
                      groupby='celltype',
                      title=f'Shared {mod}s by Cap1 and VEC')
    else:
        # With nothing to plot, report it instead of calling dotplot on an empty list.
        print(f'No shared {mod}s pass the thresholds; skipping dotplot.')


In [None]:
# Add QC metrics to `tf` in place, then plot `total_counts` for the
# endothelial lineage, grouped by abbreviated cell type.
sc.pp.calculate_qc_metrics(tf, inplace=True)
tf_endo = tf[tf.obs['lineage'] == 'endothelial']
sc.pl.violin(tf_endo, keys='total_counts', groupby='celltype_abv')

In [None]:
# Display the annotation rows stored under ['dap']['up'] by the shared-feature loop.
common_feature_dict['dap']['up']

In [None]:

# Dotplot of the first ten shared "up" peaks across endothelial cell types.
atac_endo = atac[atac.obs['lineage'] == 'endothelial']
top_shared_up_peaks = common_feature_dict['dap']['up'].head(10).index.tolist()
sc.pl.dotplot(atac_endo, top_shared_up_peaks, groupby='celltype')

In [None]:
# Number of entries stored under ['deg']['down'] by the shared-feature loop.
len(common_feature_dict['deg']['down'])

In [None]:
# Shared "up" peaks whose `annotated_gene` contains 'Peg3'.
shared_up_peaks = common_feature_dict['dap']['up']
peg3_in_shared_up = shared_up_peaks['annotated_gene'].str.contains('Peg3')
shared_up_peaks.loc[peg3_in_shared_up]

In [None]:
# Display the annotation rows stored under ['dap']['down'] by the shared-feature loop.
common_feature_dict['dap']['down']

In [None]:
# All ATAC peaks whose `annotated_gene` contains 'Peg3'.
peg3_annotated_mask = atac.var['annotated_gene'].str.contains('Peg3')
atac.var.loc[peg3_annotated_mask]

#### Can we also compare all Peg3 expressing cells versus non-Peg3 expressing cells?

In [None]:
# Restrict both modalities to Normoxia, then take the Cap1 and Venous EC
# subsets of each. All six objects are AnnData views, not copies.
rna_is_normoxia = rna.obs['treatment'] == 'Normoxia'
atac_is_normoxia = atac.obs['treatment'] == 'Normoxia'
rna_norm = rna[rna_is_normoxia]
atac_norm = atac[atac_is_normoxia]
rna_cap1, rna_vec = (
    rna_norm[rna_norm.obs['celltype'] == label] for label in ('Cap1', 'Venous EC')
)
atac_cap1, atac_vec = (
    atac_norm[atac_norm.obs['celltype'] == label] for label in ('Cap1', 'Venous EC')
)

In [None]:
# Peg3 across normoxic endothelial cell types, as a violin plot and a dotplot.
rna_norm_endo = rna_norm[rna_norm.obs['lineage'] == 'endothelial']
sc.pl.violin(rna_norm_endo, 'Peg3', groupby='celltype_abv')
sc.pl.dotplot(rna_norm_endo, 'Peg3', groupby='celltype_abv')

In [None]:
# Cells whose Peg3 value (as returned by sc.get.obs_df) exceeds this are labelled 'Peg3hi'.
peg3_threshold = 3

# The Cap1 / Venous EC subsets are AnnData views. Take real copies once, so
# writing to `.obs` / `.uns` below does not depend on anndata converting the
# view implicitly (with a warning). Re-running the cell is a no-op here.
rna_cap1, rna_vec, atac_cap1, atac_vec = (
    subset.copy() if subset.is_view else subset
    for subset in (rna_cap1, rna_vec, atac_cap1, atac_vec)
)

for adata in [rna_cap1, rna_vec]:
    peg3 = sc.get.obs_df(adata, ['Peg3'])
    # Assign the labels in one step. The previous
    # `adata.obs[col].replace(..., inplace=True)` was a chained in-place edit
    # that leaves the column as booleans under pandas Copy-on-Write.
    adata.obs['Peg3_high'] = np.where(peg3['Peg3'].gt(peg3_threshold), 'Peg3hi', 'Peg3-')

# Carry the RNA-derived label over to the ATAC cells by barcode. ATAC cells
# with no matching RNA barcode get NaN.
atac_cap1.obs['Peg3_high'] = rna_cap1.obs['Peg3_high'].reindex(atac_cap1.obs_names)
atac_vec.obs['Peg3_high'] = rna_vec.obs['Peg3_high'].reindex(atac_vec.obs_names)

# Wilcoxon test of peak accessibility between the Peg3 groups, per cell type.
for adata in [atac_cap1, atac_vec]:
    sc.tl.rank_genes_groups(
        adata,
        "Peg3_high",
        method="wilcoxon",
        pts=True,
        key_added="rank_peaks_groups_Peg3_high",
    )

In [None]:
# Pull the 'Peg3hi' ranking for each cell type, index both by peak name, and
# place the two score columns side by side on the Venous EC peak order.
peg3_rank_key = "rank_peaks_groups_Peg3_high"
cap1_df = sc.get.rank_genes_groups_df(atac_cap1, group='Peg3hi', key=peg3_rank_key)
cap1_df = cap1_df.set_index('names', drop=False)
vec_df = sc.get.rank_genes_groups_df(atac_vec, group='Peg3hi', key=peg3_rank_key)
vec_df = vec_df.set_index('names', drop=False)
score_df = pd.DataFrame(
    {'Cap1': cap1_df['scores'], 'VEC': vec_df['scores']},
    index=vec_df.index,
)

In [None]:
# Names of the ATAC peaks whose `annotated_gene` contains 'Peg3'.
peg3_peak_rows = atac.var.loc[atac.var['annotated_gene'].str.contains('Peg3')]
peg3_peak_rows.index

In [None]:
# Peg3hi-vs-rest scores (Cap1 and VEC) for the peaks annotated with 'Peg3'.
peg3_annotated = atac.var['annotated_gene'].str.contains('Peg3')
peg3_peak_names = atac.var.loc[peg3_annotated].index
score_df.loc[peg3_peak_names]

In [None]:
# Peaks scoring beyond +/-1 in both Cap1 and VEC, ordered by the Cap1 score.
cap1_scores, vec_scores = score_df['Cap1'], score_df['VEC']
both_above = (cap1_scores > 1) & (vec_scores > 1)
both_below = (cap1_scores < -1) & (vec_scores < -1)
up = score_df.loc[both_above].sort_values(by='Cap1', ascending=False)
down = score_df.loc[both_below].sort_values(by='Cap1', ascending=True)

In [None]:
# Annotation columns for the peaks in `up`.
annotation_columns = ["annotated_gene", "peak_type", "distance", "tfs"]
atac.var[annotation_columns].loc[up.index]

In [None]:
# Annotation columns for the peaks in `down`.
annotation_columns = ["annotated_gene", "peak_type", "distance", "tfs"]
atac.var[annotation_columns].loc[down.index]

In [None]:
# Per-peak Peg3hi score in Cap1 (y) against Venous EC (x); marker outlines removed.
sns.scatterplot(x='VEC', y='Cap1', data=score_df, linewidth=0)

#### Can we look at the proliferating venous EC versus the non-venous proliferating EC?


In [None]:
# Sub-cluster the normoxic Proliferating EC: per-mouse HVGs -> PCA -> Harmony
# over `mouse` -> neighbours -> Leiden -> UMAP, then plot a marker panel.
is_prolif_ec = rna_norm.obs['celltype'] == 'Proliferating EC'
prolif_ec = rna_norm[is_prolif_ec].copy()  # real copy, because the steps below write into it

sc.pp.highly_variable_genes(prolif_ec, n_top_genes=2000, batch_key="mouse")
sc.pp.pca(prolif_ec, use_highly_variable=True)
sce.pp.harmony_integrate(prolif_ec, key='mouse', max_iter_harmony=20)
sc.pp.neighbors(prolif_ec, use_rep='X_pca_harmony')
sc.tl.leiden(prolif_ec, key_added="leiden_prolif_ec")
sc.tl.umap(prolif_ec, min_dist=0.5)

umap_genes = ['Kit', 'Car4', 'Gja5', 'Car8', 'Slc6a2', 'Ccl21a']
umap_obs_keys = ['leiden_prolif_ec', 'mouse']
sc.pl.umap(prolif_ec, color=umap_genes + umap_obs_keys)

In [None]:
# Display the summary repr of the `rna` object.
rna

In [None]:
# Cells per mouse within each Leiden sub-cluster.
mouse_by_cluster = prolif_ec.obs.groupby('leiden_prolif_ec')['mouse']
mouse_by_cluster.value_counts()

In [None]:
# Wilcoxon marker genes per Leiden sub-cluster, shown as a dotplot with
# roughly 50 genes in total, followed by a Car8 / Slc6a2 dotplot.
sc.tl.rank_genes_groups(prolif_ec, "leiden_prolif_ec", method="wilcoxon")
n_clusters = len(prolif_ec.obs["leiden_prolif_ec"].unique())
genes_per_cluster = 50 // n_clusters
sc.pl.rank_genes_groups_dotplot(
    prolif_ec,
    groupby="leiden_prolif_ec",
    dendrogram=False,
    n_genes=genes_per_cluster,
)
sc.pl.dotplot(prolif_ec, ['Car8', 'Slc6a2'], groupby='leiden_prolif_ec')

In [None]:
# Leiden sub-cluster of the proliferating EC that gets relabelled as 'PVEC'.
# NOTE(review): Leiden labels can change when the clustering is re-run.
# Confirm '5' is still the intended cluster before trusting the results below.
pvec_cluster = '5'
pvecs = prolif_ec[prolif_ec.obs['leiden_prolif_ec'] == pvec_cluster].obs_names.tolist()
if not pvecs:
    raise ValueError(f"No cells found in leiden_prolif_ec cluster {pvec_cluster!r}")
pvec_barcodes = set(pvecs)  # set lookup instead of scanning the list once per cell


def _with_pvec_label(adata_obj):
    """Return one label per cell: 'PVEC' for PVEC barcodes, else that cell's own `celltype`."""
    return ['PVEC' if barcode in pvec_barcodes else celltype
            for barcode, celltype in zip(adata_obj.obs_names, adata_obj.obs['celltype'])]


atac.obs['celltype_pvec'] = _with_pvec_label(atac)
# Label `tf` from its own barcodes and cell types. This was previously built
# from `atac`, which is only correct if both objects hold identical cells in
# identical order.
tf.obs['celltype_pvec'] = _with_pvec_label(tf)

# Re-derive the normoxia subset so it carries the new `celltype_pvec` column.
atac_norm = atac[atac.obs['treatment'] == 'Normoxia']
peak_annotation = atac.var[['annotated_gene', 'peak_type', 'distance', 'tfs']]
dap_dict = {}
for ct2 in ['Proliferating EC', 'Venous EC']:
    # Real copy: rank_genes_groups stores its result on the object.
    cts_adata = atac_norm[atac_norm.obs["celltype_pvec"].isin(['PVEC', ct2])].copy()
    sc.tl.rank_genes_groups(
        cts_adata,
        "celltype_pvec",
        groups=['PVEC', ct2],
        method="wilcoxon",
        pts=True,
        key_added="rank_peaks_groups_celltype",
    )
    df = sc.get.rank_genes_groups_df(
        cts_adata, key="rank_peaks_groups_celltype", group='PVEC'
    )
    df.index = df['names']
    # Annotate from `atac.var` directly. This used to read `adata.var`, where
    # `adata` was a loop variable left over from an earlier cell.
    df[['gene', 'peak_type', 'distance', 'tfs']] = peak_annotation
    dap_dict[ct2] = df

# PVEC scores against each reference group, aligned on the Venous EC peak order.
score_df2 = pd.DataFrame(index=dap_dict['Venous EC']['names'],
                         columns=['PEC', 'VEC'],
                         data=None)
score_df2['PEC'] = dap_dict['Proliferating EC']['scores']
score_df2['VEC'] = dap_dict['Venous EC']['scores']
sns.scatterplot(data=score_df2,
                x='VEC',
                y='PEC',
                linewidth=0
                )

In [None]:
# Peaks scoring above 2 for PVEC against both reference groups, highest PEC score first.
pvec_up_in_both = (score_df2['PEC'] > 2) & (score_df2['VEC'] > 2)
up = score_df2.loc[pvec_up_in_both].sort_values(by='PEC', ascending=False)
up

In [None]:
# Annotation columns for the peaks in `up`.
pvec_annotation_columns = ["annotated_gene", "peak_type", "distance", "tfs"]
atac.var[pvec_annotation_columns].loc[up.index]

In [None]:
# Peaks scoring below -2 for PVEC against both reference groups, lowest PEC score first.
pvec_down_in_both = (score_df2['PEC'] < -2) & (score_df2['VEC'] < -2)
down = score_df2.loc[pvec_down_in_both].sort_values(by='PEC', ascending=True)
down

In [None]:
# Annotation columns for the peaks in `down`.
pvec_annotation_columns = ["annotated_gene", "peak_type", "distance", "tfs"]
atac.var[pvec_annotation_columns].loc[down.index]