### Goal: Email request from Cristina on various endothelial cell questions
#### Carsten did a great job creating lists of differentially accessible chromatin regions, but is there any utility in also identifying shared regions of accessibility between certain cell types? For example, the veins and the Cap1 seem similar in some ways (including their expression of Peg3, Aplnr, etc.). Would it be meaningful to assess what accessibility they share that is not shared by other EC?
#### Can we also compare all Peg3 expressing cells versus non-Peg3 expressing cells?
#### Can we look at the proliferating venous EC versus the non-venous proliferating EC?


In [None]:
import numpy as np
import pandas as pd
import os
import scanpy as sc
import scanpy.external as sce
import sys
import muon as mu
import muon.atac as ac
import matplotlib.pyplot as plt
import seaborn as sns
# Output directory: every CSV written below and scanpy's figure directory point here
figures = '/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/pilot/230720_cristina_endo_questions'
# Directory holding the processed .h5ad files that are read in the next cell
sc_file = '/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/single_cell_files/share'
# Per-modality figure directories (not referenced again in the visible part of the notebook)
atac_dir = '/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac'
rna_dir = '/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/rna'
# Create the output directory if it does not exist yet
os.makedirs(figures, exist_ok=True)
sc.set_figure_params(dpi=300, format="png")
sc.settings.figdir = figures


#### Load in multiomic data

In [None]:
# Read the three processed objects (RNA, ATAC, TF) from sc_file.
# NOTE(review): the later cells index them with each other's cell names, so they
# presumably describe the same cells -- confirm.
rna = sc.read(f'{sc_file}/p7_multiome_rna_processed.gz.h5ad')
atac = sc.read(f'{sc_file}/p7_multiome_atac_processed.gz.h5ad')
tf = sc.read(f'{sc_file}/p7_multiome_tf_processed.gz.h5ad')

In [None]:
# Reset X from the 'soupx' layer, normalise each cell to a total of 1e4 and
# log-transform with base 10 (log10(x + 1)).
rna.X = rna.layers['soupx'].copy()
sc.pp.normalize_total(rna,target_sum=1e4)
sc.pp.log1p(rna,base=10)

In [None]:
# Reset X from the 'counts' layer, normalise each cell to a total of 1e4 and
# log-transform with base 10 (log10(x + 1)).
atac.X = atac.layers['counts'].copy()
sc.pp.normalize_total(atac,target_sum=1e4)
sc.pp.log1p(atac,base=10)

In [None]:
# Human-readable label per peak: intergenic peaks keep their own index,
# every other peak becomes '<annotated_gene>_<peak_type>_<distance>'.
peak_labels = []
for gene, kind, dist, peak_id in zip(atac.var['annotated_gene'],
                                     atac.var['peak_type'],
                                     atac.var['distance'],
                                     atac.var.index):
    if kind == 'intergenic':
        peak_labels.append(peak_id)
    else:
        peak_labels.append(f'{gene}_{kind}_{dist}')
atac.var['peak_name'] = peak_labels

In [None]:
# Reset X from the 'raw' layer, normalise each cell to a total of 1e4 and
# log-transform with base 10 (log10(x + 1)).
tf.X = tf.layers['raw'].copy()
sc.pp.normalize_total(tf,target_sum=1e4)
sc.pp.log1p(tf,base=10)

### The veins and the Cap1 seem similar in some ways (including their expression of Peg3, Aplnr, etc.). Would it be meaningful to assess what accessibility they share that is not shared by other EC?

#### Compare the regions that are differentially changed in every comparison for venous and Cap1 vs other endo cell types 
#### All modalities
#### Changes in expression/accessibility must go in the same direction (both up or both down)

In [None]:
### Look through each cell-cell comparison and make sure it is above a certain level
# A feature "passes" one pairwise comparison when its score lies beyond
# +/- score_threshold. It is kept for a cell type when it passes, in the same
# direction, at least number_cts_threshold comparisons. Features kept for both
# Cap2 and Venous EC are stored per modality in common_feature_dict and plotted.
score_threshold=1
number_cts_threshold = 3

mod_dict = {'deg':rna,
           'datf':tf,
           'dap':atac}
# Comparison partners are identical for every modality / cell type / direction,
# so the endothelial subset of `rna` is taken once instead of inside the loops.
endo_celltypes = [ct2 for ct2 in rna[rna.obs['lineage']=='endothelial'].obs['celltype'].cat.categories
                  if ct2 != 'Proliferating EC']
peak_annotation_cols = ["annotated_gene", "peak_type", "distance", "tfs"]


def _tally_comparisons(comps, ct, partners, threshold):
    """Record, per direction, which partner comparisons each feature passes.

    comps: dict of sheet name -> DataFrame indexed by feature with a 'scores' column.
    Returns {'up'|'down': {'celltypes': {feature: [partner, ...]},
                           'number_cts': {feature: number of partners}}}.
    """
    tally = {}
    for direction in ['up', 'down']:
        passed = {}
        for ct2 in partners:
            if ct2 == ct:
                continue
            df = comps[f'{ct} v {ct2}']
            if direction == 'up':
                hits = df.loc[df['scores']>threshold].index.tolist()
            else:
                hits = df.loc[df['scores']<-threshold].index.tolist()
            for gene in hits:
                passed.setdefault(gene, []).append(ct2)
        tally[direction] = {'celltypes': passed,
                            'number_cts': {gene: len(cts) for gene, cts in passed.items()}}
    return tally


def _recurrent_features(number_cts, min_cts):
    """Return the features passing at least `min_cts` comparisons, most recurrent first."""
    counts = pd.Series(number_cts).sort_values(ascending=False)
    return counts.loc[counts>=min_cts]


common_feature_dict = {}
for mod in mod_dict.keys():
    if mod in ['dap','datf']:
        cell_type_comparisons =f'/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac/{mod}/endothelial/cell_type_comparisons/normoxia'
    else:
        cell_type_comparisons =f'/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/rna/{mod}/endothelial/cell_type_comparisons/normoxia'

    # Nothing below writes to the object, so the full copy was dropped
    adata = mod_dict[mod]
    shared_gene_dict = {}
    for ct in ['Cap2', 'Venous EC']:
        # One workbook per cell type, one sheet per '<ct> v <other ct>' comparison
        comps = pd.read_excel(f'{cell_type_comparisons}/{ct}.xlsx',
                         index_col=0,
                         header=0,
                         sheet_name=None)
        shared_gene_dict[ct] = _tally_comparisons(comps, ct, endo_celltypes, score_threshold)

    Cap2_up = _recurrent_features(shared_gene_dict['Cap2']['up']['number_cts'], number_cts_threshold)
    Cap2_down = _recurrent_features(shared_gene_dict['Cap2']['down']['number_cts'], number_cts_threshold)
    vec_up = _recurrent_features(shared_gene_dict['Venous EC']['up']['number_cts'], number_cts_threshold)
    vec_down = _recurrent_features(shared_gene_dict['Venous EC']['down']['number_cts'], number_cts_threshold)

    # Features recurrent in the same direction for both cell types (Cap2 order kept)
    shared_up = [x for x in Cap2_up.index if x in vec_up.index]
    shared_down = [x for x in Cap2_down.index if x in vec_down.index]
    if mod=='dap':
        # Peaks are reported together with their annotation columns
        up = atac.var[peak_annotation_cols].loc[shared_up]
        down = atac.var[peak_annotation_cols].loc[shared_down]
        plot_features=up.head(5).index.tolist() + down.head(5).index.tolist()

    else:
        up = shared_up
        down = shared_down
        plot_features = up[:5] + down[:5]

    common_feature_dict[mod] = {'up':up,
                               'down':down}
    adata_norm = adata[(adata.obs['treatment']=='Normoxia')&
                      (adata.obs['lineage']=='endothelial')]
    if mod == 'dap':
        # Plot peaks under their readable peak_name instead of the raw peak index
        gene_sym = 'peak_name'
        plot_features = atac.var['peak_name'].loc[plot_features].values
    else:
        gene_sym=None

    sc.pl.dotplot(adata_norm,
                  plot_features,
                  standard_scale='var',
                  groupby = 'celltype',
                  use_raw=False,
                  gene_symbols=gene_sym,
                  title=f'Shared {mod}s by Cap2 and VEC')

In [None]:
### Look through each cell-cell comparison and make sure it is above a certain level
# A feature "passes" one pairwise comparison when its score lies beyond
# +/- score_threshold. It is kept for a cell type when it passes, in the same
# direction, at least number_cts_threshold comparisons. Features kept for both
# Cap1 and Venous EC are stored per modality in common_feature_dict and plotted.
score_threshold=1
number_cts_threshold = 3

mod_dict = {'deg':rna,
           'datf':tf,
           'dap':atac}
# Comparison partners are identical for every modality / cell type / direction,
# so the endothelial subset of `rna` is taken once instead of inside the loops.
endo_celltypes = [ct2 for ct2 in rna[rna.obs['lineage']=='endothelial'].obs['celltype'].cat.categories
                  if ct2 != 'Proliferating EC']
peak_annotation_cols = ["annotated_gene", "peak_type", "distance", "tfs"]


def _tally_comparisons(comps, ct, partners, threshold):
    """Record, per direction, which partner comparisons each feature passes.

    comps: dict of sheet name -> DataFrame indexed by feature with a 'scores' column.
    Returns {'up'|'down': {'celltypes': {feature: [partner, ...]},
                           'number_cts': {feature: number of partners}}}.
    """
    tally = {}
    for direction in ['up', 'down']:
        passed = {}
        for ct2 in partners:
            if ct2 == ct:
                continue
            df = comps[f'{ct} v {ct2}']
            if direction == 'up':
                hits = df.loc[df['scores']>threshold].index.tolist()
            else:
                hits = df.loc[df['scores']<-threshold].index.tolist()
            for gene in hits:
                passed.setdefault(gene, []).append(ct2)
        tally[direction] = {'celltypes': passed,
                            'number_cts': {gene: len(cts) for gene, cts in passed.items()}}
    return tally


def _recurrent_features(number_cts, min_cts):
    """Return the features passing at least `min_cts` comparisons, most recurrent first."""
    counts = pd.Series(number_cts).sort_values(ascending=False)
    return counts.loc[counts>=min_cts]


common_feature_dict = {}
for mod in mod_dict.keys():
    if mod in ['dap','datf']:
        cell_type_comparisons =f'/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac/{mod}/endothelial/cell_type_comparisons/normoxia'
    else:
        cell_type_comparisons =f'/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/rna/{mod}/endothelial/cell_type_comparisons/normoxia'

    # Nothing below writes to the object, so the full copy was dropped
    adata = mod_dict[mod]
    shared_gene_dict = {}
    for ct in ['Cap1', 'Venous EC']:
        # One workbook per cell type, one sheet per '<ct> v <other ct>' comparison
        comps = pd.read_excel(f'{cell_type_comparisons}/{ct}.xlsx',
                         index_col=0,
                         header=0,
                         sheet_name=None)
        shared_gene_dict[ct] = _tally_comparisons(comps, ct, endo_celltypes, score_threshold)

    cap1_up = _recurrent_features(shared_gene_dict['Cap1']['up']['number_cts'], number_cts_threshold)
    cap1_down = _recurrent_features(shared_gene_dict['Cap1']['down']['number_cts'], number_cts_threshold)
    vec_up = _recurrent_features(shared_gene_dict['Venous EC']['up']['number_cts'], number_cts_threshold)
    vec_down = _recurrent_features(shared_gene_dict['Venous EC']['down']['number_cts'], number_cts_threshold)

    # Features recurrent in the same direction for both cell types (Cap1 order kept)
    shared_up = [x for x in cap1_up.index if x in vec_up.index]
    shared_down = [x for x in cap1_down.index if x in vec_down.index]
    if mod=='dap':
        # Peaks are reported together with their annotation columns
        up = atac.var[peak_annotation_cols].loc[shared_up]
        down = atac.var[peak_annotation_cols].loc[shared_down]
        plot_features=up.head(5).index.tolist() + down.head(5).index.tolist()

    else:
        up = shared_up
        down = shared_down
        plot_features = up[:10] + down[:10]

    common_feature_dict[mod] = {'up':up,
                               'down':down}
    adata_norm = adata[(adata.obs['treatment']=='Normoxia')&
                      (adata.obs['lineage']=='endothelial')]
    if mod == 'dap':
        # Plot peaks under their readable peak_name instead of the raw peak index
        gene_sym = 'peak_name'
        plot_features = atac.var['peak_name'].loc[plot_features].values
    else:
        gene_sym=None

    sc.pl.dotplot(adata_norm,
                  plot_features,
                  standard_scale='var',
                  groupby = 'celltype',
                  use_raw=False,
                  gene_symbols=gene_sym,
                  title=f'Shared {mod}s by Cap1 and VEC')


In [None]:
 # Dot plot of every RNA feature stored as 'up' for the 'deg' modality in
 # common_feature_dict, across all endothelial cells.
 # The title names 'deg' explicitly: the previous f-string read the loop
 # variable `mod`, which still holds the last modality of the preceding loop
 # ('dap') and therefore mislabelled this RNA plot.
 sc.pl.dotplot(rna[rna.obs['lineage']=='endothelial'],
                   sorted(common_feature_dict['deg']['up']),
                   standard_scale='var',
                   groupby = 'celltype',
                   use_raw=False,
                   title='Shared degs by Cap1 and VEC')


#### subsample all populations to 100 cells and run Cap1 v rest and Ven v rest and see what comes out

In [None]:
# https://github.com/scverse/scanpy/issues/987
def obs_key_wise_subsampling(adata, obs_key, N, random_state=None):
    '''
    Subsample each class to same cell numbers (N). Classes are given by obs_key pointing to categorical in adata.obs.

    Cells are drawn WITH replacement, so a class holding fewer than N cells is
    still brought up to N and the same cell may appear more than once.

    Parameters
    ----------
    adata : object exposing `.obs`, `.obs_names`, name-based indexing and `.copy()`
    obs_key : column of `adata.obs` defining the classes
    N : number of cells drawn per class
    random_state : optional seed; when None the global numpy random state is
        used (the previous behaviour), otherwise the draw is reproducible.

    Returns
    -------
    A copy of `adata` restricted to the drawn cells, classes concatenated in
    the order returned by `value_counts()`.
    '''
    counts = adata.obs[obs_key].value_counts()
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = []
    for group, n_cells in counts.items():
        # value_counts() reports unused categories with a count of 0; drawing
        # from an empty pool would raise, so such classes are skipped.
        if n_cells == 0:
            continue
        members = adata.obs_names[(adata.obs[obs_key]==group).to_numpy()]
        indices.append(rng.choice(members, size=N, replace=True))
    selection = np.concatenate(indices)
    return adata[selection].copy()

In [None]:
# For each modality: subsample every endothelial cell type (normoxia only) to
# 100 cells, then score Cap1 and Venous EC against the remaining cell types.
# mod_df_dict[mod] ends up as a features x ['Cap1', 'Venous EC'] table of scores.
celltypes = ['Cap1', 'Venous EC']
## Drop cell types that are similar
# Populations removed from the reference when scoring each focus cell type.
# Fixes: 'Venous Ec' was a typo that never matched, so Venous EC stayed in the
# Cap1 reference.
similar_celltypes = {'Cap1': ['Venous EC','Proliferating EC','Intermediate cap'],
                     'Venous EC': ['Cap1','Proliferating EC','Intermediate cap']}

mod_dict = {'deg':rna,
           'datf':tf,
           'dap':atac}
mod_df_dict = {}
for mod in mod_dict.keys():
    # The subsampling helper returns a copy, so the full object is not copied here
    adata = mod_dict[mod]
    adata_norm = adata[(adata.obs['treatment']=='Normoxia')&
                      (adata.obs['lineage']=='endothelial')]
    subsampled = obs_key_wise_subsampling(adata_norm, 'celltype',100)
    df = pd.DataFrame(index= adata_norm.var_names,
                     columns = celltypes,
                     data= None)
    for ct in celltypes:
        # Always filter from the full subsample: overwriting `subsampled` made
        # the second cell type inherit the first one's exclusions.
        ct_adata = subsampled[~subsampled.obs['celltype'].isin(similar_celltypes[ct])].copy()
        sc.tl.rank_genes_groups(
            ct_adata,
            "celltype",
            method="wilcoxon",
            pts=True,
            key_added="rank_peaks_groups_celltype",
        )

        ct_df= sc.get.rank_genes_groups_df(
                            ct_adata, key="rank_peaks_groups_celltype",group=ct
                        )
        ct_df.index = ct_df['names']

        # Assignment aligns on feature name with df's index
        df[ct] = ct_df['scores']

    mod_df_dict[mod]=df
        
        
    


In [None]:
mod_df_dict['deg']

In [None]:
mod_df_dict['deg'].loc['Epas1']

In [None]:
# Scatter the Cap1 vs Venous EC scores for RNA, keep the genes scoring beyond
# +/- score in the same direction for both, plot them and write them to CSV.
mod = 'deg'
score = 3
df = mod_df_dict[mod]

sns.scatterplot(data=df, x='Venous EC', y='Cap1', linewidth=0)

both_up = (df['Cap1']>score) & (df['Venous EC']>score)
both_down = (df['Cap1']<-score) & (df['Venous EC']<-score)
df = df.loc[both_up | both_down]

# Strongest Venous EC scores first, for both the plot and the table
ranked = df.sort_values('Venous EC',ascending=False)
sc.pl.dotplot(rna[rna.obs['lineage']=='endothelial'],
              ranked.index,
              standard_scale='var',
              groupby='celltype',
              use_raw=False,
              title=f'Shared {mod}s by Cap1 and VEC')
ranked.to_csv(f'{figures}/cap1_vec_shared_deg.csv')

In [None]:
# Unscaled dot plot of a hand-picked gene list across endothelial cell types
# (standard_scale is left commented out, so values are not rescaled per gene).
sc.pl.dotplot(rna[rna.obs['lineage']=='endothelial'],
              ['Kcnh1','Peg3','Aplnr','Tek','Syne1','Timp3'],
#               standard_scale='var',
              groupby = 'celltype', 
              use_raw=False,
              )

In [None]:
# Scatter the Cap1 vs Venous EC scores for peaks, keep the peaks scoring beyond
# +/- score in the same direction for both, plot them, and export them with
# their annotation columns.
mod = 'dap'
score = 1.5
df = mod_df_dict[mod]

sns.scatterplot(data = df,
           x= 'Venous EC',
           y='Cap1',
           linewidth=0
           )
df = df.loc[((df['Cap1']>score) & (df['Venous EC']>score))|
            ((df['Cap1']<-score) & ((df['Venous EC']<-score)))]
# NOTE(review): plot_genes is computed but not used below; the dot plot is
# labelled with the raw peak index rather than peak_name -- confirm intent.
plot_genes = atac.var['peak_name'].loc[df.index].values
sc.pl.dotplot(atac[atac.obs['lineage']=='endothelial'],
              df.sort_values('Venous EC',ascending=False).index,
              standard_scale='var',
              groupby = 'celltype', 
              title=f'Shared {mod}s by Cap1 and VEC')
# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing
df
# Attach the score columns to the peak annotation and write the table
atac_df = atac.var[["annotated_gene", "peak_type", "distance", "tfs"]].loc[df.index]
atac_df[df.columns] = df
atac_df.to_csv(f'{figures}/cap1_vec_shared_dap.csv')
# Cell output: the shared peaks annotated to Kcnh1
atac_df.loc[atac_df['annotated_gene']=='Kcnh1']

In [None]:
# Scatter the Cap1 vs Venous EC scores for TFs, keep the TFs scoring beyond
# +/- score in the same direction for both, plot them and write them to CSV.
mod = 'datf'
score = 3
df = mod_df_dict[mod]

sns.scatterplot(data=df, x='Venous EC', y='Cap1', linewidth=0)

both_up = (df['Cap1']>score) & (df['Venous EC']>score)
both_down = (df['Cap1']<-score) & (df['Venous EC']<-score)
df = df.loc[both_up | both_down]

# The plot is ordered by Venous EC score; the CSV keeps the table's own order
plot_order = df.sort_values('Venous EC',ascending=False).index
sc.pl.dotplot(tf[tf.obs['lineage']=='endothelial'],
              plot_order,
              standard_scale='var',
              groupby='celltype',
              title=f'Shared {mod}s by Cap1 and VEC')
df.to_csv(f'{figures}/cap1_vec_shared_datf.csv')

df


#### Can we also compare all Peg3 expressing cells versus non-Peg3 expressing cells?

In [None]:
# Normoxia-only subset of each modality, then its Cap1 and Venous EC cells.
rna_norm = rna[rna.obs['treatment']=='Normoxia']
atac_norm = atac[atac.obs['treatment']=='Normoxia']
tf_norm = tf[tf.obs['treatment']=='Normoxia']

# The per-cell-type subsets receive new .obs columns in later cells, so they
# are materialised with .copy() instead of being left as views.
rna_cap1 = rna_norm[rna_norm.obs['celltype']=='Cap1'].copy()
rna_vec = rna_norm[rna_norm.obs['celltype']=='Venous EC'].copy()

atac_cap1 = atac_norm[atac_norm.obs['celltype']=='Cap1'].copy()
atac_vec = atac_norm[atac_norm.obs['celltype']=='Venous EC'].copy()

# Fix: the TF subsets were masked with atac_norm.obs; each object is now
# filtered by its own annotation.
tf_cap1 = tf_norm[tf_norm.obs['celltype']=='Cap1'].copy()
tf_vec = tf_norm[tf_norm.obs['celltype']=='Venous EC'].copy()

In [None]:
# Peg3 expression per abbreviated cell type in normoxic endothelial cells,
# shown first as a violin plot and then as a dot plot.
endo_norm_mask = rna_norm.obs['lineage']=='endothelial'
sc.pl.violin(rna_norm[endo_norm_mask], 'Peg3', groupby='celltype_abv')
sc.pl.dotplot(rna_norm[endo_norm_mask], 'Peg3', groupby='celltype_abv')

In [None]:
# Label every cell of the RNA Cap1 / Venous EC subsets by its Peg3 value in X:
#   Peg3_cat : 'high' (> cutoff), 'med' (> 0 up to the cutoff), 'low' (otherwise)
#   Peg3_high: 'Peg3hi' (> cutoff) or 'Peg3-'
peg3_cutoff = 0.75
for adata in [rna_cap1, rna_vec]:
    peg3 = sc.get.obs_df(adata,['Peg3'])
    expr = peg3['Peg3'].to_numpy()
    # Conditions are evaluated in order. Fix: a value exactly equal to the
    # cutoff used to match neither the 'high' nor the 'med' test and was
    # labelled 'low'; it is now 'med'.
    adata.obs['Peg3_cat'] = np.select([expr > peg3_cutoff, expr > 0],
                                      ['high', 'med'],
                                      default='low')
    # Built directly instead of a chained `.replace(..., inplace=True)` on a
    # column selection, which may act on a temporary copy and leave booleans.
    adata.obs['Peg3_high'] = np.where(expr > peg3_cutoff, 'Peg3hi', 'Peg3-')

In [None]:
peg3.quantile(0.25)

In [None]:
# Transfer the RNA-derived Peg3_cat label to the ATAC cells by cell name
# (the assignment aligns on the obs index), then drop the 'med' cells.
atac_cap1.obs['Peg3_cat'] = rna_cap1[[x for x in rna_cap1.obs.index if x in atac_cap1.obs.index]].obs['Peg3_cat']
atac_vec.obs['Peg3_cat'] = rna_vec[[x for x in rna_vec.obs.index if x in atac_vec.obs.index]].obs['Peg3_cat']
atac_cap1_drop = atac_cap1[atac_cap1.obs['Peg3_cat']!='med']
atac_vec_drop = atac_vec[atac_vec.obs['Peg3_cat']!='med']

# Wilcoxon ranking of peaks between the remaining Peg3_cat groups, per cell type
sc.tl.rank_genes_groups(
        atac_cap1_drop,
        "Peg3_cat",
        method="wilcoxon",
        pts=True,
        key_added="rank_peaks_groups_Peg3_cat",
    )
sc.tl.rank_genes_groups(
        atac_vec_drop,
        "Peg3_cat",
        method="wilcoxon",
        pts=True,
        key_added="rank_peaks_groups_Peg3_cat",
    )


# Scores of the 'high' group, indexed by peak name
cap1_df= sc.get.rank_genes_groups_df(
                    atac_cap1_drop, key="rank_peaks_groups_Peg3_cat",group='high'
                )
cap1_df.index = cap1_df['names']
vec_df= sc.get.rank_genes_groups_df(
                    atac_vec_drop, key="rank_peaks_groups_Peg3_cat",group='high'
                )
vec_df.index = vec_df['names']
# One row per peak (in the Venous EC result's order), one score column per cell type
score_df = pd.DataFrame(index = vec_df.index, 
                        columns = ['Cap1','VEC'],
                        data=None)
score_df['Cap1'] = cap1_df['scores']
score_df['VEC'] = vec_df['scores']
sns.scatterplot(data = score_df,
               x= 'VEC',
               y='Cap1',
               linewidth=0
              )
# Keep peaks scoring beyond +/- score in the same direction for both cell
# types and export them with their annotation columns
score = 1
atac_df = atac.var[["annotated_gene", "peak_type", "distance", "tfs"]].loc[score_df.index]
atac_df[score_df.columns] = score_df
atac_df = atac_df.loc[((atac_df['Cap1']>score) & (atac_df['VEC']>score))|
            ((atac_df['Cap1']<-score) & ((atac_df['VEC']<-score)))]
atac_df.to_csv(f'{figures}/peg3_vec_cap1_shared_dap.csv')

In [None]:
score_df

In [None]:
# Same comparison as above but with the two-level Peg3_high label
# ('Peg3hi' vs 'Peg3-'), so no cells are dropped.
atac_cap1.obs['Peg3_high'] = rna_cap1[[x for x in rna_cap1.obs.index if x in atac_cap1.obs.index]].obs['Peg3_high']
atac_vec.obs['Peg3_high'] = rna_vec[[x for x in rna_vec.obs.index if x in atac_vec.obs.index]].obs['Peg3_high']
for adata in [atac_cap1, atac_vec]:
    sc.tl.rank_genes_groups(
            adata,
            "Peg3_high",
            method="wilcoxon",
            pts=True,
            key_added="rank_peaks_groups_Peg3_high",
        )

# Scores of the 'Peg3hi' group, indexed by peak name
cap1_df= sc.get.rank_genes_groups_df(
                    atac_cap1, key="rank_peaks_groups_Peg3_high",group='Peg3hi'
                )
cap1_df.index = cap1_df['names']
vec_df= sc.get.rank_genes_groups_df(
                    atac_vec, key="rank_peaks_groups_Peg3_high",group='Peg3hi'
                )
vec_df.index = vec_df['names']
# One row per peak (in the Venous EC result's order), one score column per cell type
score_df = pd.DataFrame(index = vec_df.index, 
                        columns = ['Cap1','VEC'],
                        data=None)
score_df['Cap1'] = cap1_df['scores']
score_df['VEC'] = vec_df['scores']
sns.scatterplot(data = score_df,
               x= 'VEC',
               y='Cap1',
               linewidth=0
               )


In [None]:
score_df.loc[atac.var.loc[atac.var['annotated_gene'].str.contains('Peg3')].index]

In [None]:
# Peaks scoring above 1 (up) or below -1 (down) in both cell types, ordered by
# their Cap1 score; the cell output is the annotation of the 'up' peaks.
cap1_scores = score_df['Cap1']
vec_scores = score_df['VEC']
up = score_df[(cap1_scores>1)&(vec_scores>1)].sort_values('Cap1', ascending=False)
down = score_df[(cap1_scores<-1)&(vec_scores<-1)].sort_values('Cap1', ascending=True)

peak_annotation = atac.var[["annotated_gene", "peak_type", "distance", "tfs"]]
peak_annotation.loc[up.index]

In [None]:
atac.var[["annotated_gene", "peak_type", "distance", "tfs"]].loc[down.index]

#### TF data now


In [None]:

# Transfer the RNA-derived Peg3_cat label to the TF cells by cell name and
# rank TFs between the Peg3_cat groups with a Wilcoxon test, per cell type.
# NOTE(review): unlike the ATAC cell, 'med' cells are not dropped here -- confirm.
tf_cap1.obs['Peg3_cat'] = rna_cap1[[x for x in rna_cap1.obs.index if x in tf_cap1.obs.index]].obs['Peg3_cat']
tf_vec.obs['Peg3_cat'] = rna_vec[[x for x in rna_vec.obs.index if x in tf_vec.obs.index]].obs['Peg3_cat']
for adata in [tf_cap1, tf_vec]:
    sc.tl.rank_genes_groups(
            adata,
            "Peg3_cat",
            method="wilcoxon",
            pts=True,
            key_added="rank_peaks_groups_Peg3_cat",
        )

# Scores of the 'high' group, indexed by TF name
cap1_df= sc.get.rank_genes_groups_df(
                    tf_cap1, key="rank_peaks_groups_Peg3_cat",group='high'
                )
cap1_df.index = cap1_df['names']
vec_df= sc.get.rank_genes_groups_df(
                    tf_vec, key="rank_peaks_groups_Peg3_cat",group='high'
                )
vec_df.index = vec_df['names']
score_df = pd.DataFrame(index = vec_df.index,
                        columns = ['Cap1','VEC'],
                        data=None)
score_df['Cap1'] = cap1_df['scores']
score_df['VEC'] = vec_df['scores']
sns.scatterplot(data = score_df,
               x= 'VEC',
               y='Cap1',
               linewidth=0
               )
score = 1
# Fix: the filtered table used to be discarded and the unfiltered scores were
# written to the "shared" CSV. score_df itself stays unfiltered for later cells.
shared_df = score_df.loc[((score_df['Cap1']>score) & (score_df['VEC']>score))|
            ((score_df['Cap1']<-score) & ((score_df['VEC']<-score)))]
shared_df.to_csv(f'{figures}/peg3_vec_cap1_shared_datf.csv')

In [None]:
# TFs scoring above 1 (up) or below -1 (down) in both cell types, ordered by
# their Cap1 score; the cell output is the 'up' table.
cap1_scores = score_df['Cap1']
vec_scores = score_df['VEC']
up = score_df[(cap1_scores>1)&(vec_scores>1)].sort_values('Cap1', ascending=False)
down = score_df[(cap1_scores<-1)&(vec_scores<-1)].sort_values('Cap1', ascending=True)

up

In [None]:
down

In [None]:

# Rank genes between the Peg3_cat groups within the RNA Cap1 / Venous EC
# subsets. The labels already live on these objects, so the former
# self-assignments of Peg3_cat (no-ops) were removed.
for adata in [rna_cap1, rna_vec]:
    sc.tl.rank_genes_groups(
            adata,
            "Peg3_cat",
            method="wilcoxon",
            pts=True,
            key_added="rank_peaks_groups_Peg3_cat",
        )

# Scores of the 'high' group, indexed by gene name
cap1_df= sc.get.rank_genes_groups_df(
                    rna_cap1, key="rank_peaks_groups_Peg3_cat",group='high'
                )
cap1_df.index = cap1_df['names']
vec_df= sc.get.rank_genes_groups_df(
                    rna_vec, key="rank_peaks_groups_Peg3_cat",group='high'
                )
vec_df.index = vec_df['names']
score_df = pd.DataFrame(index = vec_df.index,
                        columns = ['Cap1','VEC'],
                        data=None)
score_df['Cap1'] = cap1_df['scores']
score_df['VEC'] = vec_df['scores']
# Peg3 defines the groups, so it is removed from its own comparison
score_df = score_df.loc[~score_df.index.isin(['Peg3'])]
sns.scatterplot(data = score_df,
               x= 'VEC',
               y='Cap1',
               linewidth=0
               )
score = 2
# Fix: the filtered table used to be discarded (as was a stray sort_values)
# and the unfiltered scores were written to the "shared" CSV.
shared_df = score_df.loc[((score_df['Cap1']>score) & (score_df['VEC']>score))|
            ((score_df['Cap1']<-score) & ((score_df['VEC']<-score)))]
shared_df.to_csv(f'{figures}/peg3_vec_cap1_shared_deg.csv')

#### Can we look at the proliferating venous EC versus the non-venous proliferating EC?


In [None]:
# Re-cluster the cells annotated 'Proliferating EC' on their own:
# highly variable genes (per 'mouse' batch) -> PCA -> Harmony integration over
# 'mouse' -> neighbours on the Harmony embedding -> Leiden clusters -> UMAP.
prolif_ec = rna[rna.obs['celltype']=='Proliferating EC'].copy()
sc.pp.highly_variable_genes(prolif_ec, n_top_genes=2000, batch_key="mouse")
sc.pp.pca(prolif_ec, use_highly_variable=True)
sce.pp.harmony_integrate(prolif_ec, key='mouse', max_iter_harmony=50)
sc.pp.neighbors(prolif_ec, use_rep='X_pca_harmony')
# Cluster labels are stored in obs['leiden_prolif_ec']
sc.tl.leiden(
    prolif_ec,
    key_added=f"leiden_prolif_ec",
)
sc.tl.umap(prolif_ec, min_dist=0.5)
# UMAP coloured by selected genes, the new clusters, UMI depth and mouse
sc.pl.umap(prolif_ec, color=['Kit','Car4','Gja5','Car8','Slc6a2','Ccl21a','leiden_prolif_ec', 'log10_total_umis','mouse'])

In [None]:
prolif_ec

In [None]:
prolif_ec.obs.groupby('leiden_prolif_ec')['mouse'].value_counts()

In [None]:
# Rank genes per Leiden cluster (Wilcoxon) and show the top genes of each
# cluster, with the number per cluster chosen so about 50 genes are plotted.
sc.tl.rank_genes_groups(prolif_ec, "leiden_prolif_ec", method="wilcoxon")
n_clusters = len(prolif_ec.obs["leiden_prolif_ec"].unique())
genes_per_cluster = int(50 / n_clusters)
sc.pl.rank_genes_groups_dotplot(
    prolif_ec,
    groupby="leiden_prolif_ec",
    dendrogram=False,
    n_genes=genes_per_cluster,
)
sc.pl.dotplot(prolif_ec, ['Car8','Mmp16','Slc6a2'], groupby='leiden_prolif_ec')

In [None]:
# Cells of Leiden cluster '9' of the proliferating-EC re-clustering are
# relabelled 'PVEC'; every other cell keeps its 'celltype' label.
pvecs = prolif_ec[prolif_ec.obs['leiden_prolif_ec'].isin(['9'])].obs_names.tolist()
# Set for O(1) membership tests while scanning every cell of each object
pvec_set = set(pvecs)
# Fix: the tf labels used to be built from atac.obs_names / atac.obs['celltype'];
# each object is now labelled from its own cell names and annotation.
for adata in (atac, tf, rna):
    adata.obs['celltype_pvec'] = ['PVEC' if x in pvec_set else y
                                  for x,y in zip(adata.obs_names, adata.obs['celltype'])]

# Compare PVEC against 'Proliferating EC' and against 'Venous EC' on peaks.
# NOTE(review): despite its name, atac_norm holds the Hyperoxia cells here -- confirm intent.
atac_norm = atac[atac.obs['treatment']=='Hyperoxia']
dap_dict = {}
for ct2 in ['Proliferating EC', 'Venous EC']:
    # Two-group subset: PVEC plus the comparison cell type
    cts_adata = atac_norm[atac_norm.obs["celltype_pvec"].isin(['PVEC',
                                                          ct2])]
    sc.tl.rank_genes_groups(
        cts_adata,
        "celltype_pvec",
        groups=['PVEC', ct2],
        method="wilcoxon",
        pts=True,
        key_added="rank_peaks_groups_celltype",
    )
    # Scores are reported for the PVEC group
    df = sc.get.rank_genes_groups_df(
        cts_adata, key="rank_peaks_groups_celltype", group='PVEC'
    )
    df.index = df['names']
    # Attach the peak annotation columns from var
    df[['gene', 'peak_type', 'distance', 'tfs']] = cts_adata.var[['annotated_gene', 'peak_type', 'distance', 'tfs']]
    dap_dict[ct2]=df

# One row per peak; PEC = PVEC vs Proliferating EC score, VEC = PVEC vs Venous EC score
score_df2 = pd.DataFrame(index = dap_dict['Venous EC'].names, 
                        columns = ['PEC','VEC'],
                        data=None)
score_df2['PEC'] = dap_dict['Proliferating EC']['scores']
score_df2['VEC'] = dap_dict['Venous EC']['scores']
sns.scatterplot(data = score_df2,
               x= 'VEC',
               y='PEC',
               linewidth=0
               )
# Keep peaks scoring beyond +/- score in the same direction in both
# comparisons and export them with their annotation
score = 2
score_df2 = score_df2.loc[((score_df2['PEC']>score) & (score_df2['VEC']>score))|
            ((score_df2['PEC']<-score) & ((score_df2['VEC']<-score)))]
atac_df = atac.var[["annotated_gene", "peak_type", "distance", "tfs"]].loc[score_df2.index]
atac_df['PEC_score'] = score_df2.loc[atac_df.index]['PEC']
atac_df['VEC_score'] = score_df2.loc[atac_df.index]['VEC']
atac_df.to_csv(f'{figures}/proliferating_vec_unique_dap.csv')

In [None]:
from scipy.stats import linregress

# Linear regression of the PEC score on the VEC score over the rows currently
# held in score_df2 (x = VEC, y = PEC).
vec_scores = score_df2['VEC'].to_numpy()
pec_scores = score_df2['PEC'].to_numpy()
linr = linregress(vec_scores, pec_scores)
print(linr)

In [None]:
# Peaks scoring above 2 in both comparisons, strongest PEC score first;
# the cell output is their annotation.
up_mask = (score_df2['PEC']>2)&(score_df2['VEC']>2)
up = score_df2[up_mask].sort_values('PEC', ascending=False)
peak_annotation = atac.var[["annotated_gene", "peak_type", "distance", "tfs"]]
peak_annotation.loc[up.index]

In [None]:
# Peaks scoring below -2 in both comparisons, most negative PEC score first;
# the cell output is their annotation.
down_mask = (score_df2['PEC']<-2)&(score_df2['VEC']<-2)
down = score_df2[down_mask].sort_values('PEC', ascending=True)
peak_annotation = atac.var[["annotated_gene", "peak_type", "distance", "tfs"]]
peak_annotation.loc[down.index]

In [None]:
# Compare PVEC against 'Proliferating EC' and against 'Venous EC' on genes.
# NOTE(review): despite its name, rna_norm holds the Hyperoxia cells here -- confirm intent.
rna_norm = rna[rna.obs['treatment']=='Hyperoxia']
# NOTE(review): the dict keeps the name dap_dict from the ATAC cell although it holds gene results
dap_dict = {}
for ct2 in ['Proliferating EC', 'Venous EC']:
    # Two-group subset: PVEC plus the comparison cell type
    cts_adata = rna_norm[rna_norm.obs["celltype_pvec"].isin(['PVEC',
                                                          ct2])]
    sc.tl.rank_genes_groups(
        cts_adata,
        "celltype_pvec",
        groups=['PVEC', ct2],
        method="wilcoxon",
        pts=True,
        key_added="rank_peaks_groups_celltype",
    )
    # Scores are reported for the PVEC group
    df = sc.get.rank_genes_groups_df(
        cts_adata, key="rank_peaks_groups_celltype", group='PVEC'
    )
    df.index = df['names']
    dap_dict[ct2]=df

# One row per gene; PEC = PVEC vs Proliferating EC score, VEC = PVEC vs Venous EC score
score_df2 = pd.DataFrame(index = dap_dict['Venous EC'].names, 
                        columns = ['PEC','VEC'],
                        data=None)
score_df2['PEC'] = dap_dict['Proliferating EC']['scores']
score_df2['VEC'] = dap_dict['Venous EC']['scores']
sns.scatterplot(data = score_df2,
               x= 'VEC',
               y='PEC',
               linewidth=0
               )
# Keep genes scoring beyond +/- score in the same direction in both comparisons
score = 2
score_df2 = score_df2.loc[((score_df2['PEC']>score) & (score_df2['VEC']>score))|
            ((score_df2['PEC']<-score) & ((score_df2['VEC']<-score)))]
score_df2.to_csv(f'{figures}/proliferating_vec_unique_deg.csv')

In [None]:
score_df2


### TF

In [None]:
# Compare PVEC against 'Proliferating EC' and against 'Venous EC' on TFs.
# NOTE(review): despite its name, tf_norm holds the Hyperoxia cells, as in the
# ATAC and RNA cells -- confirm intent.
tf_norm = tf[tf.obs['treatment']=='Hyperoxia']
# NOTE(review): the dict keeps the name dap_dict although it holds TF results
dap_dict = {}
for ct2 in ['Proliferating EC', 'Venous EC']:
    # Fix: the subset used to be taken from the unfiltered `tf`, so the
    # treatment filter above was silently ignored (unlike the ATAC/RNA cells).
    cts_adata = tf_norm[tf_norm.obs["celltype_pvec"].isin(['PVEC',
                                                          ct2])]
    sc.tl.rank_genes_groups(
        cts_adata,
        "celltype_pvec",
        groups=['PVEC', ct2],
        method="wilcoxon",
        pts=True,
        key_added="rank_peaks_groups_celltype",
    )
    # Scores are reported for the PVEC group
    df = sc.get.rank_genes_groups_df(
        cts_adata, key="rank_peaks_groups_celltype", group='PVEC'
    )
    df.index = df['names']
    dap_dict[ct2]=df

# One row per TF; PEC = PVEC vs Proliferating EC score, VEC = PVEC vs Venous EC score
score_df2 = pd.DataFrame(index = dap_dict['Venous EC'].names,
                        columns = ['PEC','VEC'],
                        data=None)
score_df2['PEC'] = dap_dict['Proliferating EC']['scores']
score_df2['VEC'] = dap_dict['Venous EC']['scores']
sns.scatterplot(data = score_df2,
               x= 'VEC',
               y='PEC',
               linewidth=0
               )
# Keep TFs scoring beyond +/- score in the same direction in both comparisons
score = 2
score_df2 = score_df2.loc[((score_df2['PEC']>score) & (score_df2['VEC']>score))|
            ((score_df2['PEC']<-score) & ((score_df2['VEC']<-score)))]
score_df2.to_csv(f'{figures}/proliferating_vec_unique_datf.csv')

In [None]:
# TFs scoring above 2 in both comparisons, strongest PEC score first
up = score_df2[(score_df2['PEC']>2)&(score_df2['VEC']>2)].sort_values('PEC', ascending=False)
up

In [None]:
# TFs scoring below -2 in both comparisons, most negative PEC score first
down = score_df2[(score_df2['PEC']<-2)&(score_df2['VEC']<-2)].sort_values('PEC', ascending=True)
down

In [None]:
# Rebuild the unfiltered score table from dap_dict, then list the features
# whose two scores disagree: PEC below -2 while VEC is above 1.
pec_frame = dap_dict['Proliferating EC']
vec_frame = dap_dict['Venous EC']
score_df2 = pd.DataFrame(index=vec_frame.names,
                         columns=['PEC','VEC'],
                         data=None)
score_df2['PEC'] = pec_frame['scores']
score_df2['VEC'] = vec_frame['scores']
discordant = (score_df2['PEC']<-2)&(score_df2['VEC']>1)
outlier = score_df2[discordant].sort_values('PEC', ascending=True)
outlier

In [None]:
# Endothelial cells on the stored 'X_umap_endothelial' embedding, coloured by
# abbreviated cell type and a handful of genes
sc.pl.embedding(rna[rna.obs['lineage']=='endothelial'],
                basis='X_umap_endothelial',
                color=['celltype_abv', 'Car8','Mmp16','Slc6a2','Gja5','Ednrb'])

In [None]:
# For every modality, rank features between treatments within 'Proliferating EC'
# and within 'PVEC', then scatter the Hyperoxia scores of the two populations.
# hyp_dict[label] holds one result frame per cell type plus the combined 'df'.
hyp_dict = {}
for mod, label in [(rna,'rna'),
                   (tf,'tf'),
                   (atac,'atac')
                  ]:
    hyp_dict[label]={}
    for ct in ['Proliferating EC', 'PVEC']:
        ct_adata = mod[mod.obs['celltype_pvec'].isin([ct])]
        # Print the number of cells per treatment for this population
        print(ct_adata.obs['treatment'].value_counts())
        sc.tl.rank_genes_groups(
            ct_adata,
            "treatment",
            method="wilcoxon",
            pts=True,
            key_added="rank_peaks_groups_treatment",
        )
        # Scores are reported for the Hyperoxia group
        df = sc.get.rank_genes_groups_df(
            ct_adata, key="rank_peaks_groups_treatment", group='Hyperoxia'
        )
        df.index = df['names']
        hyp_dict[label][ct] = df
    # `ct` still holds the last loop value ('PVEC'), so the rows follow the PVEC result's order
    score_df2 = pd.DataFrame(index = hyp_dict[label][ct].names, 
                        columns = ['PEC','PVEC'],
                        data=None)
    score_df2['PEC'] = hyp_dict[label]['Proliferating EC']['scores']
    score_df2['PVEC'] = hyp_dict[label]['PVEC']['scores']
    hyp_dict[label]['df'] = score_df2
    sns.scatterplot(data = score_df2,
               x= 'PVEC',
               y='PEC',
               linewidth=0
               )
    plt.title(label
             )
    # Show and close so each modality gets its own figure
    plt.show()
    plt.close()
    

In [None]:
hyp_dict['atac']['df'].sort_values('PEC').head(2)

In [None]:
atac.var.loc[hyp_dict['atac']['df'].sort_values('PEC').head(2).index]


In [None]:
hyp_dict['atac']['df'].sort_values('PEC',ascending=False).head(5)

In [None]:
atac.var.loc[hyp_dict['atac']['df'].sort_values('PEC',ascending=False).head(5).index]


In [None]:
# Allow up to 500 rows to be displayed, then show the percentage of each
# celltype_pvec label within each treatment (all cells)
pd.set_option('display.max_rows', 500)
rna.obs.groupby('treatment')['celltype_pvec'].value_counts(normalize=True)*100

In [None]:
# Same percentages as above, restricted to endothelial cells
pd.set_option('display.max_rows', 500)
rna[rna.obs['lineage']=='endothelial'].obs.groupby('treatment')['celltype_pvec'].value_counts(normalize=True)*100

In [None]:
# Pairwise Spearman correlation between the columns (features) of the TF matrix
tf_corr = tf.to_df().corr(method='spearman')

In [None]:
# Correlation sub-matrix of the TFs whose name starts with 'fox' (case-insensitive)
fam='fox'
fam_members = [name for name in tf_corr.columns if name.lower().startswith(fam)]
tf_corr[fam_members].loc[fam_members]

In [None]:
# Correlation sub-matrix of the TFs whose name starts with 'Alx' (case-sensitive)
fam = 'Alx'
fam_members = [name for name in tf_corr.columns if name.startswith(fam)]
tf_corr[fam_members].loc[fam_members]

In [None]:
# Correlation sub-matrix of the TFs whose name starts with 'MEIS'
is_meis = tf_corr.index.str.startswith('MEIS')
meis = tf_corr.index[is_meis].tolist()
tf_corr.loc[meis][meis]

In [None]:
# For every 'fox' TF, print its five highest correlations within the family
# (the list starts with the TF itself).
fam='fox'
fam_members = [name for name in tf_corr.columns if name.lower().startswith(fam)]
df = tf_corr[fam_members].loc[fam_members]
for col in df.columns:
    sub = df[col].sort_values(ascending=False)
    top_five = sub.head(5)
    print(top_five)