### Goal: Does downsampling TF counts change the differential test results between normoxia and hyperoxia?


In [None]:
import numpy as np
import pandas as pd
import os
import scanpy as sc
import scanpy.external as sce
import sys
import muon as mu
import muon.atac as ac
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress

# Filesystem locations used by this notebook: where figures are written,
# where the shared single-cell files live, and the ATAC / RNA figure folders.
figures = "/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/pilot/230724_downsampling_test"
sc_file = "/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/single_cell_files/share"
atac_dir = "/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac"
rna_dir = "/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/rna"

# Create the figure folder up front, then point scanpy at it and set the
# figure resolution / file format.
os.makedirs(figures, exist_ok=True)
sc.settings.figdir = figures
sc.set_figure_params(dpi=300, format='png')


#### Load in TF data and run the hyperoxia differential test on un-downsampled data

In [None]:
# Read the processed TF object from the shared single-cell directory.
tf_path = f'{sc_file}/p7_multiome_tf_processed.gz.h5ad'
tf = sc.read(tf_path)

# Compute QC metrics in place, then compare sequencing depth between
# treatment groups numerically and as violin plots.
sc.pp.calculate_qc_metrics(tf, inplace=True)
mean_counts_by_treatment = tf.obs.groupby('treatment')['total_counts'].mean()
print(mean_counts_by_treatment)
for qc_metric in ('log1p_total_counts', 'n_genes_by_counts'):
    sc.pl.violin(tf, qc_metric, groupby='treatment')

# Depth-normalise with a target sum of 1e4 per cell, then apply log10(1 + x).
sc.pp.normalize_total(tf, target_sum=1e4)
sc.pp.log1p(tf, base=10)

In [None]:
# Restrict to the endothelial lineage and, within each cell type, run a
# Wilcoxon rank-sum test across treatment groups. The Hyperoxia result table
# is stored per cell type, indexed by feature name.
tf_endo = tf[tf.obs['lineage'] == 'endothelial']
hyp_deg_dict = {}
for ct in tf_endo.obs["celltype"].cat.categories:
    # Take an explicit copy: rank_genes_groups stores its results on the
    # object, and writing to a view would otherwise trigger an implicit copy.
    ct_adata_tf = tf_endo[tf_endo.obs["celltype"] == ct].copy()
    try:
        sc.tl.rank_genes_groups(
            ct_adata_tf,
            "treatment",
            method="wilcoxon",
            pts=True,
            key_added="rank_peaks_groups_treatment",
        )
        df = sc.get.rank_genes_groups_df(
            ct_adata_tf,
            key="rank_peaks_groups_treatment",
            group="Hyperoxia",
        )
    except Exception as err:
        # Presumably this cell type lacks one of the treatment groups. Catch
        # Exception (not a bare except) so Ctrl-C still interrupts the loop,
        # and print the actual error so unrelated failures are not hidden.
        print(ct)
        print('no hyperoxia comparison')
        print(f'  reason: {type(err).__name__}: {err}')
        continue
    df.index = df['names']
    hyp_deg_dict[ct] = df

#### Load in TF data and run hyperoxia DATF on downsampled data

In [None]:
# Start again from the file on disk so the downsampling below is applied to
# the stored matrix rather than the normalised, log-transformed object.
tf_path = f'{sc_file}/p7_multiome_tf_processed.gz.h5ad'
tf = sc.read(tf_path)
sc.pp.calculate_qc_metrics(tf, inplace=True)


In [None]:
# Downsample counts with a per-cell target of 10000, sampling with replacement.
sc.pp.downsample_counts(tf, counts_per_cell=10000, replace=True)

# Recompute QC so the violin plots reflect the downsampled depth.
sc.pp.calculate_qc_metrics(tf, inplace=True)
for qc_metric in ('log1p_total_counts', 'n_genes_by_counts'):
    sc.pl.violin(tf, qc_metric, groupby='treatment')

# Same normalisation and log10 transform as the un-downsampled run.
sc.pp.normalize_total(tf, target_sum=1e4)
sc.pp.log1p(tf, base=10)

In [None]:
# Repeat the per-cell-type Wilcoxon test on the downsampled TF object.
# Results go into hyp_deg_dict_ds, keyed by cell type and indexed by
# feature name.
tf_endo = tf[tf.obs['lineage'] == 'endothelial']
hyp_deg_dict_ds = {}
for ct in tf_endo.obs["celltype"].cat.categories:
    # Take an explicit copy: rank_genes_groups stores its results on the
    # object, and writing to a view would otherwise trigger an implicit copy.
    ct_adata_tf = tf_endo[tf_endo.obs["celltype"] == ct].copy()
    try:
        sc.tl.rank_genes_groups(
            ct_adata_tf,
            "treatment",
            method="wilcoxon",
            pts=True,
            key_added="rank_peaks_groups_treatment",
        )
        df = sc.get.rank_genes_groups_df(
            ct_adata_tf,
            key="rank_peaks_groups_treatment",
            group="Hyperoxia",
        )
    except Exception as err:
        # Presumably this cell type lacks one of the treatment groups. Catch
        # Exception (not a bare except) so Ctrl-C still interrupts the loop,
        # and print the actual error so unrelated failures are not hidden.
        print(ct)
        print('no hyperoxia comparison')
        print(f'  reason: {type(err).__name__}: {err}')
        continue
    df.index = df['names']
    hyp_deg_dict_ds[ct] = df

### Calculate regression for each ct and plot scores

In [None]:
# For every cell type, regress the downsampled Hyperoxia scores on the
# un-downsampled ones and draw the paired scores in a separate figure.
for ct in tf_endo.obs["celltype"].cat.categories:
    # A cell type skipped by either differential-test loop has no entry in
    # the corresponding dict; indexing it directly would raise KeyError.
    if ct not in hyp_deg_dict or ct not in hyp_deg_dict_ds:
        print(f'{ct}: skipped, no differential test results in both runs')
        continue
    print(ct)
    # Both tables are indexed by feature name, so building the frame on the
    # un-downsampled index pairs each feature with itself across the two runs.
    score_df = pd.DataFrame(
        {
            'normal': hyp_deg_dict[ct]['scores'],
            'downsampled': hyp_deg_dict_ds[ct]['scores'],
        },
        index=hyp_deg_dict[ct].index,
    )
    linr = linregress(score_df['normal'].to_numpy(),
                      score_df['downsampled'].to_numpy())
    print(linr)
    # One figure per cell type: plotting onto the implicit current axes
    # would stack every cell type in a single, unlabelled scatter plot.
    fig, ax = plt.subplots()
    sns.scatterplot(x=score_df['normal'], y=score_df['downsampled'],
                    alpha=0.3, linewidth=0, ax=ax)
    ax.set_xticks([-15, -10, -5, 0, 5, 10, 15])
    ax.set_yticks([-15, -10, -5, 0, 5, 10, 15])
    ax.set_title(ct)
    plt.show()

    

In [None]:
# Display the score table left over from the last iteration of the loop above.
score_df

# ATAC

#### Load in ATAC data and run the hyperoxia differential test on un-downsampled data

In [None]:
# Read the processed ATAC object from the shared single-cell directory.
atac_path = f'{sc_file}/p7_multiome_atac_processed.gz.h5ad'
atac = sc.read(atac_path)

# Compute QC metrics in place, then compare depth between treatment groups
# numerically and as violin plots.
sc.pp.calculate_qc_metrics(atac, inplace=True)
mean_counts_by_treatment = atac.obs.groupby('treatment')['total_counts'].mean()
print(mean_counts_by_treatment)
for qc_metric in ('log1p_total_counts', 'log1p_n_genes_by_counts'):
    sc.pl.violin(atac, qc_metric, groupby='treatment')

# Depth-normalise with a target sum of 1e4 per cell, then apply log10(1 + x).
sc.pp.normalize_total(atac, target_sum=1e4)
sc.pp.log1p(atac, base=10)

In [None]:
# Echo the loaded ATAC object so the notebook prints its summary.
atac

In [None]:
# Restrict to the endothelial lineage and, within each cell type, run a
# Wilcoxon rank-sum test across treatment groups on the ATAC features. The
# Hyperoxia result table is stored per cell type, indexed by feature name.
atac_endo = atac[atac.obs['lineage'] == 'endothelial']
hyp_deg_dict = {}
for ct in atac_endo.obs["celltype"].cat.categories:
    # Take an explicit copy: rank_genes_groups stores its results on the
    # object, and writing to a view would otherwise trigger an implicit copy.
    ct_adata_atac = atac_endo[atac_endo.obs["celltype"] == ct].copy()
    try:
        sc.tl.rank_genes_groups(
            ct_adata_atac,
            "treatment",
            method="wilcoxon",
            pts=True,
            key_added="rank_peaks_groups_treatment",
        )
        df = sc.get.rank_genes_groups_df(
            ct_adata_atac,
            key="rank_peaks_groups_treatment",
            group="Hyperoxia",
        )
    except Exception as err:
        # Presumably this cell type lacks one of the treatment groups. Catch
        # Exception (not a bare except) so Ctrl-C still interrupts the loop,
        # and print the actual error so unrelated failures are not hidden.
        print(ct)
        print('no hyperoxia comparison')
        print(f'  reason: {type(err).__name__}: {err}')
        continue
    df.index = df['names']
    hyp_deg_dict[ct] = df

#### Load in ATAC data and run hyperoxia DAP on downsampled data

In [None]:
# Start again from the file on disk so the downsampling below is applied to
# the stored matrix rather than the normalised, log-transformed object.
atac_path = f'{sc_file}/p7_multiome_atac_processed.gz.h5ad'
atac = sc.read(atac_path)
sc.pp.calculate_qc_metrics(atac, inplace=True)


In [None]:
# Downsample counts with a per-cell target of 8000, sampling with replacement.
atac_counts_per_cell = 8000
sc.pp.downsample_counts(
    atac,
    counts_per_cell=atac_counts_per_cell,
    replace=True,
)

# Same normalisation and log10 transform as the un-downsampled run.
sc.pp.normalize_total(atac, target_sum=1e4)
sc.pp.log1p(atac, base=10)

In [None]:
# Repeat the per-cell-type Wilcoxon test on the downsampled ATAC object.
# Results go into hyp_deg_dict_ds, keyed by cell type and indexed by
# feature name.
atac_endo = atac[atac.obs['lineage'] == 'endothelial']
hyp_deg_dict_ds = {}
for ct in atac_endo.obs["celltype"].cat.categories:
    # Take an explicit copy: rank_genes_groups stores its results on the
    # object, and writing to a view would otherwise trigger an implicit copy.
    ct_adata_atac = atac_endo[atac_endo.obs["celltype"] == ct].copy()
    try:
        sc.tl.rank_genes_groups(
            ct_adata_atac,
            "treatment",
            method="wilcoxon",
            pts=True,
            key_added="rank_peaks_groups_treatment",
        )
        df = sc.get.rank_genes_groups_df(
            ct_adata_atac,
            key="rank_peaks_groups_treatment",
            group="Hyperoxia",
        )
    except Exception as err:
        # Presumably this cell type lacks one of the treatment groups. Catch
        # Exception (not a bare except) so Ctrl-C still interrupts the loop,
        # and print the actual error so unrelated failures are not hidden.
        print(ct)
        print('no hyperoxia comparison')
        print(f'  reason: {type(err).__name__}: {err}')
        continue
    df.index = df['names']
    hyp_deg_dict_ds[ct] = df

### Calculate regression for each ct and plot scores

In [None]:
# For every cell type, regress the downsampled Hyperoxia scores on the
# un-downsampled ones and draw the paired scores in a separate figure.
for ct in atac_endo.obs["celltype"].cat.categories:
    # A cell type skipped by either differential-test loop has no entry in
    # the corresponding dict; indexing it directly would raise KeyError.
    if ct not in hyp_deg_dict or ct not in hyp_deg_dict_ds:
        print(f'{ct}: skipped, no differential test results in both runs')
        continue
    print(ct)
    # Both tables are indexed by feature name, so building the frame on the
    # un-downsampled index pairs each feature with itself across the two runs.
    score_df = pd.DataFrame(
        {
            'normal': hyp_deg_dict[ct]['scores'],
            'downsampled': hyp_deg_dict_ds[ct]['scores'],
        },
        index=hyp_deg_dict[ct].index,
    )
    linr = linregress(score_df['normal'].to_numpy(),
                      score_df['downsampled'].to_numpy())
    print(linr)
    # One figure per cell type, with the ticks set on each figure: plotting
    # onto the implicit current axes would stack every cell type in a single,
    # unlabelled scatter plot.
    fig, ax = plt.subplots()
    sns.scatterplot(x=score_df['normal'], y=score_df['downsampled'],
                    alpha=0.3, linewidth=0, ax=ax)
    ax.set_xticks([-15, -10, -5, 0, 5, 10, 15])
    ax.set_yticks([-15, -10, -5, 0, 5, 10, 15])
    ax.set_title(ct)
    plt.show()