### Goal: Does downsampling TF/ATAC counts change the differential tests between normoxia and hyperoxia?

#### This differs from the July analysis because here the normalization happens when the fragment file is made


In [None]:
import numpy as np
import pandas as pd
import os
import scanpy as sc
import scanpy.external as sce
import sys
import muon as mu
import muon.atac as ac
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress

# Output folder for this analysis; created if missing and also used by scanpy
# as the directory for saved figures.
figures = '/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/pilot/231102_downsampling_test2'
os.makedirs(figures, exist_ok=True)
sc.settings.figdir = figures
sc.set_figure_params(dpi=300, format="png")

# Input locations used by the rest of the notebook.
rna_dir = '/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/rna'
atac_dir = '/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac'
sc_file = '/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/single_cell_files/share'


In [None]:
# Load the RNA AnnData object (file name suggests the processed P7 multiome RNA data).
rna_h5ad = '/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/single_cell_files/share/p7_multiome_rna_processed.gz.h5ad'
adata_rna = sc.read(rna_h5ad)
# Last expression of the cell: display the observation (cell) names.
adata_rna.obs_names

In [None]:
# RNA QC overview: violin plots of the log10 UMI / gene metrics, first per
# mouse and then per treatment.
for grouping in ('mouse', 'treatment'):
    sc.pl.violin(
        adata_rna,
        ['log10_total_umis', 'log10_n_genes_by_umis'],
        stripplot=False,
        inner='box',
        groupby=grouping,
    )

# Sums of the raw metrics, printed per treatment first and then per mouse.
for grouping in ('treatment', 'mouse'):
    for metric in ('total_umis', 'n_genes_by_umis'):
        print(adata_rna.obs.groupby(grouping)[metric].sum())

In [None]:
# Per-sample ATAC QC for two cellranger aggregate runs (the second run name
# suggests it is the depth-normalized aggregate - confirm).
#
# Barcode suffix (the part after the first '-') -> (mouse label, sex).
sample_info = {
    '1': ('nor-1', 'F'),
    '2': ('nor-2', 'F'),
    '3': ('nor-3', 'M'),
    '4': ('hyp-1', 'F'),
    '5': ('hyp-2', 'M'),
}
# ATAC AnnData per run. The dict was previously created but never filled;
# presumably it was meant to keep each run's object for later comparison.
adata_dt = {}
for run in ['230609_aggregate','231101_aggregate_normalized']:
    # Label the output so the two runs can be told apart.
    print(run)
    mudata = mu.read_10x_h5(f'/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/cellranger_output/{run}/outs/filtered_feature_bc_matrix.h5')

    # Fail loudly on barcodes whose suffix is not a known sample: silently
    # skipping them would make the metadata lists shorter than obs and only
    # surface later as a length-mismatch error.
    suffixes = [x.split('-')[1] if '-' in x else None for x in mudata.obs_names]
    unknown = {s for s in suffixes if s not in sample_info}
    if unknown:
        raise ValueError(
            f'{run}: barcodes with unexpected sample suffixes: {sorted(map(str, unknown))}'
        )
    mouse = [sample_info[s][0] for s in suffixes]
    sex = [sample_info[s][1] for s in suffixes]

    mudata.obs['mouse'] = mouse
    mudata.obs['sex'] = sex
    # Mouse labels starting with 'h' ("hyp-*") are the hyperoxia animals.
    mudata.obs['treatment'] = ['Hyperoxia' if x.startswith('h') else 'Normoxia' for x in mudata.obs['mouse']]
    mudata.obs['timepoint'] = 'P7'

    atac = mudata.mod['atac']
    # The ATAC modality takes over the annotated obs table from the MuData object.
    atac.obs = mudata.obs
    sc.pp.calculate_qc_metrics(atac, inplace=True)
    sc.pl.violin(atac, ['total_counts','n_genes_by_counts'], stripplot=False,inner='box',groupby='mouse')
    sc.pl.violin(atac, ['log1p_total_counts','log1p_n_genes_by_counts'], stripplot=False,inner='box',groupby='mouse')

    # Total counts are summed; detected features per cell are averaged.
    print(atac.obs.groupby('treatment')['total_counts'].sum())
    print(atac.obs.groupby('treatment')['n_genes_by_counts'].mean())
    print(atac.obs.groupby('mouse')['total_counts'].sum())
    print(atac.obs.groupby('mouse')['n_genes_by_counts'].mean())

    adata_dt[run] = atac

In [None]:
# For every cell type, compare the hyperoxia differential-accessibility
# results from all fragments against those from subsampled fragments and
# save a two-panel volcano plot.
# ct_df_dt maps cell type -> {'sub': subsampled results, 'all': all-fragment results}.
# Cell types without both result files are skipped and get no entry.
ct_df_dt={}
for lineage in adata_rna.obs['lineage'].cat.categories:
    lin_cts = adata_rna[adata_rna.obs['lineage']==lineage].obs['celltype'].cat.categories
    for ct in lin_cts:
        try:
            df_sub = pd.read_csv(f'/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac/snapatac2_subsampled/dap_datf/{lineage}/hyperoxia/{ct}_hyperoxia_dap.csv')
            df_all = pd.read_csv(f'/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac/snapatac2_all_frags/dap_datf/{lineage}/hyperoxia/{ct}_hyperoxia_dap.csv')
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
            print('NO COMP!!!')
            print(ct)
            print('continuing')
            # Actually skip this cell type. Falling through would either raise
            # NameError or reuse the previous cell type's tables under this name.
            continue
        ct_df_dt[ct] = {'sub': df_sub, 'all': df_all}
        print(ct)
        for df in [df_all, df_sub]:
            # Label peaks as Up/Down at FDR <= 0.05 and |log2FC| >= 0.5, else NS.
            conditions = [
                df['adjusted p-value'].le(0.05) & df['log2(fold_change)'].ge(0.5),
                df['adjusted p-value'].le(0.05) & df['log2(fold_change)'].le(-0.5)
            ]
            choices = ['Up','Down']
            df['color'] = np.select(conditions, choices, default='NS')
            df['-Log10(FDR)'] = -np.log10(df['adjusted p-value'])
            # Percentage of peaks in each class.
            print(df['color'].value_counts(normalize=True)*100)
        fig, axs = plt.subplots(1,2,figsize=(4,2),sharey=True)
        axs = axs.ravel()
        panels = [(df_all, 'All fragments'), (df_sub, 'Subsampled fragments')]
        for ax, (panel_df, title) in zip(axs, panels):
            sns.scatterplot(x=panel_df['log2(fold_change)'],y=panel_df['-Log10(FDR)'],hue = panel_df['color'], palette=['blue','grey','red'], hue_order=['Down','NS','Up'],s=10,linewidth=0,ax=ax)
            ax.set_title(title)
        # Keep a single legend, placed outside the right-hand panel.
        left_legend = axs[0].get_legend()
        if left_legend is not None:
            left_legend.remove()
        axs[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
        fig.savefig(f'{figures}/{lineage}_{ct}_volcanos.png',bbox_inches='tight',dpi=300)
        # Close this specific figure so figures do not pile up across the loop.
        plt.close(fig)

In [None]:
# Endothelial cell types only: three-panel volcano plot comparing the
# hyperoxia differential-accessibility results from all fragments, from
# subsampled fragments, and from the `snapatac2` results folder (labelled
# 'Treatment peaks' in the plot).
# ct_df_dt maps cell type -> {'sub': subsampled results, 'all': all-fragment results}.
# Cell types without all three result files are skipped and get no entry.
ct_df_dt={}
for lineage in ['endothelial']:
    lin_cts = adata_rna[adata_rna.obs['lineage']==lineage].obs['celltype'].cat.categories
    for ct in lin_cts:
        try:
            df_sub = pd.read_csv(f'/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac/snapatac2_subsampled/dap_datf/{lineage}/hyperoxia/{ct}_hyperoxia_dap.csv')
            df_all = pd.read_csv(f'/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac/snapatac2_all_frags/dap_datf/{lineage}/hyperoxia/{ct}_hyperoxia_dap.csv')
            df_ct_treat = pd.read_csv(f'/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac/snapatac2/dap_datf/{lineage}/hyperoxia/{ct}_hyperoxia_dap.csv')
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
            print('NO COMP!!!')
            print(ct)
            print('continuing')
            # Actually skip this cell type. Falling through would either raise
            # NameError or reuse the previous cell type's tables under this name.
            continue
        ct_df_dt[ct] = {'sub': df_sub, 'all': df_all}
        print(ct)
        for df in [df_all, df_sub,df_ct_treat]:
            # Label peaks as Up/Down at FDR <= 0.05 and |log2FC| >= 0.5, else NS.
            conditions = [
                df['adjusted p-value'].le(0.05) & df['log2(fold_change)'].ge(0.5),
                df['adjusted p-value'].le(0.05) & df['log2(fold_change)'].le(-0.5)
            ]
            choices = ['Up','Down']
            df['color'] = np.select(conditions, choices, default='NS')
            df['-Log10(FDR)'] = -np.log10(df['adjusted p-value'])
            # Percentage of peaks in each class.
            print(df['color'].value_counts(normalize=True)*100)
        fig, axs = plt.subplots(1,3,figsize=(6,4),sharey=True)
        axs = axs.ravel()
        panels = [(df_all, 'All'), (df_sub, 'Subsampled'), (df_ct_treat, 'Treatment peaks')]
        for ax, (panel_df, title) in zip(axs, panels):
            sns.scatterplot(x=panel_df['log2(fold_change)'],y=panel_df['-Log10(FDR)'],hue = panel_df['color'], palette=['blue','grey','red'], hue_order=['Down','NS','Up'],s=10,linewidth=0,ax=ax)
            ax.set_title(title)
        # Keep a single legend, placed outside the right-most panel.
        for ax in axs[:-1]:
            panel_legend = ax.get_legend()
            if panel_legend is not None:
                panel_legend.remove()
        axs[-1].legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
        fig.suptitle(ct)
        fig.tight_layout()
        fig.show()


In [None]:
#### Try subsampling with scanpy, starting from the peak matrix
#### (the file name suggests it was built from all fragments).
adata_peak = sc.read("/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/single_cell_files/snapatac2_peak_matrix_all_frags.gz.h5ad")
sc.pp.calculate_qc_metrics(adata_peak, inplace=True)
sc.pl.violin(adata_peak, 'log1p_total_counts', groupby='mouse')
sc.pl.violin(adata_peak, 'n_genes_by_counts', groupby='mouse')
# Downsampling attempt, currently disabled:
# sc.pp.downsample_counts(adata_peak, counts_per_cell=5000,replace=True)
# sc.pp.calculate_qc_metrics(adata_peak, inplace=True)
# sc.pl.violin(adata_peak, 'log1p_total_counts', groupby='treatment')
# sc.pl.violin(adata_peak, 'n_genes_by_counts', groupby='treatment')
# Cell output: per-mouse sum of the per-cell `n_genes_by_counts` QC column.
adata_peak.obs.groupby('mouse')['n_genes_by_counts'].sum()


In [None]:
# Per-mouse sum of the per-cell `n_genes_by_counts` QC column of adata_peak
# (features are presumably peaks here, despite the "genes" in the name - confirm).
adata_peak.obs.groupby('mouse')['n_genes_by_counts'].sum()


In [None]:
# Per-mouse sum of the per-cell `total_counts` QC column of adata_peak.
adata_peak.obs.groupby('mouse')['total_counts'].sum()


In [None]:
import snapatac2 as snap

# Locations used for the differential-accessibility analysis.
da_output = figures
sc_file = "/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/single_cell_files"
genome = "/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/refdata-cellranger-arc-mm10-2020-A-2.0.0/fasta/genome.fa"
peaks_fn = "/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac/snapatac2_all_frags/peaks_df.csv"
peak_md_fn = "/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/figures/atac/snapatac2_all_frags/peak_homer_annotation.txt"
# Columns of interest in the peak annotation table.
peak_md_cols = ["Chr", "Start", "End", "Annotation", "Distance to TSS", "Nearest PromoterID", "Gene Name", "Gene Type"]

print(adata_peak)
peak_md = pd.read_csv(peak_md_fn, sep="\t", index_col=0)
peaks = pd.read_csv(peaks_fn, header=0, index_col=0)

# dap_dt maps cell type -> differential test result (pandas DataFrame) for
# Hyperoxia vs Normoxia cells of that type.
dap_dt = {}
for lineage in sorted(['endothelial']):
    in_lineage = adata_peak.obs["lineage"] == lineage
    lin_adata = adata_peak[in_lineage]
    print(lin_adata)
    for ct in sorted(lin_adata.obs["celltype"].unique()):
        is_ct = lin_adata.obs["celltype"] == ct
        ct_adata = lin_adata[is_ct]
        ct_norm = ct_adata[ct_adata.obs["treatment"] == "Normoxia"]
        ct_hyper = ct_adata[ct_adata.obs["treatment"] == "Hyperoxia"]
        # NOTE(review): ct_peaks is computed but not used below, and diff_test is
        # called without `features`. If the test was meant to be restricted to
        # this cell type's peaks, the selection still needs to be passed on - confirm.
        ct_peaks = peaks[peaks[str(ct)] == True]
        # Require at least 10 cells in each condition.
        enough_cells = len(ct_norm.obs_names) >= 10 and len(ct_hyper.obs_names) >= 10
        if not enough_cells:
            print(ct)
            print("Too few cells")
            continue
        # Hyperoxia cells are group 1 and Normoxia cells group 2.
        diff_df = snap.tl.diff_test(
            adata_peak,
            ct_hyper.obs_names,
            ct_norm.obs_names,
        )
        diff_df_pd = diff_df.to_pandas()
        dap_dt[ct] = diff_df_pd
     

In [None]:
# One volcano plot per cell type tested in dap_dt. The result tables are
# annotated in place with a 'color' class and a '-Log10(FDR)' column.
for ct in sorted(dap_dt):
    print(ct)
    df = dap_dt[ct]

    # Up/Down need FDR <= 0.05 and |log2FC| >= 0.5; everything else is NS.
    is_significant = df['adjusted p-value'] <= 0.05
    log_fc = df['log2(fold_change)']
    conditions = [
        is_significant & (log_fc >= 0.5),
        is_significant & (log_fc <= -0.5),
    ]
    choices = ['Up', 'Down']
    df['color'] = np.select(conditions, choices, default='NS')
    df['-Log10(FDR)'] = -np.log10(df['adjusted p-value'])

    # Percentage of peaks in each class.
    print(df['color'].value_counts(normalize=True).mul(100))

    fig, ax = plt.subplots(1, 1, figsize=(4, 4))
    sns.scatterplot(
        x=df['log2(fold_change)'],
        y=df['-Log10(FDR)'],
        hue=df['color'],
        palette=['blue', 'grey', 'red'],
        hue_order=['Down', 'NS', 'Up'],
        s=10,
        linewidth=0,
        ax=ax,
    )
    # Legend goes outside the plotting area, to the right.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
    fig.suptitle(ct)
    fig.show()

In [None]:
# Load the tile matrix (file name suggests it was built from all fragments)
# and add scanpy's per-cell / per-feature QC columns in place.
tile_matrix_path = "/home/carsten/alvira_bioinformatics/postnatal_lung_multiome/data/single_cell_files/snapatac2_tile_matrix_all_frags.h5ad"
adata_tile = sc.read(tile_matrix_path)
sc.pp.calculate_qc_metrics(adata_tile, inplace=True)


In [None]:
# Per-mouse sum of the per-cell `n_genes_by_counts` QC column of adata_tile
# (features are presumably genomic tiles here - confirm).
adata_tile.obs.groupby('mouse')['n_genes_by_counts'].sum()


In [None]:
# Per-mouse sum of the per-cell `total_counts` QC column of adata_tile.
adata_tile.obs.groupby('mouse')['total_counts'].sum()