In [None]:
'''
Goal: Pilot pseudobulk analysis for uterus snRNA-seq
Author:Carsten Knutsen
Date:231011
conda_env:pseudobulk
Notes: Adapted from decoupler tutorial https://decoupler-py.readthedocs.io/en/latest/notebooks/pseudobulk.html
'''

In [None]:
import scanpy as sc
import decoupler as dc

# Only needed for processing
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

# Needed for some plotting
import matplotlib.pyplot as plt
import os
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

# Directory that receives this notebook's result files; created (with parents) if it is missing.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
output = '/home/carsten/alvira_bioinformatics/uterus/data/pilot/231011_pseudobulk_pilot'
os.makedirs(output, exist_ok=True)

In [None]:
# Load the processed, cell-typed single-cell object.
adata = sc.read('/home/carsten/alvira_bioinformatics/uterus/data/single_cell_files/scanpy_files/uterus_processed_celltyped.gz.h5ad')

# Keep only cells that have a 'Cell Subtype' label. Subsetting returns a view,
# so take an explicit copy: the .obs assignments below then modify a standalone
# object instead of relying on an implicit view-to-copy conversion (and warning).
adata = adata[~adata.obs['Cell Subtype'].isna()].copy()

# Missing contractility values get the explicit label 'ND'; the category has to
# be registered first because the column is categorical.
adata.obs['Contractility'] = adata.obs['Contractility'].cat.add_categories(['ND']).fillna("ND")

# Combined '<Group>-<Contractility>' label (e.g. 'TNL-GC') used to define the
# comparison groups later in the notebook.
adata.obs['GroupContract'] = adata.obs['Group'].astype('str') + '-' + adata.obs['Contractility'].astype('str')

In [None]:
# Dot plot of three genes in Artery cells, split by the combined
# group/contractility label.
artery_cells = adata[adata.obs['Cell Subtype'] == 'Artery']
genes_to_show = ['ECHDC2', 'ID3', 'CXCL2']
sc.pl.dotplot(
    artery_cells,
    genes_to_show,
    use_raw=False,
    groupby='GroupContract',
)

In [None]:
# Show the summary representation of the filtered object as the cell output.
adata

In [None]:
# Number of cells contributed by each patient within every cell subtype.
obs_by_subtype = adata.obs.groupby('Cell Subtype')
obs_by_subtype['Patient'].value_counts()

In [None]:
# Aggregate single cells into one pseudobulk profile per (Patient, Cell Subtype)
# pair, using the 'soupx' layer. Both thresholds are set to 0 here.
pseudobulk_settings = dict(
    sample_col='Patient',
    groups_col='Cell Subtype',
    layer='soupx',
    mode='sum',
    min_cells=0,
    min_counts=0,
)
pdata = dc.get_pseudobulk(adata, **pseudobulk_settings)
pdata

In [None]:
# Visualise the pseudobulk profiles with decoupler's sample plot, once grouped
# by 'Patient' and once by 'Cell Subtype'.
dc.plot_psbulk_samples(pdata, groupby=['Patient', 'Cell Subtype'], figsize=(11, 3))

In [None]:
# Work on a copy so that `pdata` itself is left untouched by the in-place
# transformations below (it is used again for the differential-expression step).
pp_adata = pdata.copy()
# Normalise every profile to a total of 1e6, then apply log1p.
sc.pp.normalize_total(pp_adata, target_sum=1e6)
sc.pp.log1p(pp_adata)
# Scale the features, clipping values at 10.
sc.pp.scale(pp_adata, max_value=10)
# 10-component PCA, followed by a neighbour graph and a UMAP embedding for plotting.
sc.tl.pca(pp_adata, n_comps=10)
sc.pp.neighbors(pp_adata)
sc.tl.umap(pp_adata, min_dist=0.1)

In [None]:
# Draw the PCA and then the UMAP embedding of the pseudobulk profiles, each
# coloured by lineage and by cell subtype.
embedding_plot_options = dict(
    color=['Lineage', 'Cell Subtype'],
    ncols=2,
    show=True,
    size=300,
)
for plot_embedding in (sc.pl.pca, sc.pl.umap):
    plot_embedding(pp_adata, **embedding_plot_options)


In [None]:
# Mirror the cell-subtype labels under a column name without a space
# (presumably required by the association test — confirm).
pp_adata.obs['celltype'] = pp_adata.obs['Cell Subtype']

# Metadata columns whose association with the principal components is tested.
metadata_keys = [
    'Group',
    'Contractility',
    'Term',
    'Labor',
    'Lineage',
    'celltype',
    'psbulk_n_cells',
    'psbulk_counts',
]
dc.get_metadata_associations(
    pp_adata,
    obs_keys=metadata_keys,
    obsm_key='X_pca',     # where the PCs are stored
    uns_key='pca_anova',  # where the results are stored
    inplace=True,
)

In [None]:
# plt.figure(figsize=(7,10))
# ax, legend_axes = dc.plot_associations(
#     pp_adata,
#     uns_key='pca_anova',  # summary statistics from the anova tests
#     obsm_key='X_pca',  # where the PCs are stored
#     stat_col='p_adj',  # which summary statistic to plot
#     obs_annotation_cols = ['Cell Subtype'], # which sample annotations to plot
#     titles=['Adjusted p-values from ANOVA', 'Principal component scores']
# )
# plt.show()

In [None]:
'''
Term_Non-Laboring_Good_Contractility vs. Term_Non-Laboring_Bad_Contractility
Term_Non-Laboring_Good_Contractility vs. Preterm_Non-Laboring

Term_Non-Laboring_Good_Contractility vs. Term_Laboring
'''

In [None]:
# Pairwise differential expression (PyDESeq2) per comparison and per cell subtype.
#
# Each entry maps a comparison label to [reference level, tested level] of the
# 'GroupContract' column. Fold changes are reported as tested vs. reference,
# i.e. TNL-GC relative to the other group, matching the comparison labels.
comparison_dictionary = {'TNL_GC_v_TNL_BC': ['TNL-BC', 'TNL-GC'],
                         'TNL_GC_v_PNL_ND': ['PNL-ND', 'TNL-GC'],
                         'TNL_GC_v_TL_ND': ['TL-ND', 'TNL-GC'],
                         }

# Result container: comp_dict[comparison label][cell subtype] -> DESeq2 results
# DataFrame sorted by p-value.
comp_dict = {}
for key, comp_list in comparison_dictionary.items():
    reference, tested = comp_list
    print(comp_list)
    # Pseudobulk profiles belonging to either group of this comparison.
    compare_pseudo = pdata[pdata.obs['GroupContract'].isin(comp_list)]
    ct_dict = {}
    for ct in adata.obs['Cell Subtype'].unique():
        ct_adata = compare_pseudo[compare_pseudo.obs['Cell Subtype'] == ct]

        # A contrast needs profiles from both groups; skip cell subtypes where
        # one (or both) is absent instead of failing inside DESeq2.
        missing_groups = {reference, tested} - set(ct_adata.obs['GroupContract'].astype(str))
        if missing_groups:
            print(ct)
            print(f'MISSING GROUP(S): {sorted(missing_groups)}')
            continue

        # Keep only genes that pass the expression filter in this subset.
        genes = dc.filter_by_expr(ct_adata, group='GroupContract', min_count=10, min_total_count=15)
        if len(genes) < 100:
            print(ct)
            print('NOT ENOUGH GENES')
            continue
        ct_adata = ct_adata[:, genes].copy()
        print(f'{key} | {ct}: {ct_adata.n_obs} samples x {ct_adata.n_vars} genes')

        dds = DeseqDataSet(
            adata=ct_adata,
            design_factors='GroupContract',
            ref_level=['GroupContract', reference],
            refit_cooks=True,
            n_cpus=8,
        )
        dds.deseq2()

        # Contrast order is [factor, tested, reference]. It must agree with
        # `ref_level` above and with the shrinkage coefficient below; otherwise
        # the Wald statistic and the shrunken log2 fold change end up with
        # opposite signs in the results table.
        contrast = ['GroupContract', tested, reference]
        coeff = f'GroupContract_{tested}_vs_{reference}'
        stat_res = DeseqStats(dds, contrast=contrast, n_cpus=8)
        stat_res.summary()
        stat_res.lfc_shrink(coeff=coeff)
        results_df = stat_res.results_df
        ct_dict[ct] = results_df.sort_values('pvalue')
    comp_dict[key] = ct_dict

In [None]:

# Write one workbook per comparison, with one sheet per cell subtype.
# Excel rejects sheet names that contain any of []:*?/\ or exceed 31
# characters, so offending characters are replaced with '_' before truncating.
invalid_sheet_chars = str.maketrans({char: '_' for char in '[]:*?/\\'})
for key, ct_dict in comp_dict.items():
    with pd.ExcelWriter(
                f"{output}/{key}_pseudobulk_comparisons.xlsx", engine="xlsxwriter"
        ) as writer:
        for key2 in sorted(ct_dict.keys()):
            ct_df = ct_dict[key2]
            sheet_name = f"{key2}".translate(invalid_sheet_chars)[:31]
            ct_df.to_excel(writer, sheet_name=sheet_name)

In [None]:

# Dot plot of up to ten Artery genes with adjusted p-value below 0.1 in the
# TNL-GC vs TNL-BC comparison (the stored results are ordered by p-value).
artery_results = comp_dict['TNL_GC_v_TNL_BC']['Artery']
passes_padj = artery_results['padj'] < 0.1
top_artery_genes = artery_results.loc[passes_padj].head(10).index.tolist()
artery_subset = adata[adata.obs['Cell Subtype'] == 'Artery']
sc.pl.dotplot(
    artery_subset,
    top_artery_genes,
    use_raw=False,
    groupby='GroupContract',
)