In [None]:
import os
import sys

import matplotlib.pyplot as plt
import scanpy as sc

sys.path.append('..')
from constants import BASE_PATH_DATA, BASE_PATH_EXPERIMENTS
from preprocess_pbmc_helper import preprocess_dataset

In [None]:
# Global matplotlib style for the figures produced in this notebook.
# `plt` is matplotlib.pyplot, imported in the imports cell at the top; without
# that import this cell raises NameError on a fresh kernel.
plt.rcParams.update(
    {
        # Type 42 (TrueType) fonts keep text editable in exported PDFs.
        "pdf.fonttype": 42,
        "font.family": "sans-serif",
        "font.sans-serif": "Arial",
        "font.size": 10,
    }
)

In [None]:
def load_pbmc_data():
    """Load the raw PBMC CITE-seq dataset and annotate gene categories.

    Reads ``raw_data/pbmc_citeseq.h5ad`` below ``BASE_PATH_DATA``, replaces the
    object by the content of its ``.raw`` slot, uses the ``_index`` column of
    ``var`` as gene names and renames the ``var`` columns to ``['gene_names']``.
    Boolean columns ``mt``, ``ribo`` and ``hb`` are added to ``var`` when they
    are not present yet.

    Returns:
        The AnnData object built from ``.raw`` with the gene annotations above.
    """
    raw_path = os.path.join(BASE_PATH_DATA, 'raw_data/pbmc_citeseq.h5ad')
    adata = sc.read_h5ad(raw_path).raw.to_adata()

    # Gene symbols are stored in the '_index' column; promote them to var_names.
    adata.var_names = adata.var['_index']
    adata.var_names.name = None
    # NOTE(review): assumes `var` holds exactly one column at this point -- confirm.
    adata.var.columns = ['gene_names']

    # (column name, mask builder) pairs; a mask is only computed if the column is missing.
    gene_flags = (
        # mitochondrial genes
        ("mt", lambda names: names.str.startswith("MT-")),
        # ribosomal genes
        ("ribo", lambda names: names.str.startswith(("RPS", "RPL"))),
        # hemoglobin genes
        ("hb", lambda names: names.str.contains(("^HB[^(P)]"))),
    )
    for column, build_mask in gene_flags:
        if column not in adata.var:
            adata.var[column] = build_mask(adata.var_names)

    return adata

In [None]:
# Load the PBMC dataset via the helper defined above (reads the .h5ad file from disk).
adata = load_pbmc_data()

We first preprocess the entire dataset. Then, for each `celltype.l3` group, we compute the mean dispersion of the genes. 

In [None]:
# Preprocess the full dataset with the project helper. `adata` is overwritten
# with the helper's return value.
# NOTE(review): show=True presumably makes the helper display its QC plots --
# confirm in preprocess_pbmc_helper.
adata = preprocess_dataset(
    adata,
    params_cell_filtering={
        "mad_tot_cnt": 5,
        "mad_ngenes_cnt": 5,
        "nr_top_genes": 20,
        "mad_pct_cnt_top_genes": 5,
        "mad_pct_mt": 5,
        "min_pct_mt": 9,
    },
    show=True,
)

In [None]:
# Re-apply the figure style used at the top of the notebook.
# NOTE(review): presumably repeated because preprocessing/scanpy plotting may
# reset rcParams -- confirm; otherwise this cell is redundant.
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = "Arial"
plt.rcParams["font.size"] = 10

Compute the UMAP embedding and color it by the celltype level 3 annotations. 

In [None]:
# Embed the cells: PCA -> neighbourhood graph -> UMAP (scanpy defaults throughout).
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)

# return_fig=True hands back the figure so it can be written to disk below.
fig = sc.pl.umap(
    adata,
    color="celltype.l3",
    legend_fontsize=10,
    title="PBMC dataset with celltype level 3 annotations",
    return_fig=True,
)

umap_plot_path = os.path.join(
    BASE_PATH_EXPERIMENTS,
    "control_genes_selection/mean_var_per_gene_scores/plots/umap_pbmc_data.pdf",
)
# Create the output folder first; savefig fails if the directory does not exist.
os.makedirs(os.path.dirname(umap_plot_path), exist_ok=True)
fig.savefig(umap_plot_path, format="pdf")

Look at the number of cells and compute for each subtype the dispersion of the data.

In [None]:
from signaturescoring.utils.utils import get_mean_and_variance_gene_expression

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Number of cells in the preprocessed dataset (rows of adata.obs); used below
# to express each cell group's size as a fraction of the whole dataset.
tot_nr_cells = adata.obs.shape[0]
tot_nr_cells

In [None]:
# Minimum fraction of all cells a celltype.l3 group must contain to be analysed.
MIN_RATIO_CELLS = 0.01

# One record per sufficiently large celltype.l3 group.
cell_group_stats = []
# observed=True: only iterate over groups that actually contain cells. Empty
# categories would fail the size threshold anyway.
for group_name, group_data in adata.obs.groupby('celltype.l3', observed=True):
    ratio_cells = len(group_data) / tot_nr_cells
    if ratio_cells < MIN_RATIO_CELLS:
        continue

    # Mean/variance statistics restricted to the cells of this group.
    df = get_mean_and_variance_gene_expression(adata[group_data.index, :])
    # Dispersion = variance / mean; entries with zero mean and variance become
    # NaN and are ignored by the nan-aware reductions below.
    dispersion = df['var'].divide(df['mean'])

    cell_group_stats.append({
        'cell_group': group_name,
        'nr_cells': len(group_data),
        'ratio_cells': ratio_cells,
        'mean_dispersion': np.nanmean(dispersion),
        # Was np.nanmean (copy-paste bug): the maximum must use nanmax.
        'max_dispersion': np.nanmax(dispersion),
    })
        
        

In [None]:
# Turn the per-group records into a table ordered by ascending mean dispersion.
cell_group_stats = pd.DataFrame(cell_group_stats)
cell_group_stats = cell_group_stats.sort_values(by='mean_dispersion')

In [None]:
# List the cell groups that passed the size threshold.
cell_group_stats['cell_group'].unique()

Inspect the preprocessing for four example subtypes.

In [None]:
def load_and_preprocess_subtype(subtype, show=True):
    """Load the PBMC data, keep one ``celltype.l3`` subtype and preprocess it.

    The dataset is re-read through ``load_pbmc_data`` on every call, subset to
    the cells whose ``celltype.l3`` equals ``subtype`` and passed to
    ``preprocess_dataset`` with the same cell-filtering parameters used for the
    full dataset.

    Args:
        subtype: Value of ``adata.obs['celltype.l3']`` selecting the cells to keep.
        show: Forwarded unchanged to ``preprocess_dataset``.

    Returns:
        Whatever ``preprocess_dataset`` returns for the subtype subset.
    """
    # Load the full dataset and cut it down to the requested subtype.
    full_adata = load_pbmc_data()
    is_subtype = full_adata.obs['celltype.l3'] == subtype
    subtype_adata = full_adata[is_subtype, :].copy()

    # Cell-filtering thresholds handed to the preprocessing helper.
    cell_filtering = {
        "mad_tot_cnt": 5,
        "mad_ngenes_cnt": 5,
        "nr_top_genes": 20,
        "mad_pct_cnt_top_genes": 5,
        "mad_pct_mt": 5,
        "min_pct_mt": 9,
    }
    return preprocess_dataset(
        subtype_adata,
        params_cell_filtering=cell_filtering,
        show=show,
    )

In [None]:
# Run the per-subtype preprocessing for a few hand-picked subtypes. The returned
# objects are discarded; only the helper's displayed output is of interest here.
for subtype in ('CD8 TEM_1', 'MAIT', 'NK_2', 'Platelet'):
    load_and_preprocess_subtype(subtype)