In [4]:
import scanpy as sc
import pandas as pd
import random
import numpy as np
import matplotlib as plt
import gc 
import ctypes
import scvi



In [6]:
adata = sc.read_h5ad('../data/luca_query_reannotated.h5ad')

In [7]:
adata.obs

Unnamed: 0,sample,uicc_stage,ever_smoker,age,donor_id,origin,dataset,ann_fine,cell_type_predicted,doublet_status,...,IFN_TAMs,Reg_TAMs,Inflam_TAMs,LA_TAMs,Angio_TAMs,RTM_TAMs,Prolif_TAMs,Subtype,Projection_CellType,ident
001C_AAACCTGTCAACACCA-0,Adams_Kaminski_2020_001C,non-cancer,no,22.0,Adams_Kaminski_2020_001C,normal,Adams_Kaminski_2020,Macrophage alveolar,Macrophage,singlet,...,0.353933,0.571664,0.298638,0.541069,0.427279,0.565822,0.401695,Reg_TAMs,RTM_TAMs,local
001C_AAACGGGAGGCTCATT-0,Adams_Kaminski_2020_001C,non-cancer,no,22.0,Adams_Kaminski_2020_001C,normal,Adams_Kaminski_2020,Macrophage,Macrophage,singlet,...,0.477315,0.451494,0.370148,0.492239,0.537751,0.511095,0.348617,Angio_TAMs,LA_TAMs,local
001C_AAACGGGGTATAATGG-0,Adams_Kaminski_2020_001C,non-cancer,no,22.0,Adams_Kaminski_2020_001C,normal,Adams_Kaminski_2020,Macrophage alveolar,Macrophage,singlet,...,0.387389,0.497444,0.282365,0.403790,0.443018,0.548354,0.328568,RTM_TAMs,RTM_TAMs,local
001C_AAACGGGTCACAAACC-0,Adams_Kaminski_2020_001C,non-cancer,no,22.0,Adams_Kaminski_2020_001C,normal,Adams_Kaminski_2020,Macrophage,Macrophage,singlet,...,0.333177,0.592321,0.262999,0.594631,0.444907,0.450925,0.406192,LA_TAMs,LA_TAMs,local
001C_AAAGATGAGTGCTGCC-0,Adams_Kaminski_2020_001C,non-cancer,no,22.0,Adams_Kaminski_2020_001C,normal,Adams_Kaminski_2020,Macrophage alveolar,Macrophage,singlet,...,0.358754,0.648105,0.282604,0.482257,0.408499,0.454457,0.291259,Reg_TAMs,RTM_TAMs,local
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGCTGTCTA-1-38-8,Leader_Merad_2021_414,II,no,64.0,Leader_Merad_2021_729,tumor_primary,Leader_Merad_2021_10x_3p_v2_beads,Macrophage,,singlet,...,0.551482,0.640531,0.489101,0.623875,0.525708,0.514441,0.527691,Reg_TAMs,IFN_TAMs,local
TTTGTCAAGGATATAC-1-38-8,Leader_Merad_2021_414,II,no,64.0,Leader_Merad_2021_729,tumor_primary,Leader_Merad_2021_10x_3p_v2_beads,Macrophage,,singlet,...,0.492014,0.699073,0.539889,0.612688,0.608443,0.551273,0.460248,Reg_TAMs,Reg_TAMs,local
TTTGTCAAGTACGCGA-1-38-8,Leader_Merad_2021_414,II,no,64.0,Leader_Merad_2021_729,tumor_primary,Leader_Merad_2021_10x_3p_v2_beads,Macrophage,,singlet,...,0.470449,0.588994,0.611164,0.427515,0.471975,0.431026,0.455491,Inflam_TAMs,Inflam_TAMs,local
TTTGTCACATCTATGG-1-38-8,Leader_Merad_2021_414,II,no,64.0,Leader_Merad_2021_729,tumor_primary,Leader_Merad_2021_10x_3p_v2_beads,Macrophage,,singlet,...,0.515349,0.589936,0.633147,0.577088,0.550079,0.487074,0.425062,Inflam_TAMs,LA_TAMs,local


In [8]:
adata.obs['Projection_CellType'].value_counts()


Projection_CellType
RTM_TAMs       94074
LA_TAMs        32428
Reg_TAMs       17933
IFN_TAMs       12215
Inflam_TAMs     6484
Angio_TAMs      5741
Int.Node.3      4221
Prolif_TAMs     3771
Int.Node.4       922
Int.Node.5       150
Name: count, dtype: int64

In [9]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

In [10]:
import gc 
import ctypes

In [6]:
def make_pseudobulk(adata, sample_colname, cell_type_colname, metacols):
    '''
    To create an an data with pseudo-bulk gene expression profiles for the samples and cell_types given.
    The function also carries metadata columns from the obs. metacols: as a list of col names.
    This is how to use:
        ps_adata = make_pseudobulk(hlca, sample_colname = 'sample', cell_type_colname = 'cell_type', metacols= ['patient', 'disease', 'tissue', 'study', 'sex'])
    '''
    ps_list = []
    for s in adata.obs[sample_colname].unique():
        adata_sub = adata[adata.obs[sample_colname] == s]
        
        for ct in adata_sub.obs[cell_type_colname].unique():
            adata_sub_ct = adata_sub[adata_sub.obs[cell_type_colname] == ct]
            
            adata_rep = sc.AnnData(X = adata_sub_ct.X.sum(axis = 0),
                                   var = adata_sub_ct.var[[]])
            samp_name=adata_sub.obs[sample_colname].iloc[0]
            cell= adata_sub_ct.obs[cell_type_colname].iloc[0]
            adata_rep.obs_names = [f"{samp_name}{cell}"]
            adata_rep.obs[cell_type_colname] = adata_sub_ct.obs[cell_type_colname].iloc[0]
            adata_rep.obs[metacols] = adata_sub_ct.obs[metacols].iloc[0]
            ps_list.append(adata_rep)

    ps_adata = sc.concat(ps_list)
    
    return ps_adata


In [None]:
def make_pseudobulk2(adata, sample_colname, cell_type_colname, metacols):
    '''
    Creates an AnnData object with pseudo-bulk gene expression profiles 
    for the samples and cell types specified. Also adds the number of cells 
    used per pseudobulk as "n_cells" in .obs.

    Parameters:
        adata: AnnData
        sample_colname: str
        cell_type_colname: str
        metacols: list of str
    
    Returns:
        AnnData with pseudobulked profiles and metadata.
    '''
    ps_list = []
    for s in adata.obs[sample_colname].unique():
        adata_sub = adata[adata.obs[sample_colname] == s]
        
        for ct in adata_sub.obs[cell_type_colname].unique():
            adata_sub_ct = adata_sub[adata_sub.obs[cell_type_colname] == ct]
            
            # Count how many cells are in this group
            n_cells = adata_sub_ct.shape[0]
            
            adata_rep = sc.AnnData(X=adata_sub_ct.X.sum(axis=0),
                                   var=adata_sub_ct.var[[]])
            
            samp_name = adata_sub.obs[sample_colname].iloc[0]
            cell = adata_sub_ct.obs[cell_type_colname].iloc[0]
            adata_rep.obs_names = [f"{samp_name}_{cell}"]
            
            # Fill in metadata and number of cells
            adata_rep.obs[cell_type_colname] = cell
            adata_rep.obs[metacols] = adata_sub_ct.obs[metacols].iloc[0]
            adata_rep.obs["n_cells"] = n_cells
            
            ps_list.append(adata_rep)

    ps_adata = sc.concat(ps_list)
    ps_adata.obs["n_cells"] = ps_adata.obs["n_cells"].astype(int)
    ps_adata.obs['log2_n_cells'] = np.log2(ps_adata.obs['n_cells'])

    return ps_adata


In [2]:
import scanpy as sc
import numpy as np
import random

def make_pseudobulk3(adata, sample_colname, cell_type_colname, metacols, min_cells=40, max_cells=50, random_state=0):
    '''
    Creates an AnnData object with pseudo-bulk gene expression profiles 
    for the samples and cell types specified. Also adds the number of cells 
    used per pseudobulk as "n_cells" in .obs.

    Parameters:
        adata: AnnData
        sample_colname: str
        cell_type_colname: str
        metacols: list of str
        min_cells: int, minimum number of cells required to include a group
        max_cells: int, maximum number of cells to include per group (randomly sampled if exceeded)
        random_state: int, for reproducibility of random sampling

    Returns:
        AnnData with pseudobulked profiles and metadata.
    '''
    ps_list = []
    rng = np.random.default_rng(random_state)

    for s in adata.obs[sample_colname].unique():
        adata_sub = adata[adata.obs[sample_colname] == s]
        
        for ct in adata_sub.obs[cell_type_colname].unique():
            adata_sub_ct = adata_sub[adata_sub.obs[cell_type_colname] == ct]
            n_cells_total = adata_sub_ct.shape[0]

            # Skip if too few cells
            if n_cells_total < min_cells:
                continue

            # Downsample if too many
            if n_cells_total > max_cells:
                selected_idx = rng.choice(adata_sub_ct.shape[0], size=max_cells, replace=False)
                adata_sub_ct = adata_sub_ct[selected_idx]
            else:
                selected_idx = slice(None)  # Use all

            n_cells = adata_sub_ct.shape[0]

            adata_rep = sc.AnnData(X=adata_sub_ct.X.sum(axis=0),
                                   var=adata_sub_ct.var[[]])
            
            samp_name = adata_sub.obs[sample_colname].iloc[0]
            cell = adata_sub_ct.obs[cell_type_colname].iloc[0]
            adata_rep.obs_names = [f"{samp_name}_{cell}"]
            
            # Fill in metadata and number of cells
            adata_rep.obs[cell_type_colname] = cell
            adata_rep.obs[metacols] = adata_sub_ct.obs[metacols].iloc[0]
            adata_rep.obs["n_cells"] = n_cells
            
            ps_list.append(adata_rep)

    ps_adata = sc.concat(ps_list)
    ps_adata.obs["n_cells"] = ps_adata.obs["n_cells"].astype(int)
    ps_adata.obs['log2_n_cells'] = np.log2(ps_adata.obs['n_cells']).astype(float)

    return ps_adata


In [11]:
import pandas as pd
import scipy.sparse

def savePBdata2(adata, filename):
    # Convert counts to dense matrix if it's sparse
    if scipy.sparse.issparse(adata.X):
        counts = adata.X.toarray()
    else:
        counts = adata.X

    # Convert to DataFrame
    counts_df = pd.DataFrame(counts, index=adata.obs_names, columns=adata.var_names)

    # (Optional) Convert to integer
    counts_df = counts_df.astype(int, errors='ignore')

    # Save AnnData object
    adata.write(filename)
    
    # Save counts as CSV for inspection (optional)
    counts_df.to_csv(filename.replace(".h5ad", "_counts.csv"))

    print(f"Saved AnnData to {filename} and count matrix to CSV.")


In [8]:
ps_adata_macs = make_pseudobulk(adata, sample_colname = 'sample', cell_type_colname = 'Projection_CellType', metacols= ['assay', 'donor_id', 'disease', 'tissue', 'study', 'sex', 'age', 'uicc_stage', 'tumor_stage'])



In [25]:
ps_adata_macs = make_pseudobulk2(adata, sample_colname = 'sample', cell_type_colname = 'Projection_CellType', metacols= ['assay', 'donor_id', 'disease', 'tissue', 'study', 'sex', 'age', 'uicc_stage', 'tumor_stage'])



In [None]:
#Pseudo-bulking with random down sampling once. min_cells=40, max_cells=50
ps_adata_macs = make_pseudobulk3(adata, sample_colname = 'sample', cell_type_colname = 'Projection_CellType', metacols= ['assay', 'donor_id', 'disease', 'tissue', 'study', 'sex', 'age', 'uicc_stage', 'tumor_stage'])

In [26]:
ps_adata_macs.obs


Unnamed: 0,Projection_CellType,assay,donor_id,disease,tissue,study,sex,age,uicc_stage,tumor_stage,n_cells
Adams_Kaminski_2020_001C_RTM_TAMs,RTM_TAMs,10x 3' v2,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,male,22.0,non-cancer,non-cancer,439
Adams_Kaminski_2020_001C_LA_TAMs,LA_TAMs,10x 3' v2,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,male,22.0,non-cancer,non-cancer,81
Adams_Kaminski_2020_001C_Int.Node.3,Int.Node.3,10x 3' v2,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,male,22.0,non-cancer,non-cancer,45
Adams_Kaminski_2020_001C_Prolif_TAMs,Prolif_TAMs,10x 3' v2,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,male,22.0,non-cancer,non-cancer,9
Adams_Kaminski_2020_001C_IFN_TAMs,IFN_TAMs,10x 3' v2,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,male,22.0,non-cancer,non-cancer,10
...,...,...,...,...,...,...,...,...,...,...,...
Leader_Merad_2021_414_LA_TAMs,LA_TAMs,10x 3' v2,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,female,64.0,II,early,180
Leader_Merad_2021_414_Inflam_TAMs,Inflam_TAMs,10x 3' v2,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,female,64.0,II,early,37
Leader_Merad_2021_414_Prolif_TAMs,Prolif_TAMs,10x 3' v2,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,female,64.0,II,early,11
Leader_Merad_2021_414_Int.Node.3,Int.Node.3,10x 3' v2,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,female,64.0,II,early,6


In [32]:
ps_adata_macs.obs['log2_n_cells'] = np.log2(ps_adata_macs.obs['n_cells'])

In [33]:
savePBdata2(ps_adata_macs, filename="../data/ps_adata_macs.h5ad")


Saved AnnData to ../data/ps_adata_macs.h5ad and count matrix to CSV.


In [None]:
with open('../data/Homo_sapiens.GRCh38.104.gtf') as f:
    gtf = list(f)

#prep the gtf file
gtf = [x for x in gtf if not x.startswith('#')]
gtf = [x for x in gtf if 'gene_id "' in x and 'gene_name "' in x]
gtf = list(map(lambda x: (x.split('gene_id "')[1].split('"')[0], x.split('gene_name "')[1].split('"')[0]), gtf))

In [10]:
ctyps = ps_adata_macs.obs['Projection_CellType'][~ps_adata_macs.obs['Projection_CellType'].isin(['Int.Node.3', 'Int.Node.4', 'Int.Node.5'])].unique()
ctyps

array(['RTM_TAMs', 'LA_TAMs', 'Prolif_TAMs', 'IFN_TAMs', 'Reg_TAMs',
       'Angio_TAMs', 'Inflam_TAMs'], dtype=object)

In [34]:
#sub_canc = ps_adata_macs[ps_adata_macs.obs['cell_type'] == 'malignant cell' ]
df = pd.DataFrame()

for c in ctyps:
    sub_c = ps_adata_macs[ps_adata_macs.obs['Projection_CellType'] == c ]
    sub_c.obs['Contrast'] = c
    pbs = [sub_c]
    print(c)
    sub_ct = ps_adata_macs[ps_adata_macs.obs['Projection_CellType'] != c ]
    sub_ct.obs['Contrast'] = 'others'
    pbs.append(sub_ct)
    pb = sc.concat(pbs)
    print(pb)
    counts= pd.DataFrame(pb.X, columns = pb.var_names)
    counts = counts.astype(int, errors='ignore')
    #To get rid off all the 0s in count matrix (to prevent problems)
    counts=counts+1
    dds = DeseqDataSet(counts = counts, metadata=pb.obs, design_factors = ["log2_n_cells", "assay", "Contrast"])
    sc.pp.filter_genes(dds, min_cells = 10)
    dds.deseq2()
    stat_res = DeseqStats(dds, contrast=('Contrast', c, 'others'))
    stat_res.summary()
    de  = stat_res.results_df
    de = de.sort_values('log2FoldChange', ascending = False)
        
    #get rownames aka gene symbols
    row_names = de.index.tolist()
    #row_names
    #add rownames as a new column 
    de['gene_symbols'] = row_names
    gtf=dict(gtf)
    de['gene_name'] = de['gene_symbols'].map(gtf)
    de = de[(de['padj'] < 0.05) & (de['log2FoldChange'] > 1.0) ]
    de['SubType'] = c
    de['inContrastTo'] = 'others'
    df = pd.concat([df, de])
    #save deg list
    df.to_csv('../results/macs_PB-DEGs.csv', index=True)

  sub_c.obs['Contrast'] = c
  sub_ct.obs['Contrast'] = 'others'


RTM_TAMs
AnnData object with n_obs × n_vars = 2890 × 17811
    obs: 'Projection_CellType', 'assay', 'donor_id', 'disease', 'tissue', 'study', 'sex', 'age', 'uicc_stage', 'tumor_stage', 'n_cells', 'log2_n_cells', 'Contrast'


  dds = DeseqDataSet(counts = counts, metadata=pb.obs, design_factors = ["log2_n_cells", "assay", "Contrast"])
Fitting size factors...


Using None as control genes, passed at DeseqDataSet initialization


... done in 2.55 seconds.

  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
 

Log2 fold change & Wald test p-value: Contrast RTM_TAMs vs others
                  baseMean  log2FoldChange     lfcSE       stat        pvalue  \
ENSG00000121410   3.770136       -0.322875  0.054772  -5.894878  3.749576e-09   
ENSG00000268895   1.432926       -0.037837  0.064049  -0.590756  5.546837e-01   
ENSG00000175899  14.883531       -1.896052  0.090717 -20.900799  5.265583e-97   
ENSG00000245105   1.178039       -0.310995  0.071507  -4.349178  1.366490e-05   
ENSG00000166535   1.172041       -0.316945  0.075616  -4.191507  2.771079e-05   
...                    ...             ...       ...        ...           ...   
ENSG00000070476   2.052263        0.041787  0.063285   0.660305  5.090581e-01   
ENSG00000203995   1.158362       -0.270445  0.075837  -3.566152  3.622616e-04   
ENSG00000162378   3.316611        0.185820  0.041123   4.518625  6.224258e-06   
ENSG00000159840  13.447196       -0.015167  0.049345  -0.307372  7.585603e-01   
ENSG00000074755   3.159791       -0.255521 

  sub_ct.obs['Contrast'] = 'others'


AnnData object with n_obs × n_vars = 2890 × 17811
    obs: 'Projection_CellType', 'assay', 'donor_id', 'disease', 'tissue', 'study', 'sex', 'age', 'uicc_stage', 'tumor_stage', 'n_cells', 'log2_n_cells', 'Contrast'


  dds = DeseqDataSet(counts = counts, metadata=pb.obs, design_factors = ["log2_n_cells", "assay", "Contrast"])
Fitting size factors...


Using None as control genes, passed at DeseqDataSet initialization


... done in 2.64 seconds.

Fitting dispersions...
... done in 6.31 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend(vst)
... done in 0.28 seconds.

Fitting MAP dispersions...
... done in 6.08 seconds.

Fitting LFCs...
... done in 7.35 seconds.

Calculating cook's distance...
... done in 3.84 seconds.

Replacing 48 outlier genes.

Fitting dispersions...
... done in 0.19 seconds.

Fitting MAP dispersions...
... done in 0.10 seconds.

Fitting LFCs...
... done in 0.12 seconds.

Running Wald tests...
... done in 5.58 seconds.

  sub_c.obs['Contrast'] = c
  sub_ct.obs['Contrast'] = 'others'


Log2 fold change & Wald test p-value: Contrast LA_TAMs vs others
                  baseMean  log2FoldChange     lfcSE       stat        pvalue  \
ENSG00000121410   3.770136        0.262113  0.049114   5.336829  9.458622e-08   
ENSG00000268895   1.432926       -0.005460  0.062478  -0.087399  9.303544e-01   
ENSG00000175899  14.883531        1.075426  0.075261  14.289295  2.551780e-46   
ENSG00000245105   1.178039        0.004975  0.070174   0.070899  9.434778e-01   
ENSG00000166535   1.172041        0.098219  0.071849   1.367013  1.716211e-01   
...                    ...             ...       ...        ...           ...   
ENSG00000070476   2.052263       -0.109854  0.060511  -1.815446  6.945539e-02   
ENSG00000203995   1.158362        0.066100  0.072968   0.905876  3.650016e-01   
ENSG00000162378   3.316611        0.066507  0.039419   1.687182  9.156830e-02   
ENSG00000159840  13.447196        0.244855  0.043864   5.582199  2.374967e-08   
ENSG00000074755   3.159791        0.121634  

  dds = DeseqDataSet(counts = counts, metadata=pb.obs, design_factors = ["log2_n_cells", "assay", "Contrast"])
Fitting size factors...


Using None as control genes, passed at DeseqDataSet initialization


... done in 2.57 seconds.

Fitting dispersions...
... done in 6.68 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend(vst)
... done in 0.28 seconds.

Fitting MAP dispersions...
... done in 5.26 seconds.

Fitting LFCs...
... done in 7.71 seconds.

Calculating cook's distance...
... done in 3.89 seconds.

Replacing 53 outlier genes.

Fitting dispersions...
... done in 0.23 seconds.

Fitting MAP dispersions...
... done in 0.13 seconds.

Fitting LFCs...
... done in 0.13 seconds.

Running Wald tests...
... done in 5.09 seconds.

  sub_c.obs['Contrast'] = c
  sub_ct.obs['Contrast'] = 'others'


Log2 fold change & Wald test p-value: Contrast Prolif_TAMs vs others
                  baseMean  log2FoldChange     lfcSE      stat        pvalue  \
ENSG00000121410   3.770136       -0.302497  0.073322 -4.125571  3.698153e-05   
ENSG00000268895   1.432926       -0.032602  0.086369 -0.377472  7.058230e-01   
ENSG00000175899  14.883531       -0.928316  0.097018 -9.568474  1.084948e-21   
ENSG00000245105   1.178039        0.097582  0.085165  1.145807  2.518749e-01   
ENSG00000166535   1.172041        0.015589  0.088232  0.176685  8.597559e-01   
...                    ...             ...       ...       ...           ...   
ENSG00000070476   2.052263       -0.142195  0.084886 -1.675126  9.390955e-02   
ENSG00000203995   1.158362        0.041296  0.088161  0.468413  6.394890e-01   
ENSG00000162378   3.316611       -0.033230  0.064213 -0.517491  6.048136e-01   
ENSG00000159840  13.447196       -0.381460  0.059274 -6.435493  1.230730e-10   
ENSG00000074755   3.159791       -0.280118  0.07792

  dds = DeseqDataSet(counts = counts, metadata=pb.obs, design_factors = ["log2_n_cells", "assay", "Contrast"])
Fitting size factors...


Using None as control genes, passed at DeseqDataSet initialization


... done in 2.50 seconds.

Fitting dispersions...
... done in 6.17 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend(vst)
... done in 0.28 seconds.

Fitting MAP dispersions...
... done in 5.49 seconds.

Fitting LFCs...
... done in 7.91 seconds.

Calculating cook's distance...
... done in 3.69 seconds.

Replacing 51 outlier genes.

Fitting dispersions...
... done in 0.20 seconds.

Fitting MAP dispersions...
... done in 0.09 seconds.

Fitting LFCs...
... done in 0.11 seconds.

Running Wald tests...
... done in 5.71 seconds.

  sub_c.obs['Contrast'] = c
  sub_ct.obs['Contrast'] = 'others'


Log2 fold change & Wald test p-value: Contrast IFN_TAMs vs others
                  baseMean  log2FoldChange     lfcSE      stat    pvalue  \
ENSG00000121410   3.770136       -0.086670  0.058476 -1.482135  0.138304   
ENSG00000268895   1.432926        0.043502  0.071766  0.606164  0.544406   
ENSG00000175899  14.883531        0.065197  0.083232  0.783318  0.433440   
ENSG00000245105   1.178039        0.101637  0.077058  1.318962  0.187182   
ENSG00000166535   1.172041       -0.006061  0.081135 -0.074698  0.940455   
...                    ...             ...       ...       ...       ...   
ENSG00000070476   2.052263       -0.101483  0.069574 -1.458625  0.144668   
ENSG00000203995   1.158362        0.032427  0.080889  0.400888  0.688502   
ENSG00000162378   3.316611       -0.168360  0.050121 -3.359084  0.000782   
ENSG00000159840  13.447196        0.198594  0.047908  4.145339  0.000034   
ENSG00000074755   3.159791       -0.078212  0.062157 -1.258300  0.208283   

                     

  dds = DeseqDataSet(counts = counts, metadata=pb.obs, design_factors = ["log2_n_cells", "assay", "Contrast"])
Fitting size factors...


Using None as control genes, passed at DeseqDataSet initialization


... done in 2.45 seconds.

Fitting dispersions...
... done in 6.42 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend(vst)
... done in 0.28 seconds.

Fitting MAP dispersions...
... done in 5.73 seconds.

Fitting LFCs...
... done in 8.01 seconds.

Calculating cook's distance...
... done in 3.83 seconds.

Replacing 45 outlier genes.

Fitting dispersions...
... done in 0.19 seconds.

Fitting MAP dispersions...
... done in 0.11 seconds.

Fitting LFCs...
... done in 0.13 seconds.

Running Wald tests...
... done in 6.07 seconds.

  sub_c.obs['Contrast'] = c
  sub_ct.obs['Contrast'] = 'others'


Log2 fold change & Wald test p-value: Contrast Reg_TAMs vs others
                  baseMean  log2FoldChange     lfcSE       stat        pvalue  \
ENSG00000121410   3.770136        0.155393  0.051802   2.999733  2.702165e-03   
ENSG00000268895   1.432926        0.162255  0.064647   2.509841  1.207855e-02   
ENSG00000175899  14.883531        1.180946  0.073928  15.974241  1.931923e-57   
ENSG00000245105   1.178039        0.285879  0.070162   4.074574  4.609873e-05   
ENSG00000166535   1.172041        0.149298  0.075090   1.988254  4.678362e-02   
...                    ...             ...       ...        ...           ...   
ENSG00000070476   2.052263       -0.051523  0.062700  -0.821739  4.112252e-01   
ENSG00000203995   1.158362        0.122545  0.076160   1.609043  1.076069e-01   
ENSG00000162378   3.316611       -0.033940  0.042926  -0.790670  4.291366e-01   
ENSG00000159840  13.447196        0.001214  0.045135   0.026894  9.785444e-01   
ENSG00000074755   3.159791        0.079510 

  dds = DeseqDataSet(counts = counts, metadata=pb.obs, design_factors = ["log2_n_cells", "assay", "Contrast"])
Fitting size factors...


Using None as control genes, passed at DeseqDataSet initialization


... done in 2.61 seconds.

Fitting dispersions...
... done in 6.50 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend(vst)
... done in 0.28 seconds.

Fitting MAP dispersions...
... done in 5.63 seconds.

Fitting LFCs...
... done in 8.37 seconds.

Calculating cook's distance...
... done in 3.84 seconds.

Replacing 51 outlier genes.

Fitting dispersions...
... done in 0.23 seconds.

Fitting MAP dispersions...
... done in 0.11 seconds.

Fitting LFCs...
... done in 0.12 seconds.

Running Wald tests...
... done in 5.13 seconds.

  sub_c.obs['Contrast'] = c
  sub_ct.obs['Contrast'] = 'others'


Log2 fold change & Wald test p-value: Contrast Angio_TAMs vs others
                  baseMean  log2FoldChange     lfcSE      stat        pvalue  \
ENSG00000121410   3.770136        0.201158  0.066388  3.030047  2.445153e-03   
ENSG00000268895   1.432926       -0.055674  0.085634 -0.650131  5.156076e-01   
ENSG00000175899  14.883531       -0.914673  0.098132 -9.320847  1.154153e-20   
ENSG00000245105   1.178039       -0.030196  0.087595 -0.344718  7.303061e-01   
ENSG00000166535   1.172041        0.019196  0.088254  0.217503  8.278160e-01   
...                    ...             ...       ...       ...           ...   
ENSG00000070476   2.052263       -0.064922  0.082475 -0.787176  4.311791e-01   
ENSG00000203995   1.158362        0.020160  0.088741  0.227179  8.202844e-01   
ENSG00000162378   3.316611       -0.122263  0.062894 -1.943947  5.190185e-02   
ENSG00000159840  13.447196       -0.047649  0.057162 -0.833580  4.045175e-01   
ENSG00000074755   3.159791       -0.033402  0.073553

  dds = DeseqDataSet(counts = counts, metadata=pb.obs, design_factors = ["log2_n_cells", "assay", "Contrast"])
Fitting size factors...


Using None as control genes, passed at DeseqDataSet initialization


... done in 2.56 seconds.

Fitting dispersions...
... done in 6.31 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend(vst)
... done in 0.28 seconds.

Fitting MAP dispersions...
... done in 5.36 seconds.

Fitting LFCs...
... done in 7.77 seconds.

Calculating cook's distance...
... done in 3.87 seconds.

Replacing 52 outlier genes.

Fitting dispersions...
... done in 0.25 seconds.

Fitting MAP dispersions...
... done in 0.12 seconds.

Fitting LFCs...
... done in 0.13 seconds.

Running Wald tests...


Log2 fold change & Wald test p-value: Contrast Inflam_TAMs vs others
                  baseMean  log2FoldChange     lfcSE      stat    pvalue  \
ENSG00000121410   3.770136        0.087627  0.066526  1.317181  0.187778   
ENSG00000268895   1.432926        0.026944  0.082891  0.325058  0.745137   
ENSG00000175899  14.883531       -0.179597  0.094438 -1.901748  0.057204   
ENSG00000245105   1.178039        0.015466  0.086999  0.177769  0.858905   
ENSG00000166535   1.172041       -0.007850  0.089000 -0.088201  0.929717   
...                    ...             ...       ...       ...       ...   
ENSG00000070476   2.052263       -0.111297  0.081628 -1.363461  0.172737   
ENSG00000203995   1.158362       -0.022462  0.089855 -0.249979  0.802604   
ENSG00000162378   3.316611       -0.143782  0.060756 -2.366571  0.017954   
ENSG00000159840  13.447196        0.030202  0.056292  0.536520  0.591599   
ENSG00000074755   3.159791       -0.114879  0.073231 -1.568730  0.116711   

                  

... done in 5.44 seconds.

