In [1]:
import warnings
import os
from pathlib import Path 
import pandas as pd
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
import anndata
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import decoupler as dc
import itertools
from scipy import sparse
#import lib.data_helpers as dh

#import lib.scanpy_helpers as sh
#import lib.pl.util as pu

  numba.core.entrypoints.init_all()


In [2]:
path = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/011_analysis_paired_remove_xy/pseudobulk/"
resDir_figures = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/figures/LUAD_DE/"
resDir_tables = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/"
input_path = f"{path}/paired_adata_clean.h5ad"

In [3]:
#adata = sc.read_h5ad(input_path) 

In [4]:
adata = sc.read_h5ad("/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/old/pseudobulk/paired_adata_clean_luad_all_cell_type.h5ad")

In [5]:
# Define the cell types to keep
selected_cell_types = {
    'macrophage',
    'CD4+ T cell',
    'CD8+ T cell',
    'B cell',
    'malignant cell',
    'regulatory T cell',
    'epithelial cell',
    'neutrophil',
    'dendritic cell',
    'plasma cell',
    'endothelial cell',
    'fibroblast'
}

# Mapping for synonymous cell types in adata
cell_type_mapping = {
    'fibroblast of lung': 'fibroblast',
    'vein endothelial cell': 'endothelial cell',
    'capillary endothelial cell': 'endothelial cell',
    'pulmonary artery endothelial cell': 'endothelial cell',
    'endothelial cell of lymphatic vessel': 'endothelial cell',
    'conventional dendritic cell': 'dendritic cell',
    'CD1c-positive myeloid dendritic cell': 'dendritic cell',
    'plasmacytoid dendritic cell': 'dendritic cell',
    'bronchus fibroblast of lung': 'fibroblast'
}

# Standardize cell types based on mapping
adata.obs['standardized_cell_type'] = adata.obs['cell_type'].replace(cell_type_mapping)

# Filter adata to keep only selected cell types
adata = adata[adata.obs['standardized_cell_type'].isin(selected_cell_types)].copy()

# Verify result
print(adata.obs['standardized_cell_type'].value_counts())


standardized_cell_type
macrophage           104271
CD4+ T cell           94503
CD8+ T cell           72603
epithelial cell       21039
malignant cell        16939
B cell                16853
regulatory T cell     16785
dendritic cell        16702
endothelial cell      16150
plasma cell           15096
neutrophil             6146
fibroblast             4674
Name: count, dtype: int64


## There are malignant cells in normal_adjacent samples
Comming from 59 different patients and 14 datasets
I cannot give an explanation for this so i remove this "mislalbeled" cells 

In [6]:
grouped = adata.obs.groupby(['standardized_cell_type', 'origin']).size().reset_index(name='cell_count')

In [7]:
grouped

Unnamed: 0,standardized_cell_type,origin,cell_count
0,epithelial cell,normal_adjacent,4957
1,epithelial cell,tumor_primary,16082
2,macrophage,normal_adjacent,65453
3,macrophage,tumor_primary,38818
4,B cell,normal_adjacent,2888
5,B cell,tumor_primary,13965
6,dendritic cell,normal_adjacent,6752
7,dendritic cell,tumor_primary,9950
8,CD4+ T cell,normal_adjacent,45654
9,CD4+ T cell,tumor_primary,48849


In [8]:
# Create a Boolean mask for the cells that you want to keep
mask = ~((adata.obs['origin'] == 'normal_adjacent') & (adata.obs['standardized_cell_type'] == 'malignant cell'))

# Subset the adata object using the mask to exclude the specific cells
adata = adata[mask].copy()

# Now `adata_filtered` contains all cells except those with origin == 'normal_adjacent' and cell_type == 'malignant cell'


In [9]:
grouped = adata.obs.groupby(['standardized_cell_type', 'origin']).size().reset_index(name='cell_count')
grouped

Unnamed: 0,standardized_cell_type,origin,cell_count
0,epithelial cell,normal_adjacent,4957
1,epithelial cell,tumor_primary,16082
2,macrophage,normal_adjacent,65453
3,macrophage,tumor_primary,38818
4,B cell,normal_adjacent,2888
5,B cell,tumor_primary,13965
6,dendritic cell,normal_adjacent,6752
7,dendritic cell,tumor_primary,9950
8,CD4+ T cell,normal_adjacent,45654
9,CD4+ T cell,tumor_primary,48849


In [10]:
adata_normal = adata[adata.obs["origin"]=="normal_adjacent"]
#adata_normal = adata_normal[~adata_normal.obs["cell_type"].isin(["malignant cell"])]

In [11]:
set(adata_normal.obs.cell_type)

{'B cell',
 'CD1c-positive myeloid dendritic cell',
 'CD4+ T cell',
 'CD8+ T cell',
 'bronchus fibroblast of lung',
 'capillary endothelial cell',
 'conventional dendritic cell',
 'dendritic cell',
 'endothelial cell of lymphatic vessel',
 'epithelial cell',
 'fibroblast of lung',
 'macrophage',
 'neutrophil',
 'plasma cell',
 'plasmacytoid dendritic cell',
 'pulmonary artery endothelial cell',
 'regulatory T cell',
 'vein endothelial cell'}

In [12]:
adata_tumor = adata[adata.obs["origin"]=="tumor_primary"]

In [13]:
set(adata_normal.obs.cell_type)

{'B cell',
 'CD1c-positive myeloid dendritic cell',
 'CD4+ T cell',
 'CD8+ T cell',
 'bronchus fibroblast of lung',
 'capillary endothelial cell',
 'conventional dendritic cell',
 'dendritic cell',
 'endothelial cell of lymphatic vessel',
 'epithelial cell',
 'fibroblast of lung',
 'macrophage',
 'neutrophil',
 'plasma cell',
 'plasmacytoid dendritic cell',
 'pulmonary artery endothelial cell',
 'regulatory T cell',
 'vein endothelial cell'}

In [14]:
adata_normal = adata[adata.obs["origin"]=="normal_adjacent"]
#adata_normal = adata_normal[~adata_normal.obs["cell_type"].isin(["malignant cell"])]

In [15]:
adata_tumor = adata[adata.obs["origin"]=="tumor_primary"]

In [16]:
adata

AnnData object with n_obs × n_vars = 401187 × 17811
    obs: 'sample', 'uicc_stage', 'ever_smoker', 'age', 'donor_id', 'origin', 'dataset', 'ann_fine', 'cell_type_predicted', 'doublet_status', 'leiden', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'ann_coarse', 'cell_type_tumor', 'tumor_stage', 'EGFR_mutation', 'TP53_mutation', 'ALK_mutation', 'BRAF_mutation', 'ERBB2_mutation', 'KRAS_mutation', 'ROS_mutation', 'origin_fine', 'study', 'platform', 'cell_type_major', 'cell_type_neutro', 'cell_type_neutro_coarse', 'suspension_type', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'standardized_cell_type'
    var: 'is_highly_variable', 'm

In [17]:
adata_male = adata[adata.obs["sex"]=="male"]

In [18]:
adata_female = adata[adata.obs["sex"]=="female"]

## Prepare for pseudobulk

In [19]:
contrasts = [
        dict(var = "origin", condition = "tumor_primary", reference = "normal_adjacent"),
    ]

In [20]:
contrasts

[{'var': 'origin',
  'condition': 'tumor_primary',
  'reference': 'normal_adjacent'}]

In [21]:
contrasts[0]["condition"].replace(" ", "_") + "_vs_" + contrasts[0]["reference"].replace(" ", "_")

'tumor_primary_vs_normal_adjacent'

In [22]:
resDir_tables

'/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/'

In [23]:
for contrast in contrasts:
    name = contrast["condition"].replace(" ", "_") + "_vs_" + contrast["reference"].replace(" ", "_")
    contrast["name"] = name
    res_dir = Path(resDir_tables, name, "tables")
    os.makedirs(resDir_tables, mode = 0o750, exist_ok = True)
    contrast["res_dir"] = resDir_tables

In [24]:
cell_type_class = "standardized_cell_type"

In [25]:
contrast["res_dir"]

'/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/'

In [26]:
pdata = dc.get_pseudobulk(adata,
                          sample_col='sample',
                          groups_col=cell_type_class,
                          layer='count',
                          mode='sum',
                          min_cells=10,
                          min_counts=1000
                         )

In [27]:
pdata.X = pdata.X.astype(int)

In [28]:
pdata.var_names = pdata.var["feature_name"].astype(str)

In [29]:
pdata_obs_df = pdata.obs

In [30]:
pdata

AnnData object with n_obs × n_vars = 1269 × 17811
    obs: 'sample', 'standardized_cell_type', 'uicc_stage', 'ever_smoker', 'donor_id', 'origin', 'dataset', 'tumor_stage', 'study', 'platform', 'suspension_type', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'psbulk_n_cells', 'psbulk_counts'
    var: 'is_highly_variable', 'mito', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    layers: 'psbulk_props'

In [31]:
# Group by patient and filter for patients who have samples in both 'tumor' and 'normal_adjacent' conditions
patients_with_both_conditions = pdata_obs_df.groupby('donor_id')['origin'].apply(lambda x: set(x) >= {'tumor_primary', 'normal_adjacent'})

In [32]:
# Get the patient IDs that meet the condition
patients_with_both_conditions_ids = patients_with_both_conditions[patients_with_both_conditions].index

In [33]:
pdata_tumor_normal = pdata[pdata.obs['donor_id'].isin(patients_with_both_conditions_ids)]

In [34]:
cell_type_class

'standardized_cell_type'

In [35]:
## Run deseq2 on pseudobulk all cell types
#cell_types = pdata_tumor_normal.obs[cell_type_class].unique()

In [36]:
contrast["reference"]

'normal_adjacent'

In [37]:
contrast["condition"]

'tumor_primary'

In [38]:
contrast["name"]

'tumor_primary_vs_normal_adjacent'

In [39]:
pdata_tumor_normal.obs['sex'] = pdata_tumor_normal.obs['sex'].astype('category')

  pdata_tumor_normal.obs['sex'] = pdata_tumor_normal.obs['sex'].astype('category')


In [40]:
cpus=16

In [41]:
pdata_tumor_normal

AnnData object with n_obs × n_vars = 1268 × 17811
    obs: 'sample', 'standardized_cell_type', 'uicc_stage', 'ever_smoker', 'donor_id', 'origin', 'dataset', 'tumor_stage', 'study', 'platform', 'suspension_type', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'psbulk_n_cells', 'psbulk_counts'
    var: 'is_highly_variable', 'mito', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    layers: 'psbulk_props'

In [42]:
pdata_tumor_normal.obs["standardized_cell_type"]= pdata_tumor_normal.obs["standardized_cell_type"].replace(['epithelial cell of lung','multi-ciliated epithelial cell',], 'epithelial cell')
pdata_tumor_normal.obs["standardized_cell_type"]= pdata_tumor_normal.obs["standardized_cell_type"].replace(['alveolar macrophage'], 'macrophage')
pdata_tumor_normal.obs["standardized_cell_type"]= pdata_tumor_normal.obs["standardized_cell_type"].replace(['CD4-positive, alpha-beta T cell'], 'CD4+ T cell')
pdata_tumor_normal.obs["standardized_cell_type"]= pdata_tumor_normal.obs["standardized_cell_type"].replace(['CD8-positive, alpha-beta T cell'], 'CD8+ T cell')

In [43]:
pdata_tumor_normal.obs["origin"].value_counts()

origin
tumor_primary      708
normal_adjacent    560
Name: count, dtype: int64

In [44]:
pdata_only_tumor = pdata_tumor_normal[pdata_tumor_normal.obs["origin"]=="tumor_primary"]

In [45]:
pdata_only_tumor

View of AnnData object with n_obs × n_vars = 708 × 17811
    obs: 'sample', 'standardized_cell_type', 'uicc_stage', 'ever_smoker', 'donor_id', 'origin', 'dataset', 'tumor_stage', 'study', 'platform', 'suspension_type', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'psbulk_n_cells', 'psbulk_counts'
    var: 'is_highly_variable', 'mito', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    layers: 'psbulk_props'

In [46]:
resDir_tables

'/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/'

In [47]:
cell_types = pdata_only_tumor.obs[cell_type_class].unique()

In [48]:
pdata_tumor_normal_male = pdata_tumor_normal[pdata_tumor_normal.obs["sex"]=="male"]

In [64]:
pdata_tumor_normal_female = pdata_tumor_normal[pdata_tumor_normal.obs["sex"]=="female"]

In [49]:
pdata_tumor_normal_male.obs["origin"] = pdata_tumor_normal_male.obs["origin"].str.replace("_", "-", regex=False)


  pdata_tumor_normal_male.obs["origin"] = pdata_tumor_normal_male.obs["origin"].str.replace("_", "-", regex=False)


In [66]:
pdata_tumor_normal_female.obs["origin"] = pdata_tumor_normal_female.obs["origin"].str.replace("_", "-", regex=False)


  pdata_tumor_normal_female.obs["origin"] = pdata_tumor_normal_female.obs["origin"].str.replace("_", "-", regex=False)


In [68]:
pdata_tumor_normal_male = pdata_tumor_normal_male[~pdata_tumor_normal_male.obs["standardized_cell_type"].isin(["malignant cell"])]
pdata_tumor_normal_female= pdata_tumor_normal_female[~pdata_tumor_normal_female.obs["standardized_cell_type"].isin(["malignant cell"])]

In [69]:
contrast["reference"] = "normal-adjacent"
contrast["condition"] = "tumor-primary"
contrast["name"] = "tumor-primary_vs_normal-adjacent"

## Pydeseq tumor vs normal MALE

In [115]:
import pdb
for contrast in contrasts:
    de_res = {}

    for ct in cell_types:
        print(ct)
        
        pb_ct = pdata_tumor_normal_male[pdata_tumor_normal_male.obs[cell_type_class] == ct].copy()

     
        #pdb.set_trace()
        if len(set(pb_ct.obs["origin"]).intersection([contrast["reference"], contrast["condition"]])) < 1:
             print(
                 "Not running DEseq for: "
                  + ct
                  + " : only present in: "
                  + str(set(pb_ct.obs["sex"]).intersection([contrast["reference"], contrast["condition"]]))
             )
             continue
        #pdb.set_trace()
                     
        dds = DeseqDataSet(
           adata=pb_ct,
           design_factors=[contrast["var"]],
           ref_level=[contrast["var"], 'normal-adjacent'],
           refit_cooks=True,
           n_cpus=cpus,
           )
             
        

        # Compute LFCs
        dds.deseq2()

 

        

        # Extract contrast
        stat_res = DeseqStats(
              dds,
              contrast=[contrast["var"],'tumor-primary', 'normal-adjacent'],
              #n_cpus=cpus,
          )
   
    

        # Compute Wald test
        stat_res.summary()

        # Shrink LFCs

        coeff = contrast["var"] + "_" + contrast["name"]
        stat_res.lfc_shrink(coeff=coeff)

        # Register cell type results
        de_res[ct] = stat_res.results_df
        de_res[ct]["standardized_cell_type"] = ct
        de_res[ct]["feature_name"] = stat_res.results_df.index.values

        de_res[ct].drop(columns=["feature_name"]).to_csv(
                Path(
                    contrast["res_dir"],
                    ct.replace(" ", "_") + "_" + contrast["name"] + "_deseq.tsv",
                ),
                sep="\t",
            )

        # Register results for current contrast
        contrast["de_res"] = de_res
        contrast["de_res_all"] = pd.concat([df.assign(cell_type=ct) for ct, df in de_res.items()])

B cell


Fitting size factors...
... done in 0.03 seconds.

Fitting dispersions...
... done in 1.19 seconds.

Fitting dispersion trend curve...
... done in 0.35 seconds.

Fitting MAP dispersions...
... done in 1.43 seconds.

Fitting LFCs...
... done in 1.49 seconds.

Calculating cook's distance...
... done in 0.06 seconds.

Replacing 59 outlier genes.

Fitting dispersions...
... done in 0.02 seconds.

Fitting MAP dispersions...
... done in 0.02 seconds.

Fitting LFCs...
... done in 0.02 seconds.

Running Wald tests...
... done in 8.70 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
              baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
feature_name                                                                  
A1BG          4.613264        0.000056  0.323332  0.000174  0.999861  0.999978
A1BG-AS1      0.591671       -0.200691  0.570672 -0.351676  0.725081       NaN
A2M           1.206051       -0.223805  0.620775 -0.360525  0.718455       NaN
A2M-AS1       0.082406       -1.195479  1.465479 -0.815760  0.414638       NaN
A2ML1         0.157295       -1.658056  1.488754 -1.113721  0.265399       NaN
...                ...             ...       ...       ...       ...       ...
ZXDC          0.928493       -0.355715  0.578193 -0.615219  0.538410       NaN
ZYG11A        0.066189       -0.837893  1.683108 -0.497825  0.618607       NaN
ZYG11B        1.394873        0.450700  0.493065  0.914079  0.360675       NaN
ZYX           3.686372        0.780150  0.348659  2.2

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 2.01 seconds.

Fitting size factors...


Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
              baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
feature_name                                                                  
A1BG          4.613264        0.000005  0.052210  0.000174  0.999861  0.999978
A1BG-AS1      0.591671        0.000091  0.063279 -0.351676  0.725081       NaN
A2M           1.206051       -0.000674  0.053568 -0.360525  0.718455       NaN
A2M-AS1       0.082406        0.000124  0.060909 -0.815760  0.414638       NaN
A2ML1         0.157295       -0.000482  0.046209 -1.113721  0.265399       NaN
...                ...             ...       ...       ...       ...       ...
ZXDC          0.928493       -0.000956  0.053715 -0.615219  0.538410       NaN
ZYG11A        0.066189        0.000512  0.077715 -0.497825  0.618607       NaN
ZYG11B        1.394873       -0.005089  0.048999  0.914079  0.360675       NaN
ZYX           3.686372        0.006723  0.0617

... done in 0.04 seconds.

Fitting dispersions...
... done in 1.74 seconds.

Fitting dispersion trend curve...
... done in 0.39 seconds.

Fitting MAP dispersions...
... done in 1.69 seconds.

Fitting LFCs...
... done in 1.19 seconds.

Calculating cook's distance...
... done in 0.08 seconds.

Replacing 883 outlier genes.

Fitting dispersions...
... done in 0.12 seconds.

Fitting MAP dispersions...
... done in 0.11 seconds.

Fitting LFCs...
... done in 0.09 seconds.

Running Wald tests...
... done in 8.69 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           68.409417       -0.403161  0.363153 -1.110170  0.266926   
A1BG-AS1       12.580317        0.623197  0.342646  1.818775  0.068946   
A2M            12.078685        0.277251  0.423670  0.654403  0.512852   
A2M-AS1        11.573832       -0.346347  0.355080 -0.975406  0.329359   
A2ML1           0.199986       -0.256593  1.561691 -0.164304  0.869491   
...                  ...             ...       ...       ...       ...   
ZXDC           13.965527        0.137765  0.302803  0.454965  0.649134   
ZYG11A          0.226355       -0.205677  1.871029 -0.109927  0.912467   
ZYG11B         22.581971       -0.044293  0.285489 -0.155148  0.876704   
ZYX           131.585482        0.032505  0.185704  0.175034  0.861053   
ZZEF1          39.032896       -0.

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 3.08 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           68.409417   -3.090000e-06  0.001431 -1.110170  0.266926   
A1BG-AS1       12.580317    5.328158e-06  0.001753  1.818775  0.068946   
A2M            12.078685    1.613500e-06  0.001617  0.654403  0.512852   
A2M-AS1        11.573832   -2.851820e-06  0.001456 -0.975406  0.329359   
A2ML1           0.199986   -3.554808e-08  0.001541 -0.164304  0.869491   
...                  ...             ...       ...       ...       ...   
ZXDC           13.965527    1.712778e-06  0.001600  0.454965  0.649134   
ZYG11A          0.226355   -8.508677e-08  0.001524 -0.109927  0.912467   
ZYG11B         22.581971   -5.581784e-07  0.001532 -0.155148  0.876704   
ZYX           131.585482    1.051806e-06  0.001554  0.175034  0.861053   
ZZEF1          39.032896   

Fitting size factors...
... done in 0.04 seconds.

Fitting dispersions...
... done in 1.80 seconds.

Fitting dispersion trend curve...
... done in 0.39 seconds.

Fitting MAP dispersions...
... done in 1.71 seconds.

Fitting LFCs...
... done in 1.55 seconds.

Calculating cook's distance...
... done in 0.08 seconds.

Replacing 972 outlier genes.

Fitting dispersions...
... done in 0.13 seconds.

Fitting MAP dispersions...
... done in 0.13 seconds.

Fitting LFCs...
... done in 0.12 seconds.

Running Wald tests...
... done in 8.71 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           52.908189        0.200887  0.378468  0.530789  0.595565   
A1BG-AS1        9.445406       -0.103584  0.375120 -0.276136  0.782444   
A2M            14.829458       -0.553485  0.379607 -1.458047  0.144828   
A2M-AS1        32.573034       -1.071010  0.356349 -3.005506  0.002651   
A2ML1           0.261897        0.117141  1.952931  0.059982  0.952170   
...                  ...             ...       ...       ...       ...   
ZXDC           12.725702       -0.378616  0.356195 -1.062946  0.287806   
ZYG11A          0.283978       -0.662599  1.907608 -0.347345  0.728332   
ZYG11B         14.679519       -0.459053  0.311875 -1.471915  0.141044   
ZYX           136.861614        0.115195  0.172723  0.666935  0.504813   
ZZEF1          35.284278       -0.

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 3.27 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           52.908189    1.480970e-06  0.001538  0.530789  0.595565   
A1BG-AS1        9.445406   -3.433613e-07  0.001460 -0.276136  0.782444   
A2M            14.829458   -3.958412e-06  0.001349 -1.458047  0.144828   
A2M-AS1        32.573034   -9.029687e-06  0.001271 -3.005506  0.002651   
A2ML1           0.261897   -3.224398e-07  0.001260  0.059982  0.952170   
...                  ...             ...       ...       ...       ...   
ZXDC           12.725702   -2.981506e-06  0.001403 -1.062946  0.287806   
ZYG11A          0.283978   -1.865567e-07  0.001289 -0.347345  0.728332   
ZYG11B         14.679519   -4.997782e-06  0.001382 -1.471915  0.141044   
ZYX           136.861614    4.419663e-06  0.001517  0.666935  0.504813   
ZZEF1          35.284278   

Fitting size factors...
... done in 0.03 seconds.

Fitting dispersions...
... done in 1.36 seconds.

Fitting dispersion trend curve...
... done in 0.39 seconds.

Fitting MAP dispersions...
... done in 1.63 seconds.

Fitting LFCs...
... done in 1.36 seconds.

Calculating cook's distance...
... done in 0.06 seconds.

Replacing 59 outlier genes.

Fitting dispersions...
... done in 0.02 seconds.

Fitting MAP dispersions...
... done in 0.02 seconds.

Fitting LFCs...
... done in 0.02 seconds.

Running Wald tests...
... done in 8.69 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG          27.346383       -0.187979  0.249624 -0.753049  0.451421   
A1BG-AS1       2.723454       -0.021324  0.289433 -0.073676  0.941268   
A2M           36.400901       -0.455576  0.233276 -1.952943  0.050826   
A2M-AS1        0.704191        0.145368  0.665253  0.218515  0.827028   
A2ML1          0.103433       -0.189091  1.902912 -0.099369  0.920845   
...                 ...             ...       ...       ...       ...   
ZXDC           3.328346       -0.094078  0.293786 -0.320225  0.748798   
ZYG11A         0.103470        0.374354  2.423044  0.154498  0.877217   
ZYG11B         5.576791        0.038424  0.239230  0.160615  0.872397   
ZYX           75.852552        0.240159  0.096117  2.498604  0.012468   
ZZEF1          6.935962       -0.288888  0.200

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 3.30 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG          27.346383   -3.148722e-06  0.001433 -0.753049  0.451421   
A1BG-AS1       2.723454   -2.306607e-07  0.001493 -0.073676  0.941268   
A2M           36.400901   -8.781803e-06  0.001381 -1.952943  0.050826   
A2M-AS1        0.704191    1.827489e-07  0.001549  0.218515  0.827028   
A2ML1          0.103433   -3.227013e-07  0.001283 -0.099369  0.920845   
...                 ...             ...       ...       ...       ...   
ZXDC           3.328346   -1.171606e-06  0.001486 -0.320225  0.748798   
ZYG11A         0.103470    3.866156e-07  0.002458  0.154498  0.877217   
ZYG11B         5.576791    7.035676e-07  0.001498  0.160615  0.872397   
ZYX           75.852552    2.682768e-05  0.001530  2.498604  0.012468   
ZZEF1          6.935962   -7.480573e-06

Fitting size factors...
... done in 0.02 seconds.

Fitting dispersions...
... done in 1.23 seconds.

Fitting dispersion trend curve...
... done in 0.37 seconds.

Fitting MAP dispersions...
... done in 1.43 seconds.

Fitting LFCs...
... done in 1.13 seconds.

Calculating cook's distance...
... done in 0.04 seconds.

Replacing 94 outlier genes.

Fitting dispersions...
... done in 0.03 seconds.

Fitting MAP dispersions...
... done in 0.03 seconds.

Fitting LFCs...
... done in 0.03 seconds.

Running Wald tests...
... done in 9.17 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            1.507280       -0.010133  0.444279 -0.022808  0.981803   
A1BG-AS1        0.249101        0.329657  0.870426  0.378730  0.704888   
A2M           152.621226        1.062725  0.220783  4.813427  0.000001   
A2M-AS1         0.446190        0.545755  0.822543  0.663498  0.507012   
A2ML1           0.026018       -0.157982  3.116414 -0.050693  0.959570   
...                  ...             ...       ...       ...       ...   
ZXDC            1.064180        0.649751  0.464808  1.397891  0.162146   
ZYG11A          0.056380        0.372103  3.103386  0.119902  0.904561   
ZYG11B          2.856751        0.249009  0.312447  0.796964  0.425472   
ZYX            11.161912       -0.118473  0.269639 -0.439376  0.660389   
ZZEF1           2.342306        0.

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 2.88 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            1.507280   -1.923844e-08  0.001366 -0.022808  0.981803   
A1BG-AS1        0.249101    7.402598e-07  0.001393  0.378730  0.704888   
A2M           152.621226    9.613744e-01  0.225848  4.813427  0.000001   
A2M-AS1         0.446190    1.136828e-06  0.001453  0.663498  0.507012   
A2ML1           0.026018   -1.667452e-07  0.001155 -0.050693  0.959570   
...                  ...             ...       ...       ...       ...   
ZXDC            1.064180    3.368096e-06  0.001402  1.397891  0.162146   
ZYG11A          0.056380    2.507463e-07  0.002485  0.119902  0.904561   
ZYG11B          2.856751    2.644848e-06  0.001389  0.796964  0.425472   
ZYX            11.161912   -1.628779e-06  0.001367 -0.439376  0.660389   
ZZEF1           2.342306   

Fitting size factors...
... done in 0.02 seconds.

Fitting dispersions...
... done in 1.35 seconds.

Fitting dispersion trend curve...
... done in 0.39 seconds.

Fitting MAP dispersions...
... done in 1.49 seconds.

Fitting LFCs...
... done in 1.53 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 317 outlier genes.

Fitting dispersions...
... done in 0.10 seconds.

Fitting MAP dispersions...
... done in 0.09 seconds.

Fitting LFCs...
... done in 0.10 seconds.

Running Wald tests...
... done in 8.88 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
              baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
feature_name                                                                  
A1BG          9.640422       -0.084555  0.506829 -0.166832  0.867502  0.931543
A1BG-AS1      2.222426       -0.811751  0.570795 -1.422142  0.154985  0.337826
A2M           2.187057        0.683752  0.561851  1.216964  0.223618  0.429649
A2M-AS1       0.366523        0.159750  1.005975  0.158801  0.873826       NaN
A2ML1         0.034975        0.105248  3.107701  0.033867  0.972983       NaN
...                ...             ...       ...       ...       ...       ...
ZXDC          3.827585       -0.355821  0.301693 -1.179414  0.238234  0.446787
ZYG11A        0.142235        0.434760  1.760640  0.246933  0.804960       NaN
ZYG11B        7.112276        0.099663  0.270380  0.368605  0.712422  0.841209
ZYX           5.964752        0.788411  0.361182  2.1

... done in 1.75 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
              baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
feature_name                                                                  
A1BG          9.640422       -0.032273  0.381509 -0.166832  0.867502  0.931543
A1BG-AS1      2.222426       -0.300044  0.467179 -1.422142  0.154985  0.337826
A2M           2.187057        0.267037  0.469288  1.216964  0.223618  0.429649
A2M-AS1       0.366523        0.019218  0.497158  0.158801  0.873826       NaN
A2ML1         0.034975       -0.028946  0.468061  0.033867  0.972983       NaN
...                ...             ...       ...       ...       ...       ...
ZXDC          3.827585       -0.243915  0.279220 -1.179414  0.238234  0.446787
ZYG11A        0.142235        0.013593  0.590206  0.246933  0.804960       NaN
ZYG11B        7.112276        0.069289  0.248130  0.368605  0.712422  0.841209
ZYX           5.964752        0.559688  0.3621

Fitting size factors...
... done in 0.02 seconds.

Fitting dispersions...
... done in 1.21 seconds.

Fitting dispersion trend curve...
... done in 0.37 seconds.

Fitting MAP dispersions...
... done in 1.51 seconds.

Fitting LFCs...
... done in 1.48 seconds.

Calculating cook's distance...
... done in 0.03 seconds.

Replacing 129 outlier genes.

Fitting dispersions...
... done in 0.03 seconds.

Fitting MAP dispersions...
... done in 0.03 seconds.

Fitting LFCs...
... done in 0.04 seconds.

Running Wald tests...
... done in 8.63 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            6.599187        1.049944  0.505666  2.076356  0.037861   
A1BG-AS1        0.456214        0.669571  0.837132  0.799839  0.423804   
A2M           497.023760       -0.200188  0.392714 -0.509755  0.610223   
A2M-AS1         0.405584       -0.270720  0.764998 -0.353883  0.723427   
A2ML1           0.002863       -0.382078  3.204995 -0.119213  0.905106   
...                  ...             ...       ...       ...       ...   
ZXDC            1.776901       -0.167808  0.553695 -0.303070  0.761837   
ZYG11A          0.010638       -0.620068  3.191379 -0.194295  0.845945   
ZYG11B          5.866106       -0.055193  0.358517 -0.153949  0.877650   
ZYX            37.964425        0.020617  0.263309  0.078299  0.937590   
ZZEF1           2.391990        0.

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 2.95 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            6.599187    4.218071e-06  0.001870  2.076356  0.037861   
A1BG-AS1        0.456214    1.481234e-06  0.001874  0.799839  0.423804   
A2M           497.023760   -1.782393e-06  0.001507 -0.509755  0.610223   
A2M-AS1         0.405584   -3.363530e-07  0.001550 -0.353883  0.723427   
A2ML1           0.002863    6.766228e-08  0.002404 -0.119213  0.905106   
...                  ...             ...       ...       ...       ...   
ZXDC            1.776901   -4.230252e-07  0.001566 -0.303070  0.761837   
ZYG11A          0.010638   -1.701478e-07  0.001192 -0.194295  0.845945   
ZYG11B          5.866106   -4.316333e-07  0.001582 -0.153949  0.877650   
ZYX            37.964425    3.254124e-07  0.001589  0.078299  0.937590   
ZZEF1           2.391990   

Fitting size factors...
... done in 0.03 seconds.

Fitting dispersions...
... done in 1.53 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend(vst)
... done in 0.35 seconds.

Fitting MAP dispersions...
... done in 1.89 seconds.

Fitting LFCs...
... done in 1.18 seconds.

Calculating cook's distance...
... done in 0.07 seconds.

Replacing 116 outlier genes.

Fitting dispersions...
... done in 0.04 seconds.

Fitting MAP dispersions...
... done in 0.03 seconds.

Fitting LFCs...
... done in 0.03 seconds.

Running Wald tests...
... done in 8.95 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           85.516680        0.353116  0.267730  1.318924  0.187194   
A1BG-AS1       15.799651        0.184641  0.149826  1.232368  0.217812   
A2M           552.829138        1.522115  0.321093  4.740417  0.000002   
A2M-AS1         5.048961        0.212698  0.367639  0.578550  0.562893   
A2ML1           0.159131        0.310714  0.851395  0.364947  0.715151   
...                  ...             ...       ...       ...       ...   
ZXDC           21.092824       -0.220321  0.171323 -1.285999  0.198443   
ZYG11A          0.127500        0.500826  0.946272  0.529262  0.596624   
ZYG11B         84.127390       -0.360350  0.116007 -3.106279  0.001895   
ZYX           450.161470        0.048292  0.134613  0.358744  0.719786   
ZZEF1          47.326305       -0.

... done in 1.84 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           85.516680        0.259216  0.254724  1.318924  0.187194   
A1BG-AS1       15.799651        0.162138  0.146756  1.232368  0.217812   
A2M           552.829138        1.415559  0.328612  4.740417  0.000002   
A2M-AS1         5.048961        0.106252  0.315818  0.578550  0.562893   
A2ML1           0.159131       -0.080275  0.469801  0.364947  0.715151   
...                  ...             ...       ...       ...       ...   
ZXDC           21.092824       -0.191127  0.164089 -1.285999  0.198443   
ZYG11A          0.127500        0.010762  0.472897  0.529262  0.596624   
ZYG11B         84.127390       -0.339615  0.114463 -3.106279  0.001895   
ZYX           450.161470        0.051654  0.130568  0.358744  0.719786   
ZZEF1          47.326305   

Fitting size factors...
... done in 0.01 seconds.

Fitting dispersions...
... done in 0.92 seconds.

Fitting dispersion trend curve...
... done in 0.32 seconds.

Fitting MAP dispersions...
... done in 0.96 seconds.

Fitting LFCs...
... done in 1.11 seconds.

Calculating cook's distance...
... done in 0.01 seconds.

Replacing 0 outlier genes.

Running Wald tests...
... done in 8.77 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            1.219286       -1.271715  2.330923 -0.545584  0.585352   
A1BG-AS1        0.000000             NaN       NaN       NaN       NaN   
A2M             4.403454        0.032516  1.572091  0.020683  0.983498   
A2M-AS1         0.394540        0.647985  2.970378  0.218149  0.827313   
A2ML1           0.000000             NaN       NaN       NaN       NaN   
...                  ...             ...       ...       ...       ...   
ZXDC            7.328170       -0.284063  0.693867 -0.409392  0.682252   
ZYG11A          0.181016        2.184656  3.825929  0.571013  0.567991   
ZYG11B         17.418452       -0.466853  0.546436 -0.854360  0.392905   
ZYX           228.603724       -0.275976  0.352594 -0.782704  0.433801   
ZZEF1          11.721639       -0.

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 2.87 seconds.

Fitting size factors...


Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            1.219286   -2.184719e-07  0.001225 -0.545584  0.585352   
A1BG-AS1        0.000000             NaN       NaN       NaN       NaN   
A2M             4.403454    1.536174e-08  0.001393  0.020683  0.983498   
A2M-AS1         0.394540    1.373664e-08  0.001455  0.218149  0.827313   
A2ML1           0.000000             NaN       NaN       NaN       NaN   
...                  ...             ...       ...       ...       ...   
ZXDC            7.328170   -6.326384e-07  0.001344 -0.409392  0.682252   
ZYG11A          0.181016    2.100066e-07  0.002789  0.571013  0.567991   
ZYG11B         17.418452   -1.604203e-06  0.001316 -0.854360  0.392905   
ZYX           228.603724   -2.295424e-06  0.001372 -0.782704  0.433801   
ZZEF1          11.721639   

... done in 0.02 seconds.

Fitting dispersions...
... done in 1.24 seconds.

Fitting dispersion trend curve...
... done in 0.38 seconds.

Fitting MAP dispersions...
... done in 1.41 seconds.

Fitting LFCs...
... done in 1.30 seconds.

Calculating cook's distance...
... done in 0.05 seconds.

Replacing 104 outlier genes.

Fitting dispersions...
... done in 0.03 seconds.

Fitting MAP dispersions...
... done in 0.03 seconds.

Fitting LFCs...
... done in 0.04 seconds.

Running Wald tests...
... done in 8.85 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG          20.520586       -0.359202  0.336171 -1.068509  0.285291   
A1BG-AS1       1.088761       -0.399286  0.651671 -0.612711  0.540068   
A2M            3.439281        0.355859  0.678731  0.524300  0.600070   
A2M-AS1        0.140953       -1.599917  1.999993 -0.799961  0.423733   
A2ML1          0.062925       -1.613224  2.929489 -0.550685  0.581850   
...                 ...             ...       ...       ...       ...   
ZXDC           0.549366       -0.570483  0.833440 -0.684492  0.493664   
ZYG11A         0.347641       -1.367677  0.925531 -1.477722  0.139482   
ZYG11B         2.327975       -0.417588  0.556814 -0.749959  0.453279   
ZYX            3.123633       -0.819743  0.402255 -2.037868  0.041563   
ZZEF1          2.614562        0.031174  0.454

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 3.21 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG          20.520586   -3.330070e-06  0.001719 -1.068509  0.285291   
A1BG-AS1       1.088761   -7.542754e-07  0.002410 -0.612711  0.540068   
A2M            3.439281    7.998650e-07  0.002042  0.524300  0.600070   
A2M-AS1        0.140953   -6.126836e-07  0.001516 -0.799961  0.423733   
A2ML1          0.062925   -3.529262e-07  0.001321 -0.550685  0.581850   
...                 ...             ...       ...       ...       ...   
ZXDC           0.549366   -6.343671e-07  0.002370 -0.684492  0.493664   
ZYG11A         0.347641   -1.513020e-06  0.002137 -1.477722  0.139482   
ZYG11B         2.327975   -1.359097e-06  0.002098 -0.749959  0.453279   
ZYX            3.123633   -4.781257e-06  0.002172 -2.037868  0.041563   
ZZEF1          2.614562    1.738228e-07

Fitting size factors...
... done in 0.03 seconds.

Fitting dispersions...
... done in 1.53 seconds.

Fitting dispersion trend curve...
... done in 0.36 seconds.

Fitting MAP dispersions...
... done in 1.43 seconds.

Fitting LFCs...
... done in 1.54 seconds.

Calculating cook's distance...
... done in 0.06 seconds.

Replacing 1377 outlier genes.

Fitting dispersions...
... done in 0.17 seconds.

Fitting MAP dispersions...
... done in 0.15 seconds.

Fitting LFCs...
... done in 0.14 seconds.

Running Wald tests...
... done in 8.75 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG           7.033111       -0.142249  0.394206 -0.360849  0.718212   
A1BG-AS1       1.485339        0.159139  0.623456  0.255254  0.798527   
A2M            1.273227        0.290248  0.728302  0.398527  0.690242   
A2M-AS1        0.173113       -1.321383  1.405063 -0.940444  0.346990   
A2ML1          0.176934       -2.254269  2.922899 -0.771244  0.440562   
...                 ...             ...       ...       ...       ...   
ZXDC           1.019239        0.850460  0.792190  1.073555  0.283022   
ZYG11A         0.014163       -2.067385  3.053920 -0.676961  0.498431   
ZYG11B         3.356246       -0.176350  0.442509 -0.398524  0.690244   
ZYX           14.740712       -0.403523  0.279140 -1.445596  0.148290   
ZZEF1          3.895481        0.452235  0.394

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))


Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG           7.033111   -9.155528e-07  0.001640 -0.360849  0.718212   
A1BG-AS1       1.485339    6.611014e-07  0.001920  0.255254  0.798527   
A2M            1.273227    8.387755e-07  0.001885  0.398527  0.690242   
A2M-AS1        0.173113    1.044228e-07  0.002154 -0.940444  0.346990   
A2ML1          0.176934   -7.613284e-08  0.001425 -0.771244  0.440562   
...                 ...             ...       ...       ...       ...   
ZXDC           1.019239    2.913140e-06  0.002257  1.073555  0.283022   
ZYG11A         0.014163    1.193448e-07  0.002688 -0.676961  0.498431   
ZYG11B         3.356246   -9.622742e-07  0.001682 -0.398524  0.690244   
ZYX           14.740712   -4.786789e-06  0.001580 -1.445596  0.148290   
ZZEF1          3.895481    3.102382e-06

... done in 3.28 seconds.



In [116]:
contrast["de_res_all"]

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,standardized_cell_type,feature_name,cell_type
feature_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A1BG,4.613264,5.484595e-06,0.052210,0.000174,0.999861,0.999978,B cell,A1BG,B cell
A1BG-AS1,0.591671,9.067974e-05,0.063279,-0.351676,0.725081,,B cell,A1BG-AS1,B cell
A2M,1.206051,-6.738284e-04,0.053568,-0.360525,0.718455,,B cell,A2M,B cell
A2M-AS1,0.082406,1.239167e-04,0.060909,-0.815760,0.414638,,B cell,A2M-AS1,B cell
A2ML1,0.157295,-4.821732e-04,0.046209,-1.113721,0.265399,,B cell,A2ML1,B cell
...,...,...,...,...,...,...,...,...,...
ZXDC,1.019239,2.913140e-06,0.002257,1.073555,0.283022,,regulatory T cell,ZXDC,regulatory T cell
ZYG11A,0.014163,1.193448e-07,0.002688,-0.676961,0.498431,,regulatory T cell,ZYG11A,regulatory T cell
ZYG11B,3.356246,-9.622742e-07,0.001682,-0.398524,0.690244,0.958322,regulatory T cell,ZYG11B,regulatory T cell
ZYX,14.740712,-4.786789e-06,0.001580,-1.445596,0.148290,0.708795,regulatory T cell,ZYX,regulatory T cell


In [118]:
resDir = resDir_tables

In [119]:
cell_types = contrast["de_res_all"]["cell_type"].unique()

# Loop through each cell type and save its data to CSV
for cell_type in cell_types:
    # Filter the DataFrame for the current cell type
    ct_all_deg = contrast["de_res_all"][contrast["de_res_all"]["cell_type"] == cell_type]
    
    # Remove spaces from cell_type for the filename
    cell_type_filename = cell_type.replace(" ", "")
    
    # Save the full data for the cell type
    filename_deg = f"{cell_type_filename}_deg.csv"
    filepath_deg = os.path.join(resDir, filename_deg)
    ct_all_deg.to_csv(filepath_deg, index=False)
    print(f"Saved full {cell_type} data to {filepath_deg}")
    
    # Apply additional filtering for significant results
    filtered_df = ct_all_deg[(ct_all_deg['padj'] < 0.1) & (ct_all_deg['log2FoldChange'].abs() > 1)]
    
    # Save the filtered significant data for the cell type
    filename_sig_deg = f"{cell_type_filename}_sig_deg.csv"
    filepath_sig_deg = os.path.join(resDir, filename_sig_deg)
    filtered_df.to_csv(filepath_sig_deg, index=False)
    print(f"Saved significant {cell_type} data to {filepath_sig_deg}")

Saved full B cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/Bcell_deg.csv
Saved significant B cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/Bcell_sig_deg.csv
Saved full CD4+ T cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/CD4+Tcell_deg.csv
Saved significant CD4+ T cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/CD4+Tcell_sig_deg.csv
Saved full CD8+ T cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/CD8+Tcell_deg.csv
Saved significant CD8+ T cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/CD8+Tcell_sig_deg.csv
Saved full dendritic cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/dendriticcell_deg.csv
Saved significant d

In [120]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Assuming `contrast` and `dc` are already defined and set up in your environment
cell_types = contrast["de_res_all"]["cell_type"].unique()

# Create a PDF file to save all volcano plots
with PdfPages(f"{resDir_figures}/volcano_plots_all_cell_types.pdf") as pdf:
    for cell_type in cell_types:
        # Filter results for the current cell type
        results_df = contrast["de_res_all"][contrast["de_res_all"]["cell_type"] == cell_type]

        # Generate the volcano plot
        plt.figure(figsize=(8, 4))
        dc.plot_volcano_df(
            results_df,
            x='log2FoldChange',
            y='padj',
            top=20,
            figsize=(8, 4)
        )
        
        # Set title to indicate cell type
        plt.title(f"Volcano Plot for Cell Type: {cell_type}")
        
        # Save the current figure to the PDF
        pdf.savefig()
        plt.close()  # Close the plot to avoid display issues in the next iteration

print("PDF with all volcano plots saved as 'volcano_plots_all_cell_types.pdf'")

PDF with all volcano plots saved as 'volcano_plots_all_cell_types.pdf'


<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

## Pydeseq tumor vs normal FEMALE

In [132]:
pdata_tumor_normal_female

View of AnnData object with n_obs × n_vars = 696 × 17811
    obs: 'sample', 'standardized_cell_type', 'uicc_stage', 'ever_smoker', 'donor_id', 'origin', 'dataset', 'tumor_stage', 'study', 'platform', 'suspension_type', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'psbulk_n_cells', 'psbulk_counts'
    var: 'is_highly_variable', 'mito', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    layers: 'psbulk_props'

In [133]:
pdata_tumor_normal_male

View of AnnData object with n_obs × n_vars = 518 × 17811
    obs: 'sample', 'standardized_cell_type', 'uicc_stage', 'ever_smoker', 'donor_id', 'origin', 'dataset', 'tumor_stage', 'study', 'platform', 'suspension_type', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'psbulk_n_cells', 'psbulk_counts'
    var: 'is_highly_variable', 'mito', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    layers: 'psbulk_props'

In [70]:
pdata_tumor_normal_female[pdata_tumor_normal_female.obs[cell_type_class] == "regulatory T cell"].obs.origin

Guo_Zhang_2018_P1010_TTY_regulatory T cell         tumor-primary
Guo_Zhang_2018_P1118_TTH_regulatory T cell         tumor-primary
Guo_Zhang_2018_P1202_TTH_regulatory T cell         tumor-primary
Guo_Zhang_2018_P1202_TTR_regulatory T cell         tumor-primary
Guo_Zhang_2018_P1202_TTY_regulatory T cell         tumor-primary
                                                      ...       
UKIM-V-2_P5_tumor_primary_regulatory T cell        tumor-primary
UKIM-V-2_P7_normal_adjacent_regulatory T cell    normal-adjacent
UKIM-V-2_P7_tumor_primary_regulatory T cell        tumor-primary
UKIM-V_P1_normal_adjacent_regulatory T cell      normal-adjacent
UKIM-V_P1_tumor_primary_regulatory T cell          tumor-primary
Name: origin, Length: 74, dtype: object

In [71]:
import pdb
for contrast in contrasts:
    de_res = {}

    for ct in cell_types:
        print(ct)
        
        pb_ct = pdata_tumor_normal_female[pdata_tumor_normal_female.obs[cell_type_class] == ct].copy()

     
        #pdb.set_trace()
        if len(set(pb_ct.obs["origin"]).intersection([contrast["reference"], contrast["condition"]])) < 1:
             print(
                 "Not running DEseq for: "
                  + ct
                  + " : only present in: "
                  + str(set(pb_ct.obs["sex"]).intersection([contrast["reference"], contrast["condition"]]))
             )
             continue
        #pdb.set_trace()
                     
        dds = DeseqDataSet(
           adata=pb_ct,
           design_factors=[contrast["var"]],
           ref_level=[contrast["var"], 'normal-adjacent'],
           refit_cooks=True,
           n_cpus=cpus,
           )
             
        

        # Compute LFCs
        dds.deseq2()

 

        

        # Extract contrast
        stat_res = DeseqStats(
              dds,
              contrast=[contrast["var"],'tumor-primary', 'normal-adjacent'],
              #n_cpus=cpus,
          )
   
    

        # Compute Wald test
        stat_res.summary()

        # Shrink LFCs

        coeff = contrast["var"] + "_" + contrast["name"]
        stat_res.lfc_shrink(coeff=coeff)

        # Register cell type results
        de_res[ct] = stat_res.results_df
        de_res[ct]["standardized_cell_type"] = ct
        de_res[ct]["feature_name"] = stat_res.results_df.index.values

        de_res[ct].drop(columns=["feature_name"]).to_csv(
                Path(
                    contrast["res_dir"],
                    ct.replace(" ", "_") + "_" + contrast["name"] + "_deseq.tsv",
                ),
                sep="\t",
            )

        # Register results for current contrast
        contrast["de_res"] = de_res
        contrast["de_res_all"] = pd.concat([df.assign(cell_type=ct) for ct, df in de_res.items()])

B cell


Fitting size factors...
... done in 0.03 seconds.

Fitting dispersions...
... done in 1.89 seconds.

Fitting dispersion trend curve...
... done in 0.39 seconds.

Fitting MAP dispersions...
... done in 2.31 seconds.

Fitting LFCs...
... done in 1.81 seconds.

Calculating cook's distance...
... done in 0.08 seconds.

Replacing 85 outlier genes.

Fitting dispersions...
... done in 0.03 seconds.

Fitting MAP dispersions...
... done in 0.03 seconds.

Fitting LFCs...
... done in 0.03 seconds.

Running Wald tests...
... done in 10.53 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
              baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
feature_name                                                                  
A1BG          6.836024       -0.096811  0.290096 -0.333722  0.738589  0.998775
A1BG-AS1      1.168325       -0.078309  0.423935 -0.184720  0.853449       NaN
A2M           1.288579       -0.575284  0.495898 -1.160086  0.246014       NaN
A2M-AS1       0.247093       -0.922256  0.751221 -1.227676  0.219569       NaN
A2ML1         0.052073       -1.773977  1.897850 -0.934730  0.349927       NaN
...                ...             ...       ...       ...       ...       ...
ZXDC          2.439321       -0.560718  0.462610 -1.212075  0.225484  0.928499
ZYG11A        0.217108       -0.436415  0.821729 -0.531093  0.595354       NaN
ZYG11B        3.527944       -0.286148  0.383959 -0.745257  0.456117  0.988955
ZYX           5.614568        0.189883  0.225259  0.8

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 3.12 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
              baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
feature_name                                                                  
A1BG          6.836024       -0.004149  0.095569 -0.333722  0.738589  0.998775
A1BG-AS1      1.168325        0.001322  0.120595 -0.184720  0.853449       NaN
A2M           1.288579       -0.008168  0.097222 -1.160086  0.246014       NaN
A2M-AS1       0.247093       -0.000984  0.116783 -1.227676  0.219569       NaN
A2ML1         0.052073       -0.001365  0.087317 -0.934730  0.349927       NaN
...                ...             ...       ...       ...       ...       ...
ZXDC          2.439321       -0.009504  0.093922 -1.212075  0.225484  0.928499
ZYG11A        0.217108        0.003583  0.122013 -0.531093  0.595354       NaN
ZYG11B        3.527944       -0.007244  0.095545 -0.745257  0.456117  0.988955
ZYX           5.614568        0.013493  0.1056

Fitting size factors...
... done in 0.05 seconds.

Fitting dispersions...
... done in 3.13 seconds.

Fitting dispersion trend curve...
... done in 0.40 seconds.

Fitting MAP dispersions...
... done in 2.75 seconds.

Fitting LFCs...
... done in 2.21 seconds.

Calculating cook's distance...
... done in 0.13 seconds.

Replacing 1979 outlier genes.

Fitting dispersions...
... done in 0.41 seconds.

Fitting MAP dispersions...
... done in 0.33 seconds.

Fitting LFCs...
... done in 0.30 seconds.

Running Wald tests...
... done in 10.57 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           68.592197       -0.112696  0.296928 -0.379540  0.704287   
A1BG-AS1       13.471159       -0.319386  0.363249 -0.879248  0.379267   
A2M            18.693596       -0.138240  0.366436 -0.377255  0.705984   
A2M-AS1        16.792654        0.039457  0.408430  0.096608  0.923038   
A2ML1           0.376706        0.285645  1.088145  0.262507  0.792931   
...                  ...             ...       ...       ...       ...   
ZXDC           27.036616        0.213716  0.325349  0.656881  0.511257   
ZYG11A          0.215535       -0.558586  1.337815 -0.417536  0.676286   
ZYG11B         41.940148        0.460023  0.320611  1.434833  0.151335   
ZYX           241.478463        0.134089  0.217365  0.616886  0.537310   
ZZEF1          64.229144        0.

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 6.05 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           68.592197   -1.325718e-06  0.001485 -0.379540  0.704287   
A1BG-AS1       13.471159   -2.685382e-06  0.001431 -0.879248  0.379267   
A2M            18.693596   -7.674849e-07  0.001450 -0.377255  0.705984   
A2M-AS1        16.792654    2.605234e-07  0.001519  0.096608  0.923038   
A2ML1           0.376706    2.437946e-07  0.001558  0.262507  0.792931   
...                  ...             ...       ...       ...       ...   
ZXDC           27.036616    1.912286e-06  0.001580  0.656881  0.511257   
ZYG11A          0.215535   -8.521301e-07  0.001244 -0.417536  0.676286   
ZYG11B         41.940148    4.772592e-06  0.001665  1.434833  0.151335   
ZYX           241.478463    5.043108e-06  0.001558  0.616886  0.537310   
ZZEF1          64.229144   

Fitting size factors...
... done in 0.05 seconds.

Fitting dispersions...
... done in 2.83 seconds.

Fitting dispersion trend curve...
... done in 0.40 seconds.

Fitting MAP dispersions...
... done in 2.67 seconds.

Fitting LFCs...
... done in 2.23 seconds.

Calculating cook's distance...
... done in 0.12 seconds.

Replacing 1688 outlier genes.

Fitting dispersions...
... done in 0.33 seconds.

Fitting MAP dispersions...
... done in 0.28 seconds.

Fitting LFCs...
... done in 0.41 seconds.

Running Wald tests...
... done in 10.56 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           49.259383        0.225264  0.330502  0.681581  0.495504   
A1BG-AS1        8.917060       -0.058168  0.367018 -0.158488  0.874072   
A2M            17.648511       -0.041725  0.345686 -0.120703  0.903926   
A2M-AS1        47.593598       -0.268557  0.371177 -0.723529  0.469355   
A2ML1           0.283424       -0.135752  1.442236 -0.094126  0.925009   
...                  ...             ...       ...       ...       ...   
ZXDC           21.772692       -0.171237  0.319904 -0.535276  0.592459   
ZYG11A          0.235857       -0.152158  1.613656 -0.094294  0.924876   
ZYG11B         19.735799       -0.146523  0.317097 -0.462077  0.644026   
ZYX           197.050642        0.048384  0.191843  0.252207  0.800881   
ZZEF1          50.589753        0.

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 6.20 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           49.259383    1.967730e-06  0.001547  0.681581  0.495504   
A1BG-AS1        8.917060   -5.853045e-07  0.001468 -0.158488  0.874072   
A2M            17.648511   -4.318617e-07  0.001448 -0.120703  0.903926   
A2M-AS1        47.593598   -2.011321e-06  0.001421 -0.723529  0.469355   
A2ML1           0.283424   -2.820914e-07  0.001375 -0.094126  0.925009   
...                  ...             ...       ...       ...       ...   
ZXDC           21.772692   -1.845970e-06  0.001442 -0.535276  0.592459   
ZYG11A          0.235857   -2.464692e-07  0.001346 -0.094294  0.924876   
ZYG11B         19.735799   -1.635186e-06  0.001451 -0.462077  0.644026   
ZYX           197.050642    5.255533e-06  0.001500  0.252207  0.800881   
ZZEF1          50.589753   

Fitting size factors...
... done in 0.04 seconds.

Fitting dispersions...
... done in 2.13 seconds.

Fitting dispersion trend curve...
... done in 0.40 seconds.

Fitting MAP dispersions...
... done in 2.41 seconds.

Fitting LFCs...
... done in 1.74 seconds.

Calculating cook's distance...
... done in 0.09 seconds.

Replacing 126 outlier genes.

Fitting dispersions...
... done in 0.04 seconds.

Fitting MAP dispersions...
... done in 0.04 seconds.

Fitting LFCs...
... done in 0.04 seconds.

Running Wald tests...
... done in 10.68 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG          25.745214       -0.136336  0.248643 -0.548318  0.583473   
A1BG-AS1       2.712978        0.335790  0.266716  1.258979  0.208038   
A2M           38.975712        0.018136  0.230811  0.078574  0.937372   
A2M-AS1        0.986788       -0.151769  0.448897 -0.338092  0.735293   
A2ML1          0.087913       -0.245416  1.640254 -0.149621  0.881064   
...                 ...             ...       ...       ...       ...   
ZXDC           4.706702       -0.314368  0.247365 -1.270865  0.203777   
ZYG11A         0.195779       -0.499984  1.622292 -0.308196  0.757933   
ZYG11B         7.493308       -0.175285  0.266476 -0.657791  0.510673   
ZYX           73.714732        0.038158  0.137998  0.276514  0.782154   
ZZEF1          7.039874       -0.290618  0.196

... done in 2.80 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG          25.745214       -0.025236  0.143650 -0.548318  0.583473   
A1BG-AS1       2.712978        0.062333  0.171494  1.258979  0.208038   
A2M           38.975712        0.003975  0.138517  0.078574  0.937372   
A2M-AS1        0.986788       -0.009414  0.159900 -0.338092  0.735293   
A2ML1          0.087913       -0.001084  0.168231 -0.149621  0.881064   
...                 ...             ...       ...       ...       ...   
ZXDC           4.706702       -0.066264  0.161961 -1.270865  0.203777   
ZYG11A         0.195779       -0.006260  0.155871 -0.308196  0.757933   
ZYG11B         7.493308       -0.029851  0.147666 -0.657791  0.510673   
ZYX           73.714732        0.016240  0.108703  0.276514  0.782154   
ZZEF1          7.039874       -0.095269

Fitting size factors...
... done in 0.03 seconds.

Fitting dispersions...
... done in 2.00 seconds.

Fitting dispersion trend curve...
... done in 0.39 seconds.

Fitting MAP dispersions...
... done in 2.34 seconds.

Fitting LFCs...
... done in 1.81 seconds.

Calculating cook's distance...
... done in 0.06 seconds.

Replacing 54 outlier genes.

Fitting dispersions...
... done in 0.02 seconds.

Fitting MAP dispersions...
... done in 0.02 seconds.

Fitting LFCs...
... done in 0.02 seconds.

Running Wald tests...
... done in 10.64 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            3.420936        0.910405  0.282033  3.228006  0.001247   
A1BG-AS1        0.585804        0.199409  0.493449  0.404112  0.686130   
A2M           212.601385        0.414325  0.181063  2.288285  0.022121   
A2M-AS1         0.688039       -0.742509  0.565377 -1.313300  0.189082   
A2ML1           0.087024        0.205179  1.737480  0.118090  0.905996   
...                  ...             ...       ...       ...       ...   
ZXDC            2.902518       -0.082311  0.376034 -0.218892  0.826734   
ZYG11A          0.029248        0.272497  3.069963  0.088762  0.929271   
ZYG11B          6.029228        0.025885  0.249445  0.103769  0.917353   
ZYX            13.099102        0.405874  0.150850  2.690572  0.007133   
ZZEF1           3.714210        0.

... done in 2.60 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            3.420936        0.737634  0.286873  3.228006  0.001247   
A1BG-AS1        0.585804        0.027669  0.261104  0.404112  0.686130   
A2M           212.601385        0.321767  0.180599  2.288285  0.022121   
A2M-AS1         0.688039       -0.158076  0.338386 -1.313300  0.189082   
A2ML1           0.087024       -0.014676  0.278997  0.118090  0.905996   
...                  ...             ...       ...       ...       ...   
ZXDC            2.902518       -0.021069  0.237167 -0.218892  0.826734   
ZYG11A          0.029248       -0.008557  0.250941  0.088762  0.929271   
ZYG11B          6.029228        0.011692  0.193106  0.103769  0.917353   
ZYX            13.099102        0.335234  0.150847  2.690572  0.007133   
ZZEF1           3.714210   

Fitting size factors...
... done in 0.03 seconds.

Fitting dispersions...
... done in 2.10 seconds.

Fitting dispersion trend curve...
... done in 0.42 seconds.

Fitting MAP dispersions...
... done in 2.43 seconds.

Fitting LFCs...
... done in 2.25 seconds.

Calculating cook's distance...
... done in 0.08 seconds.

Replacing 320 outlier genes.

Fitting dispersions...
... done in 0.10 seconds.

Fitting MAP dispersions...
... done in 0.10 seconds.

Fitting LFCs...
... done in 0.12 seconds.

Running Wald tests...
... done in 10.52 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
              baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
feature_name                                                                  
A1BG          9.287246       -0.513872  0.423795 -1.212548  0.225303  0.409370
A1BG-AS1      1.914802       -1.516623  0.406009 -3.735440  0.000187  0.002369
A2M           4.109797        1.045371  0.580974  1.799342  0.071965  0.190632
A2M-AS1       0.398711       -1.678088  0.623440 -2.691658  0.007110       NaN
A2ML1         0.020607       -0.665224  3.023561 -0.220013  0.825861       NaN
...                ...             ...       ...       ...       ...       ...
ZXDC          4.202048       -0.243010  0.238332 -1.019626  0.307906  0.500928
ZYG11A        0.132186        0.348567  1.178512  0.295769  0.767407       NaN
ZYG11B        6.671836        0.285723  0.224307  1.273803  0.202733  0.380821
ZYX           5.263825        0.621081  0.359226  1.7

... done in 2.44 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
              baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
feature_name                                                                  
A1BG          9.287246       -0.266714  0.371416 -1.212548  0.225303  0.409370
A1BG-AS1      1.914802       -1.333762  0.427292 -3.735440  0.000187  0.002369
A2M           4.109797        0.455060  0.566899  1.799342  0.071965  0.190632
A2M-AS1       0.398711       -1.501786  0.730030 -2.691658  0.007110       NaN
A2ML1         0.020607        0.026643  1.017134 -0.220013  0.825861       NaN
...                ...             ...       ...       ...       ...       ...
ZXDC          4.202048       -0.183487  0.225368 -1.019626  0.307906  0.500928
ZYG11A        0.132186        0.240686  0.997138  0.295769  0.767407       NaN
ZYG11B        6.671836        0.224880  0.215102  1.273803  0.202733  0.380821
ZYX           5.263825        0.401044  0.3490

Fitting size factors...
... done in 0.02 seconds.

Fitting dispersions...
... done in 1.64 seconds.

Fitting dispersion trend curve...
... done in 0.37 seconds.

Fitting MAP dispersions...
... done in 1.94 seconds.

Fitting LFCs...
... done in 1.72 seconds.

Calculating cook's distance...
... done in 0.03 seconds.

Replacing 227 outlier genes.

Fitting dispersions...
... done in 0.08 seconds.

Fitting MAP dispersions...
... done in 0.08 seconds.

Fitting LFCs...
... done in 0.08 seconds.

Running Wald tests...
... done in 10.47 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            2.753374        1.726825  0.544518  3.171291  0.001518   
A1BG-AS1        0.367909        0.801320  1.081224  0.741123  0.458619   
A2M           332.107188       -0.180609  0.446152 -0.404816  0.685613   
A2M-AS1         0.886699       -1.111197  0.833851 -1.332608  0.182660   
A2ML1           0.044637       -0.825762  3.140544 -0.262936  0.792600   
...                  ...             ...       ...       ...       ...   
ZXDC            0.825521       -0.428782  0.820566 -0.522544  0.601291   
ZYG11A          0.043281       -0.396306  3.140363 -0.126198  0.899575   
ZYG11B          2.116193        0.109004  0.421844  0.258398  0.796100   
ZYX            16.858977       -0.357000  0.247358 -1.443254  0.148949   
ZZEF1           1.476396        0.

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 5.83 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            2.753374    1.254283e+00  0.567963  3.171291  0.001518   
A1BG-AS1        0.367909    1.218913e-06  0.001864  0.741123  0.458619   
A2M           332.107188   -9.694307e-07  0.001422 -0.404816  0.685613   
A2M-AS1         0.886699   -1.665328e-06  0.001387 -1.332608  0.182660   
A2ML1           0.044637   -2.222531e-07  0.001163 -0.262936  0.792600   
...                  ...             ...       ...       ...       ...   
ZXDC            0.825521   -5.131180e-07  0.001479 -0.522544  0.601291   
ZYG11A          0.043281    1.588732e-07  0.002076 -0.126198  0.899575   
ZYG11B          2.116193    6.534478e-07  0.001639  0.258398  0.796100   
ZYX            16.858977   -6.113224e-06  0.001443 -1.443254  0.148949   
ZZEF1           1.476396   

Fitting size factors...
... done in 0.04 seconds.

Fitting dispersions...
... done in 2.28 seconds.

Fitting dispersion trend curve...
... done in 0.43 seconds.

Fitting MAP dispersions...
... done in 2.80 seconds.

Fitting LFCs...
... done in 2.00 seconds.

Calculating cook's distance...
... done in 0.10 seconds.

Replacing 193 outlier genes.

Fitting dispersions...
... done in 0.07 seconds.

Fitting MAP dispersions...
... done in 0.06 seconds.

Fitting LFCs...
... done in 0.07 seconds.

Running Wald tests...
... done in 10.71 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat        pvalue  \
feature_name                                                                 
A1BG           61.624709        0.409183  0.251234  1.628691  1.033785e-01   
A1BG-AS1       11.192793       -0.167666  0.155082 -1.081146  2.796321e-01   
A2M           493.374677        1.781544  0.229750  7.754284  8.884356e-15   
A2M-AS1         2.846238        0.051125  0.265131  0.192831  8.470915e-01   
A2ML1           0.657493        0.858547  1.140514  0.752772  4.515867e-01   
...                  ...             ...       ...       ...           ...   
ZXDC           17.776871       -0.070301  0.164487 -0.427396  6.690910e-01   
ZYG11A          0.181475        1.205039  1.073093  1.122959  2.614550e-01   
ZYG11B         74.453040       -0.376119  0.156126 -2.409072  1.599314e-02   
ZYX           373.453465        0.170851  0.138671  1.232060  2.

... done in 2.75 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat        pvalue  \
feature_name                                                                 
A1BG           61.624709        0.305510  0.243987  1.628691  1.033785e-01   
A1BG-AS1       11.192793       -0.145239  0.150885 -1.081146  2.796321e-01   
A2M           493.374677        1.729215  0.231919  7.754284  8.884356e-15   
A2M-AS1         2.846238        0.021143  0.233271  0.192831  8.470915e-01   
A2ML1           0.657493       -0.021213  0.453969  0.752772  4.515867e-01   
...                  ...             ...       ...       ...           ...   
ZXDC           17.776871       -0.059363  0.154806 -0.427396  6.690910e-01   
ZYG11A          0.181475        0.008360  0.483878  1.122959  2.614550e-01   
ZYG11B         74.453040       -0.337140  0.152797 -2.409072  1.599314e-02   
ZYX           373.453465        0.166083  0.135525  1.232

Fitting size factors...
... done in 0.01 seconds.

Fitting dispersions...
... done in 1.30 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend(vst)
... done in 0.31 seconds.

Fitting MAP dispersions...
... done in 1.91 seconds.

Fitting LFCs...
... done in 1.67 seconds.

Calculating cook's distance...
... done in 0.01 seconds.

Replacing 165 outlier genes.

Fitting dispersions...
... done in 0.05 seconds.

Fitting MAP dispersions...
... done in 0.06 seconds.

Fitting LFCs...
... done in 0.05 seconds.

Running Wald tests...
... done in 10.64 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            0.049570        0.970603  2.468966  0.393121  0.694230   
A1BG-AS1        0.064780        0.332821  2.526817  0.131716  0.895209   
A2M             2.969021        0.220362  0.987247  0.223209  0.823373   
A2M-AS1         0.458062       -1.152237  1.550626 -0.743079  0.457434   
A2ML1           0.000000             NaN       NaN       NaN       NaN   
...                  ...             ...       ...       ...       ...   
ZXDC            6.761438       -0.736332  0.459147 -1.603695  0.108781   
ZYG11A          0.000000             NaN       NaN       NaN       NaN   
ZYG11B         10.061387       -0.433302  0.375258 -1.154676  0.248223   
ZYX           139.065266        0.022885  0.204549  0.111881  0.910918   
ZZEF1          10.449286       -0.

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 5.63 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG            0.049570    9.671307e-08  0.001417  0.393121  0.694230   
A1BG-AS1        0.064780   -2.228125e-07  0.001158  0.131716  0.895209   
A2M             2.969021    2.048603e-07  0.001419  0.223209  0.823373   
A2M-AS1         0.458062   -7.880722e-07  0.001169 -0.743079  0.457434   
A2ML1           0.000000             NaN       NaN       NaN       NaN   
...                  ...             ...       ...       ...       ...   
ZXDC            6.761438   -3.663442e-06  0.001255 -1.603695  0.108781   
ZYG11A          0.000000             NaN       NaN       NaN       NaN   
ZYG11B         10.061387   -3.271733e-06  0.001270 -1.154676  0.248223   
ZYX           139.065266    5.142546e-07  0.001379  0.111881  0.910918   
ZZEF1          10.449286   

Fitting size factors...
... done in 0.03 seconds.

Fitting dispersions...
... done in 2.03 seconds.

Fitting dispersion trend curve...
... done in 0.42 seconds.

Fitting MAP dispersions...
... done in 2.32 seconds.

Fitting LFCs...
... done in 2.34 seconds.

Calculating cook's distance...
... done in 0.06 seconds.

Replacing 141 outlier genes.

Fitting dispersions...
... done in 0.05 seconds.

Fitting MAP dispersions...
... done in 0.05 seconds.

Fitting LFCs...
... done in 0.05 seconds.

Running Wald tests...
... done in 10.59 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG          25.920354       -0.301475  0.222074 -1.357542  0.174609   
A1BG-AS1       1.295659        0.112272  0.416550  0.269529  0.787523   
A2M            9.845287       -0.090189  0.559427 -0.161217  0.871923   
A2M-AS1        0.645594       -1.290793  0.972078 -1.327870  0.184221   
A2ML1          0.015050       -1.425868  2.588165 -0.550919  0.581689   
...                 ...             ...       ...       ...       ...   
ZXDC           1.277678       -0.276857  0.451928 -0.612614  0.540132   
ZYG11A         0.331280       -0.223771  0.653289 -0.342529  0.731953   
ZYG11B         2.948444        0.453814  0.392684  1.155673  0.247815   
ZYX            4.748772       -0.254933  0.356634 -0.714831  0.474714   
ZZEF1          3.404421       -0.272120  0.319

... done in 2.71 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG          25.920354       -0.096844  0.183137 -1.357542  0.174609   
A1BG-AS1       1.295659        0.025356  0.239748  0.269529  0.787523   
A2M            9.845287       -0.004355  0.218801 -0.161217  0.871923   
A2M-AS1        0.645594       -0.010935  0.220984 -1.327870  0.184221   
A2ML1          0.015050        0.000010  0.248723 -0.550919  0.581689   
...                 ...             ...       ...       ...       ...   
ZXDC           1.277678       -0.013319  0.226521 -0.612614  0.540132   
ZYG11A         0.331280        0.017407  0.269614 -0.342529  0.731953   
ZYG11B         2.948444        0.055479  0.236746  1.155673  0.247815   
ZYX            4.748772       -0.031387  0.207016 -0.714831  0.474714   
ZZEF1          3.404421       -0.044202

Fitting size factors...
... done in 0.04 seconds.

Fitting dispersions...
... done in 2.48 seconds.

Fitting dispersion trend curve...
... done in 0.39 seconds.

Fitting MAP dispersions...
... done in 2.21 seconds.

Fitting LFCs...
... done in 2.05 seconds.

Calculating cook's distance...
... done in 0.08 seconds.

Replacing 1459 outlier genes.

Fitting dispersions...
... done in 0.32 seconds.

Fitting MAP dispersions...
... done in 0.23 seconds.

Fitting LFCs...
... done in 0.21 seconds.

Running Wald tests...
... done in 10.88 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG           8.394548       -0.023311  0.335508 -0.069478  0.944609   
A1BG-AS1       1.646350        0.183794  0.459626  0.399877  0.689247   
A2M            1.120256       -0.538052  0.577106 -0.932329  0.351166   
A2M-AS1        0.964130       -0.977202  0.717689 -1.361596  0.173325   
A2ML1          0.039965       -2.118367  3.044130 -0.695886  0.486500   
...                 ...             ...       ...       ...       ...   
ZXDC           2.175864       -0.483790  0.414537 -1.167061  0.243186   
ZYG11A         0.057260       -2.055337  2.425106 -0.847525  0.396703   
ZYG11B         4.984575       -0.067816  0.385280 -0.176016  0.860281   
ZYX           19.014864       -0.347384  0.224488 -1.547451  0.121754   
ZZEF1          6.800469       -0.189774  0.282

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 6.10 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
               baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                            
A1BG           8.394548   -1.880110e-07  0.001673 -0.069478  0.944609   
A1BG-AS1       1.646350    3.721156e-06  0.002116  0.399877  0.689247   
A2M            1.120256   -5.311020e-07  0.001730 -0.932329  0.351166   
A2M-AS1        0.964130   -1.028639e-06  0.001682 -1.361596  0.173325   
A2ML1          0.039965    1.276819e-07  0.002271 -0.695886  0.486500   
...                 ...             ...       ...       ...       ...   
ZXDC           2.175864   -2.659679e-06  0.001786 -1.167061  0.243186   
ZYG11A         0.057260   -6.464404e-08  0.001548 -0.847525  0.396703   
ZYG11B         4.984575   -3.949415e-07  0.001750 -0.176016  0.860281   
ZYX           19.014864   -7.261606e-06  0.001601 -1.547451  0.121754   
ZZEF1          6.800469   -2.570082e-06

In [131]:
contrast["de_res_all"].to_csv("dea_female_tumor_vs_normal.csv")

In [73]:
resDir = resDir_tables

In [74]:
cell_types = contrast["de_res_all"]["cell_type"].unique()

# Loop through each cell type and save its data to CSV
for cell_type in cell_types:
    # Filter the DataFrame for the current cell type
    ct_all_deg = contrast["de_res_all"][contrast["de_res_all"]["cell_type"] == cell_type]
    
    # Remove spaces from cell_type for the filename
    cell_type_filename = cell_type.replace(" ", "")
    
    # Save the full data for the cell type
    filename_deg = f"{cell_type_filename}_deg_80cells.csv"
    filepath_deg = os.path.join(resDir, filename_deg)
    ct_all_deg.to_csv(filepath_deg, index=False)
    print(f"Saved full {cell_type} data to {filepath_deg}")
    
    # Apply additional filtering for significant results
    filtered_df = ct_all_deg[(ct_all_deg['padj'] < 0.1) & (ct_all_deg['log2FoldChange'].abs() > 1)]
    
    # Save the filtered significant data for the cell type
    filename_sig_deg = f"{cell_type_filename}_sig_deg.csv"
    filepath_sig_deg = os.path.join(resDir, filename_sig_deg)
    filtered_df.to_csv(filepath_sig_deg, index=False)
    print(f"Saved significant {cell_type} data to {filepath_sig_deg}")

Saved full B cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/Bcell_deg.csv
Saved significant B cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/Bcell_sig_deg.csv
Saved full CD4+ T cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/CD4+Tcell_deg.csv
Saved significant CD4+ T cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/CD4+Tcell_sig_deg.csv
Saved full CD8+ T cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/CD8+Tcell_deg.csv
Saved significant CD8+ T cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/CD8+Tcell_sig_deg.csv
Saved full dendritic cell data to /data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/dendriticcell_deg.csv
Saved significant d

In [75]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Assuming `contrast` and `dc` are already defined and set up in your environment
cell_types = contrast["de_res_all"]["cell_type"].unique()

# Create a PDF file to save all volcano plots
with PdfPages(f"{resDir_figures}/volcano_plots_all_cell_types.pdf") as pdf:
    for cell_type in cell_types:
        # Filter results for the current cell type
        results_df = contrast["de_res_all"][contrast["de_res_all"]["cell_type"] == cell_type]

        # Generate the volcano plot
        plt.figure(figsize=(8, 4))
        dc.plot_volcano_df(
            results_df,
            x='log2FoldChange',
            y='padj',
            top=20,
            figsize=(8, 4)
        )
        
        # Set title to indicate cell type
        plt.title(f"Volcano Plot for Cell Type: {cell_type}")
        
        # Save the current figure to the PDF
        pdf.savefig()
        plt.close()  # Close the plot to avoid display issues in the next iteration

print("PDF with all volcano plots saved as 'volcano_plots_all_cell_types.pdf'")

PDF with all volcano plots saved as 'volcano_plots_all_cell_types.pdf'


<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

<Figure size 800x400 with 0 Axes>

In [135]:
pdata_tumor_normal_female.obs

Unnamed: 0,sample,standardized_cell_type,uicc_stage,ever_smoker,donor_id,origin,dataset,tumor_stage,study,platform,...,tissue_ontology_term_id,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage,psbulk_n_cells,psbulk_counts
He_Fan_2021_LUAD1_B cell,He_Fan_2021_LUAD1,B cell,I,no,He_Fan_2021_P1,tumor-primary,He_Fan_2021,early,He_Fan_2021,10x,...,UBERON:0002048,10x 3' v2,lung adenocarcinoma,Homo sapiens,female,lung,unknown,48-year-old human stage,89.0,118337.0
He_Fan_2021_LUAD3_B cell,He_Fan_2021_LUAD3,B cell,I,yes,He_Fan_2021_P3,tumor-primary,He_Fan_2021,early,He_Fan_2021,10x,...,UBERON:0002048,10x 3' v2,lung adenocarcinoma,Homo sapiens,female,lung,unknown,58-year-old human stage,45.0,44666.0
He_Fan_2021_LUAD5_B cell,He_Fan_2021_LUAD5,B cell,II,no,He_Fan_2021_P5,tumor-primary,He_Fan_2021,early,He_Fan_2021,10x,...,UBERON:0002048,10x 3' v2,lung adenocarcinoma,Homo sapiens,female,lung,unknown,56-year-old human stage,328.0,769382.0
He_Fan_2021_N1_B cell,He_Fan_2021_N1,B cell,I,no,He_Fan_2021_P1,normal-adjacent,He_Fan_2021,early,He_Fan_2021,10x,...,UBERON:0002048,10x 3' v2,lung adenocarcinoma,Homo sapiens,female,lung,unknown,48-year-old human stage,75.0,102672.0
He_Fan_2021_N3_B cell,He_Fan_2021_N3,B cell,I,yes,He_Fan_2021_P3,normal-adjacent,He_Fan_2021,early,He_Fan_2021,10x,...,UBERON:0002048,10x 3' v2,lung adenocarcinoma,Homo sapiens,female,lung,unknown,58-year-old human stage,21.0,26543.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKIM-V-2_P5_tumor_primary_regulatory T cell,UKIM-V-2_P5_tumor_primary,regulatory T cell,II,yes,UKIM-V-2_P5,tumor-primary,UKIM-V-2,early,UKIM-V,BD-Rhapsody,...,UBERON:0002048,BD Rhapsody Whole Transcriptome Analysis,lung adenocarcinoma,Homo sapiens,female,lung,unknown,62-year-old human stage,183.0,466299.0
UKIM-V-2_P7_normal_adjacent_regulatory T cell,UKIM-V-2_P7_normal_adjacent,regulatory T cell,III,yes,UKIM-V-2_P7,normal-adjacent,UKIM-V-2,advanced,UKIM-V,BD-Rhapsody,...,UBERON:0002048,BD Rhapsody Whole Transcriptome Analysis,lung adenocarcinoma,Homo sapiens,female,lung,unknown,69-year-old human stage,12.0,26681.0
UKIM-V-2_P7_tumor_primary_regulatory T cell,UKIM-V-2_P7_tumor_primary,regulatory T cell,III,yes,UKIM-V-2_P7,tumor-primary,UKIM-V-2,advanced,UKIM-V,BD-Rhapsody,...,UBERON:0002048,BD Rhapsody Whole Transcriptome Analysis,lung adenocarcinoma,Homo sapiens,female,lung,unknown,69-year-old human stage,90.0,188984.0
UKIM-V_P1_normal_adjacent_regulatory T cell,UKIM-V_P1_normal_adjacent,regulatory T cell,I,yes,UKIM-V_P1,normal-adjacent,UKIM-V,early,UKIM-V,BD-Rhapsody,...,UBERON:0002048,BD Rhapsody Whole Transcriptome Analysis,lung adenocarcinoma,Homo sapiens,female,lung,unknown,65-year-old human stage,34.0,548293.0


In [139]:
import pandas as pd
import scanpy as sc

def extract_b_cell_data(adata, gender):
    """
    Extracts B cell data from an AnnData object and returns count matrix and metadata.
    
    Parameters:
        adata (AnnData): The AnnData object
        gender (str): Gender label for samples ("female" or "male")

    Returns:
        count_matrix (pd.DataFrame): Gene expression matrix
        samplesheet (pd.DataFrame): Metadata for each sample
    """
    # Subset to only B cells
    b_cell_adata = adata[adata.obs["standardized_cell_type"] == "B cell"].copy()

    # Extract count matrix (assuming raw counts are stored in .X or .layers["counts"])
    count_matrix = pd.DataFrame(
        b_cell_adata.X.A if isinstance(b_cell_adata.X, np.matrix) else b_cell_adata.X,
        index=b_cell_adata.obs_names,
        columns=b_cell_adata.var_names
    )
    
    # Extract metadata for the samplesheet
    samplesheet = b_cell_adata.obs.copy()
    samplesheet["gender"] = gender  # Add gender column
    
    return count_matrix, samplesheet

# Process female and male datasets
count_matrix_female, samplesheet_female = extract_b_cell_data(pdata_tumor_normal_female, "female")
count_matrix_male, samplesheet_male = extract_b_cell_data(pdata_tumor_normal_male, "male")

# Combine results from both genders
final_count_matrix = pd.concat([count_matrix_female, count_matrix_male], axis=0)
final_samplesheet = pd.concat([samplesheet_female, samplesheet_male], axis=0)

# Save to CSV
final_count_matrix.T.to_csv("B_cell_count_matrix.csv")
final_samplesheet.to_csv("B_cell_samplesheet.csv")

print("Count matrix and samplesheet saved successfully.")


Count matrix and samplesheet saved successfully.
