In [1]:
import warnings
import os
from pathlib import Path 
import pandas as pd
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
import anndata
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import decoupler as dc
import itertools
from scipy import sparse
#import lib.data_helpers as dh

#import lib.scanpy_helpers as sh
#import lib.pl.util as pu

  numba.core.entrypoints.init_all()


In [2]:
# Result directories for the LUAD differential-expression run (tables and figures).
resDir_tables = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/"
resDir_figures = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/figures/LUAD_DE/"

# Directory of the paired pseudobulk input and the .h5ad file inside it.
path = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/011_analysis_paired_remove_xy/pseudobulk/"
# `path` already ends in "/", so the joined path contains "//" (kept exactly as before).
input_path = path + "/paired_adata_clean.h5ad"

In [3]:
#adata = sc.read_h5ad(input_path) 

In [4]:
# Single-cell input for this notebook. Note that this is NOT `input_path` from the
# config cell: the file is read from the 012_LUAD/old/pseudobulk directory instead.
luad_h5ad_file = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/old/pseudobulk/paired_adata_clean_luad_all_cell_type.h5ad"
adata = sc.read_h5ad(luad_h5ad_file)

In [5]:
# Canonical cell-type labels that are kept for the analysis.
selected_cell_types = {
    'B cell', 'plasma cell',
    'CD4+ T cell', 'CD8+ T cell', 'regulatory T cell',
    'macrophage', 'neutrophil', 'dendritic cell',
    'epithelial cell', 'malignant cell',
    'endothelial cell', 'fibroblast',
}

# Fine-grained label -> canonical label. The entry order is kept unchanged on purpose:
# `replace` on a categorical column is applied entry by entry.
cell_type_mapping = {
    # fibroblasts
    'fibroblast of lung': 'fibroblast',
    # endothelial cells
    'vein endothelial cell': 'endothelial cell',
    'capillary endothelial cell': 'endothelial cell',
    'pulmonary artery endothelial cell': 'endothelial cell',
    'endothelial cell of lymphatic vessel': 'endothelial cell',
    # dendritic cells
    'conventional dendritic cell': 'dendritic cell',
    'CD1c-positive myeloid dendritic cell': 'dendritic cell',
    'plasmacytoid dendritic cell': 'dendritic cell',
    # fibroblasts (second synonym)
    'bronchus fibroblast of lung': 'fibroblast',
}

# Derive the canonical label for every cell and store it next to the raw `cell_type`.
canonical_labels = adata.obs['cell_type'].replace(cell_type_mapping)
adata.obs['standardized_cell_type'] = canonical_labels

# Drop every cell whose canonical label is not in the selected set.
is_selected = adata.obs['standardized_cell_type'].isin(selected_cell_types)
adata = adata[is_selected].copy()

# Sanity check: number of cells per canonical label after filtering.
print(adata.obs['standardized_cell_type'].value_counts())


standardized_cell_type
macrophage           104271
CD4+ T cell           94503
CD8+ T cell           72603
epithelial cell       21039
malignant cell        16939
B cell                16853
regulatory T cell     16785
dendritic cell        16702
endothelial cell      16150
plasma cell           15096
neutrophil             6146
fibroblast             4674
Name: count, dtype: int64


## There are malignant cells in normal_adjacent samples
They come from 59 different patients and 14 datasets.
I cannot explain this, so I remove these "mislabeled" cells.

In [6]:
# Number of cells for every (canonical cell type, origin) combination, as a tidy table.
cells_per_type_and_origin = adata.obs.groupby(['standardized_cell_type', 'origin']).size()
grouped = cells_per_type_and_origin.reset_index(name='cell_count')

In [7]:
# Display the per-(cell type, origin) cell counts computed in the previous cell.
grouped

Unnamed: 0,standardized_cell_type,origin,cell_count
0,epithelial cell,normal_adjacent,4957
1,epithelial cell,tumor_primary,16082
2,macrophage,normal_adjacent,65453
3,macrophage,tumor_primary,38818
4,B cell,normal_adjacent,2888
5,B cell,tumor_primary,13965
6,dendritic cell,normal_adjacent,6752
7,dendritic cell,tumor_primary,9950
8,CD4+ T cell,normal_adjacent,45654
9,CD4+ T cell,tumor_primary,48849


In [8]:
# Keep a cell unless it is BOTH from a normal_adjacent sample AND labelled 'malignant cell'.
not_normal = adata.obs['origin'] != 'normal_adjacent'
not_malignant = adata.obs['standardized_cell_type'] != 'malignant cell'
mask = not_normal | not_malignant

# Overwrite `adata` with an independent copy that excludes those cells
# (there is no separate `adata_filtered` object; `adata` itself is replaced).
adata = adata[mask].copy()


In [9]:
# Recompute the (cell type, origin) cell counts after removing malignant cells from normal samples.
cells_per_type_and_origin = adata.obs.groupby(['standardized_cell_type', 'origin']).size()
grouped = cells_per_type_and_origin.reset_index(name='cell_count')
grouped

Unnamed: 0,standardized_cell_type,origin,cell_count
0,epithelial cell,normal_adjacent,4957
1,epithelial cell,tumor_primary,16082
2,macrophage,normal_adjacent,65453
3,macrophage,tumor_primary,38818
4,B cell,normal_adjacent,2888
5,B cell,tumor_primary,13965
6,dendritic cell,normal_adjacent,6752
7,dendritic cell,tumor_primary,9950
8,CD4+ T cell,normal_adjacent,45654
9,CD4+ T cell,tumor_primary,48849


In [10]:
# View of the cells from normal_adjacent samples. Malignant cells in normal samples
# were already removed from `adata` above, so no extra cell-type filter is applied here.
is_normal_adjacent = adata.obs["origin"] == "normal_adjacent"
adata_normal = adata[is_normal_adjacent]

In [11]:
# Unique raw (un-standardized) `cell_type` labels present in the normal_adjacent subset.
set(adata_normal.obs.cell_type)

{'B cell',
 'CD1c-positive myeloid dendritic cell',
 'CD4+ T cell',
 'CD8+ T cell',
 'bronchus fibroblast of lung',
 'capillary endothelial cell',
 'conventional dendritic cell',
 'dendritic cell',
 'endothelial cell of lymphatic vessel',
 'epithelial cell',
 'fibroblast of lung',
 'macrophage',
 'neutrophil',
 'plasma cell',
 'plasmacytoid dendritic cell',
 'pulmonary artery endothelial cell',
 'regulatory T cell',
 'vein endothelial cell'}

In [12]:
# View of the cells from tumor_primary samples.
is_tumor_primary = adata.obs["origin"] == "tumor_primary"
adata_tumor = adata[is_tumor_primary]

In [13]:
# Unique raw `cell_type` labels in the normal_adjacent subset (same expression as two cells above).
# NOTE(review): this follows the creation of `adata_tumor`, so it may have been meant to
# inspect `adata_tumor` instead — confirm.
set(adata_normal.obs.cell_type)

{'B cell',
 'CD1c-positive myeloid dendritic cell',
 'CD4+ T cell',
 'CD8+ T cell',
 'bronchus fibroblast of lung',
 'capillary endothelial cell',
 'conventional dendritic cell',
 'dendritic cell',
 'endothelial cell of lymphatic vessel',
 'epithelial cell',
 'fibroblast of lung',
 'macrophage',
 'neutrophil',
 'plasma cell',
 'plasmacytoid dendritic cell',
 'pulmonary artery endothelial cell',
 'regulatory T cell',
 'vein endothelial cell'}

In [14]:
# Re-creates the same normal_adjacent view as the earlier cell (`adata` has not changed in between).
is_normal_adjacent = adata.obs["origin"] == "normal_adjacent"
adata_normal = adata[is_normal_adjacent]

In [15]:
# Re-creates the same tumor_primary view as the earlier cell (`adata` has not changed in between).
is_tumor_primary = adata.obs["origin"] == "tumor_primary"
adata_tumor = adata[is_tumor_primary]

In [16]:
# Display the AnnData summary (dimensions and annotation columns) after filtering.
adata

AnnData object with n_obs × n_vars = 401187 × 17811
    obs: 'sample', 'uicc_stage', 'ever_smoker', 'age', 'donor_id', 'origin', 'dataset', 'ann_fine', 'cell_type_predicted', 'doublet_status', 'leiden', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'ann_coarse', 'cell_type_tumor', 'tumor_stage', 'EGFR_mutation', 'TP53_mutation', 'ALK_mutation', 'BRAF_mutation', 'ERBB2_mutation', 'KRAS_mutation', 'ROS_mutation', 'origin_fine', 'study', 'platform', 'cell_type_major', 'cell_type_neutro', 'cell_type_neutro_coarse', 'suspension_type', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'standardized_cell_type'
    var: 'is_highly_variable', 'm

In [17]:
# View of the cells whose `sex` annotation is "male".
is_male = adata.obs["sex"] == "male"
adata_male = adata[is_male]

In [18]:
# View of the cells whose `sex` annotation is "female".
is_female = adata.obs["sex"] == "female"
adata_female = adata[is_female]

## Prepare for pseudobulk

In [19]:
# Comparisons to test: each entry names the obs column (`var`), the level of interest
# (`condition`) and the baseline level (`reference`).
contrasts = [
    {
        "var": "origin",
        "condition": "tumor_primary",
        "reference": "normal_adjacent",
    },
]

In [20]:
# Display the list of contrast definitions.
contrasts

[{'var': 'origin',
  'condition': 'tumor_primary',
  'reference': 'normal_adjacent'}]

In [21]:
# Preview the "<condition>_vs_<reference>" name of the first contrast (spaces become underscores).
contrasts[0]["condition"].replace(" ", "_") + "_vs_" + contrasts[0]["reference"].replace(" ", "_")

'tumor_primary_vs_normal_adjacent'

In [22]:
# Display the directory that result tables are written to.
resDir_tables

'/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/'

In [25]:
# Every contrast writes its tables into the same directory, so create it once up front.
os.makedirs(resDir_tables, mode = 0o750, exist_ok = True)

# Attach a name and an output directory to each contrast definition.
# NOTE: later cells read the loop variable `contrast` after this loop has finished
# (it then refers to the last element of `contrasts`), so the variable name must stay.
for contrast in contrasts:
    # "<condition>_vs_<reference>", with spaces replaced by underscores; used in
    # output file names and as part of the LFC-shrinkage coefficient name.
    name = contrast["condition"].replace(" ", "_") + "_vs_" + contrast["reference"].replace(" ", "_")
    contrast["name"] = name
    # The previously computed Path(resDir_tables, name, "tables") was never used:
    # results have always been written directly into `resDir_tables`.
    contrast["res_dir"] = resDir_tables

In [26]:
# Name of the obs column that holds the cell-type label used for pseudobulk grouping
# and for selecting one cell type at a time in the DE loop.
cell_type_class = "standardized_cell_type"

In [27]:
# Display the output directory of the current contrast. `contrast` is the variable left
# over from the loop over `contrasts` in an earlier cell (i.e. the last contrast).
contrast["res_dir"]

'/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/'

In [28]:
# Aggregate single cells into one pseudobulk profile per (sample, cell type) by summing
# the 'count' layer; `min_cells` / `min_counts` are passed through as filtering thresholds.
pseudobulk_options = dict(
    sample_col='sample',
    groups_col=cell_type_class,
    layer='count',
    mode='sum',
    min_cells=10,
    min_counts=1000,
)
pdata = dc.get_pseudobulk(adata, **pseudobulk_options)

In [29]:
# Cast the summed pseudobulk matrix to integers
# (presumably because DESeq2 expects integer counts — confirm).
pdata.X = pdata.X.astype(int)

In [30]:
# Use the `feature_name` column of `var` (as strings) as the gene identifiers.
# NOTE(review): assumes `feature_name` values are unique — confirm.
pdata.var_names = pdata.var["feature_name"].astype(str)

In [31]:
# Alias for the pseudobulk sample annotation table (same object as `pdata.obs`, not a copy).
pdata_obs_df = pdata.obs

In [32]:
# Display the pseudobulk AnnData summary.
pdata

AnnData object with n_obs × n_vars = 1269 × 17811
    obs: 'sample', 'standardized_cell_type', 'uicc_stage', 'ever_smoker', 'donor_id', 'origin', 'dataset', 'tumor_stage', 'study', 'platform', 'suspension_type', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'psbulk_n_cells', 'psbulk_counts'
    var: 'is_highly_variable', 'mito', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    layers: 'psbulk_props'

In [33]:
# For every donor, flag whether its pseudobulk profiles cover both 'tumor_primary'
# and 'normal_adjacent' (result: boolean Series indexed by donor_id).
required_origins = {'tumor_primary', 'normal_adjacent'}
origins_by_donor = pdata_obs_df.groupby('donor_id')['origin']
patients_with_both_conditions = origins_by_donor.apply(
    lambda donor_origins: required_origins.issubset(donor_origins)
)

In [34]:
# donor_id values whose flag is True, i.e. donors with both tumor and normal-adjacent profiles.
paired_donors = patients_with_both_conditions.loc[patients_with_both_conditions]
patients_with_both_conditions_ids = paired_donors.index

In [35]:
# Keep only pseudobulk profiles from donors that have both tumor and normal-adjacent samples.
# `.copy()` turns the slice into an independent AnnData object rather than a view of `pdata`:
# later cells assign into `pdata_tumor_normal.obs`, and writing to a view makes AnnData
# copy implicitly and emit a warning.
pdata_tumor_normal = pdata[pdata.obs['donor_id'].isin(patients_with_both_conditions_ids)].copy()

In [36]:
# Display the name of the cell-type column in use.
cell_type_class

'standardized_cell_type'

In [37]:
## Run deseq2 on pseudobulk all cell types
#cell_types = pdata_tumor_normal.obs[cell_type_class].unique()

In [66]:
# Display the reference level of the current contrast (`contrast` is the leftover loop variable).
contrast["reference"]

'normal_adjacent'

In [67]:
# Display the condition level of the current contrast (`contrast` is the leftover loop variable).
contrast["condition"]

'tumor_primary'

In [91]:
# Display the name of the current contrast. A later cell overwrites this value with the
# hyphenated form "tumor-primary_vs_normal-adjacent".
contrast["name"]

'tumor_primary_vs_normal_adjacent'

In [40]:
# Store `sex` as a categorical column.
# NOTE(review): `pdata_tumor_normal` was created by slicing `pdata`; if it is still a view,
# this assignment makes AnnData copy it implicitly and emit a warning — confirm.
pdata_tumor_normal.obs['sex'] = pdata_tumor_normal.obs['sex'].astype('category')

  pdata_tumor_normal.obs['sex'] = pdata_tumor_normal.obs['sex'].astype('category')


In [41]:
# Number of CPUs handed to DeseqDataSet(n_cpus=...) in the DE loop.
cpus=16

In [42]:
# Display the summary of the paired (tumor + normal-adjacent donors only) pseudobulk object.
pdata_tumor_normal

AnnData object with n_obs × n_vars = 1268 × 17811
    obs: 'sample', 'standardized_cell_type', 'uicc_stage', 'ever_smoker', 'donor_id', 'origin', 'dataset', 'tumor_stage', 'study', 'platform', 'suspension_type', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'psbulk_n_cells', 'psbulk_counts'
    var: 'is_highly_variable', 'mito', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    layers: 'psbulk_props'

In [44]:
# Collapse remaining fine-grained labels into their canonical cell type in one pass.
# NOTE(review): the single-cell data was already restricted to `selected_cell_types`
# before pseudobulking, and none of the labels below is in that set, so this is
# expected to be a no-op — verify, or move these synonyms into `cell_type_mapping`.
label_synonyms = {
    'epithelial cell of lung': 'epithelial cell',
    'multi-ciliated epithelial cell': 'epithelial cell',
    'alveolar macrophage': 'macrophage',
    'CD4-positive, alpha-beta T cell': 'CD4+ T cell',
    'CD8-positive, alpha-beta T cell': 'CD8+ T cell',
}
pdata_tumor_normal.obs["standardized_cell_type"] = (
    pdata_tumor_normal.obs["standardized_cell_type"].replace(label_synonyms)
)

In [57]:
# Number of pseudobulk profiles per origin in the paired object.
pdata_tumor_normal.obs["origin"].value_counts()

origin
tumor_primary      708
normal_adjacent    560
Name: count, dtype: int64

In [47]:
# View of the tumor_primary pseudobulk profiles; used below only to list the cell types.
is_tumor_profile = pdata_tumor_normal.obs["origin"] == "tumor_primary"
pdata_only_tumor = pdata_tumor_normal[is_tumor_profile]

In [48]:
# Display the summary of the tumor-only pseudobulk view.
pdata_only_tumor

View of AnnData object with n_obs × n_vars = 708 × 17811
    obs: 'sample', 'standardized_cell_type', 'uicc_stage', 'ever_smoker', 'donor_id', 'origin', 'dataset', 'tumor_stage', 'study', 'platform', 'suspension_type', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'psbulk_n_cells', 'psbulk_counts'
    var: 'is_highly_variable', 'mito', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    layers: 'psbulk_props'

In [50]:
# Display the directory that result tables are written to.
resDir_tables

'/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/012_LUAD/03012025/tables/LUAD_DE/'

In [51]:
# Cell types to iterate over in the DE loop, taken from the TUMOR profiles only.
# This list therefore includes 'malignant cell', which has no normal-adjacent profiles
# because malignant cells from normal samples were removed earlier in the notebook.
cell_types = pdata_only_tumor.obs[cell_type_class].unique()

In [59]:
# Pseudobulk profiles from male donors. `.copy()` makes this an independent AnnData object
# rather than a view: a later cell rewrites its `origin` column, and assigning into
# `.obs` of a view makes AnnData copy implicitly and emit a warning.
pdata_tumor_normal_male = pdata_tumor_normal[pdata_tumor_normal.obs["sex"]=="male"].copy()

In [60]:
# Pseudobulk profiles from female donors. `.copy()` makes this an independent AnnData object
# rather than a view: a later cell rewrites its `origin` column, and assigning into
# `.obs` of a view makes AnnData copy implicitly and emit a warning.
pdata_tumor_normal_female = pdata_tumor_normal[pdata_tumor_normal.obs["sex"]=="female"].copy()

In [82]:
# Replace "_" with "-" in the origin labels of the male subset; the contrast levels are
# switched to the same hyphenated spelling in a later cell.
# (Presumably because PyDESeq2 does not accept underscores in factor levels — confirm.)
pdata_tumor_normal_male.obs["origin"] = pdata_tumor_normal_male.obs["origin"].str.replace("_", "-", regex=False)


  pdata_tumor_normal_male.obs["origin"] = pdata_tumor_normal_male.obs["origin"].str.replace("_", "-", regex=False)


In [84]:
# Replace "_" with "-" in the origin labels of the female subset; the contrast levels are
# switched to the same hyphenated spelling in a later cell.
# (Presumably because PyDESeq2 does not accept underscores in factor levels — confirm.)
pdata_tumor_normal_female.obs["origin"] = pdata_tumor_normal_female.obs["origin"].str.replace("_", "-", regex=False)


  pdata_tumor_normal_female.obs["origin"] = pdata_tumor_normal_female.obs["origin"].str.replace("_", "-", regex=False)


In [92]:
# The origin labels of the male/female subsets now use "-" instead of "_", so switch the
# contrast levels and name to the same spelling. `contrast` is the dict left over from the
# earlier loop over `contrasts`, so this edits the entry inside `contrasts` in place.
contrast.update(
    reference="normal-adjacent",
    condition="tumor-primary",
    name="tumor-primary_vs_normal-adjacent",
)

In [96]:
# Number of result rows per cell type in the combined DE table.
# NOTE: `contrast["de_res_all"]` is only created by the DESeq2 loop further down, so on a
# fresh top-to-bottom run this cell fails unless that loop has been executed first.
contrast["de_res_all"].standardized_cell_type.value_counts()

standardized_cell_type
B cell              17811
CD4+ T cell         17811
CD8+ T cell         17811
dendritic cell      17811
endothelial cell    17811
epithelial cell     17811
fibroblast          17811
macrophage          17811
Name: count, dtype: int64

In [97]:
# Display the cell types that the DE loop iterates over.
cell_types

array(['B cell', 'CD4+ T cell', 'CD8+ T cell', 'dendritic cell',
       'endothelial cell', 'epithelial cell', 'fibroblast', 'macrophage',
       'malignant cell', 'neutrophil', 'plasma cell', 'regulatory T cell'],
      dtype=object)

## Pydeseq tumor vs normal

In [None]:
import pdb
# Run PyDESeq2 per cell type for every contrast and write one TSV per cell type.
#
# Reads:  contrasts, cell_types, cell_type_class, cpus, pdata_tumor_normal_male
# Writes: contrast["de_res"]      dict: cell type -> results DataFrame
#         contrast["de_res_all"]  all per-cell-type results stacked, with a `cell_type` column
#         <res_dir>/<cell_type>_<contrast name>_deseq.tsv
#
# NOTE(review): only the male subset (`pdata_tumor_normal_male`) is analysed here;
# `pdata_tumor_normal_female` is not used in this loop and the output file names do not
# encode the sex — confirm this is intended.
for contrast in contrasts:
    # Register the result dict up front: it is filled in place, so results of the cell
    # types finished so far stay reachable even if a later cell type fails.
    de_res = {}
    contrast["de_res"] = de_res

    # Both levels of the design factor must be present to fit the comparison.
    required_levels = {contrast["reference"], contrast["condition"]}

    for ct in cell_types:
        print(ct)

        pb_ct = pdata_tumor_normal_male[pdata_tumor_normal_male.obs[cell_type_class] == ct].copy()

        # Skip cell types that lack either level. `cell_types` was derived from the tumor
        # profiles only, so e.g. 'malignant cell' has no normal-adjacent profiles at all.
        present_levels = set(pb_ct.obs[contrast["var"]]).intersection(required_levels)
        if len(present_levels) < 2:
            print(
                "Not running DEseq for: "
                + ct
                + " : only present in: "
                + str(present_levels)
            )
            continue

        # Single-factor design on the contrast variable, with the contrast's reference
        # level as baseline.
        dds = DeseqDataSet(
            adata=pb_ct,
            design_factors=[contrast["var"]],
            ref_level=[contrast["var"], contrast["reference"]],
            refit_cooks=True,
            n_cpus=cpus,
        )

        # Fit size factors, dispersions and LFCs.
        dds.deseq2()

        # Extract the condition-vs-reference contrast.
        stat_res = DeseqStats(
            dds,
            contrast=[contrast["var"], contrast["condition"], contrast["reference"]],
        )

        # Compute Wald test.
        stat_res.summary()

        # Shrink LFCs; the coefficient is named "<var>_<condition>_vs_<reference>".
        coeff = contrast["var"] + "_" + contrast["name"]
        stat_res.lfc_shrink(coeff=coeff)

        # Register cell type results (a new frame; `stat_res.results_df` is left untouched).
        de_res[ct] = stat_res.results_df.assign(
            standardized_cell_type=ct,
            feature_name=stat_res.results_df.index.values,
        )

        # The gene names are already the index, so the duplicate column is dropped on export.
        de_res[ct].drop(columns=["feature_name"]).to_csv(
            Path(
                contrast["res_dir"],
                ct.replace(" ", "_") + "_" + contrast["name"] + "_deseq.tsv",
            ),
            sep="\t",
        )

    # Stack the per-cell-type tables once per contrast (pd.concat fails on an empty list).
    if de_res:
        contrast["de_res_all"] = pd.concat([df.assign(cell_type=ct) for ct, df in de_res.items()])
    else:
        print("No cell type could be tested for contrast: " + contrast["name"])

B cell


Fitting size factors...
... done in 0.03 seconds.

Fitting dispersions...
... done in 1.19 seconds.

Fitting dispersion trend curve...
... done in 0.36 seconds.

Fitting MAP dispersions...
... done in 1.42 seconds.

Fitting LFCs...
... done in 1.42 seconds.

Calculating cook's distance...
... done in 0.06 seconds.

Replacing 59 outlier genes.

Fitting dispersions...
... done in 0.02 seconds.

Fitting MAP dispersions...
... done in 0.02 seconds.

Fitting LFCs...
... done in 0.02 seconds.

Running Wald tests...
... done in 8.72 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
              baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
feature_name                                                                  
A1BG          4.613264        0.000056  0.323332  0.000174  0.999861  0.999978
A1BG-AS1      0.591671       -0.200691  0.570672 -0.351676  0.725081       NaN
A2M           1.206051       -0.223805  0.620775 -0.360525  0.718455       NaN
A2M-AS1       0.082406       -1.195479  1.465479 -0.815760  0.414638       NaN
A2ML1         0.157295       -1.658056  1.488754 -1.113721  0.265399       NaN
...                ...             ...       ...       ...       ...       ...
ZXDC          0.928493       -0.355715  0.578193 -0.615219  0.538410       NaN
ZYG11A        0.066189       -0.837893  1.683108 -0.497825  0.618607       NaN
ZYG11B        1.394873        0.450700  0.493065  0.914079  0.360675       NaN
ZYX           3.686372        0.780150  0.348659  2.2

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 1.96 seconds.

Fitting size factors...


Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
              baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
feature_name                                                                  
A1BG          4.613264        0.000005  0.052210  0.000174  0.999861  0.999978
A1BG-AS1      0.591671        0.000091  0.063279 -0.351676  0.725081       NaN
A2M           1.206051       -0.000674  0.053568 -0.360525  0.718455       NaN
A2M-AS1       0.082406        0.000124  0.060909 -0.815760  0.414638       NaN
A2ML1         0.157295       -0.000482  0.046209 -1.113721  0.265399       NaN
...                ...             ...       ...       ...       ...       ...
ZXDC          0.928493       -0.000956  0.053715 -0.615219  0.538410       NaN
ZYG11A        0.066189        0.000512  0.077715 -0.497825  0.618607       NaN
ZYG11B        1.394873       -0.005089  0.048999  0.914079  0.360675       NaN
ZYX           3.686372        0.006723  0.0617

... done in 0.04 seconds.

Fitting dispersions...
... done in 1.95 seconds.

Fitting dispersion trend curve...
... done in 0.39 seconds.

Fitting MAP dispersions...
... done in 1.74 seconds.

Fitting LFCs...
... done in 1.23 seconds.

Calculating cook's distance...
... done in 0.08 seconds.

Replacing 883 outlier genes.

Fitting dispersions...
... done in 0.11 seconds.

Fitting MAP dispersions...
... done in 0.11 seconds.

Fitting LFCs...
... done in 0.10 seconds.

Running Wald tests...
... done in 8.77 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           68.409417       -0.403161  0.363153 -1.110170  0.266926   
A1BG-AS1       12.580317        0.623197  0.342646  1.818775  0.068946   
A2M            12.078685        0.277251  0.423670  0.654403  0.512852   
A2M-AS1        11.573832       -0.346347  0.355080 -0.975406  0.329359   
A2ML1           0.199986       -0.256593  1.561691 -0.164304  0.869491   
...                  ...             ...       ...       ...       ...   
ZXDC           13.965527        0.137765  0.302803  0.454965  0.649134   
ZYG11A          0.226355       -0.205677  1.871029 -0.109927  0.912467   
ZYG11B         22.581971       -0.044293  0.285489 -0.155148  0.876704   
ZYX           131.585482        0.032505  0.185704  0.175034  0.861053   
ZZEF1          39.032896       -0.

  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
  counts - (counts + size) / (1 + size * np.exp(-xbeta - offset))
... done in 3.11 seconds.



Shrunk log2 fold change & Wald test p-value: origin tumor-primary vs normal-adjacent
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
feature_name                                                             
A1BG           68.409417   -3.090000e-06  0.001431 -1.110170  0.266926   
A1BG-AS1       12.580317    5.328158e-06  0.001753  1.818775  0.068946   
A2M            12.078685    1.613500e-06  0.001617  0.654403  0.512852   
A2M-AS1        11.573832   -2.851820e-06  0.001456 -0.975406  0.329359   
A2ML1           0.199986   -3.554808e-08  0.001541 -0.164304  0.869491   
...                  ...             ...       ...       ...       ...   
ZXDC           13.965527    1.712778e-06  0.001600  0.454965  0.649134   
ZYG11A          0.226355   -8.508677e-08  0.001524 -0.109927  0.912467   
ZYG11B         22.581971   -5.581784e-07  0.001532 -0.155148  0.876704   
ZYX           131.585482    1.051806e-06  0.001554  0.175034  0.861053   
ZZEF1          39.032896   

Fitting size factors...
... done in 0.04 seconds.



In [None]:
# Export the combined DE results as CSV: per cell type, one file with all genes and one
# with the significant genes only.

# Significance thresholds for the "*_sig_deg.csv" files.
PADJ_MAX = 0.1       # adjusted p-value must be below this
ABS_LFC_MIN = 1      # |log2FoldChange| must be above this

de_res_all = contrast["de_res_all"]
cell_types = de_res_all["cell_type"].unique()

# `resDir` is never assigned in the cells above; write into the tables directory that the
# rest of the notebook uses.
out_dir = resDir_tables

# Loop through each cell type and save its data to CSV
for cell_type in cell_types:
    # Rows of the current cell type
    ct_all_deg = de_res_all[de_res_all["cell_type"] == cell_type]

    # Remove spaces from cell_type for the filename
    cell_type_filename = cell_type.replace(" ", "")

    # Full table. index=False drops the index on export; the gene names are still
    # written via the `feature_name` column that the DE loop adds.
    filepath_deg = os.path.join(out_dir, f"{cell_type_filename}_deg.csv")
    ct_all_deg.to_csv(filepath_deg, index=False)
    print(f"Saved full {cell_type} data to {filepath_deg}")

    # Significant genes only (rows with missing padj never pass the comparison).
    is_significant = (ct_all_deg['padj'] < PADJ_MAX) & (ct_all_deg['log2FoldChange'].abs() > ABS_LFC_MIN)
    filtered_df = ct_all_deg[is_significant]

    filepath_sig_deg = os.path.join(out_dir, f"{cell_type_filename}_sig_deg.csv")
    filtered_df.to_csv(filepath_sig_deg, index=False)
    print(f"Saved significant {cell_type} data to {filepath_sig_deg}")

In [None]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Volcano plots: one page per cell type, collected in a single multi-page PDF.
# Requires `contrast["de_res_all"]` from the DESeq2 loop.
cell_types = contrast["de_res_all"]["cell_type"].unique()

# The figures directory is not created anywhere else in the notebook.
os.makedirs(resDir_figures, mode=0o750, exist_ok=True)
volcano_pdf_path = Path(resDir_figures, "volcano_plots_all_cell_types.pdf")

with PdfPages(volcano_pdf_path) as pdf:
    for cell_type in cell_types:
        # Filter results for the current cell type
        results_df = contrast["de_res_all"][contrast["de_res_all"]["cell_type"] == cell_type]

        # Draw on an explicitly created figure. Calling plt.figure() and then letting
        # plot_volcano_df open its own figure left one empty figure unclosed per iteration.
        fig, ax = plt.subplots(figsize=(8, 4))
        dc.plot_volcano_df(
            results_df,
            x='log2FoldChange',
            y='padj',
            top=20,
            ax=ax,
        )

        # Set title to indicate cell type
        ax.set_title(f"Volcano Plot for Cell Type: {cell_type}")

        # Save exactly this figure as one PDF page, then release it.
        pdf.savefig(fig)
        plt.close(fig)

print("PDF with all volcano plots saved as 'volcano_plots_all_cell_types.pdf'")