# Pseudobulk & DEG Analysis

In [2]:
import scanpy as sc
import pandas as pd
import random
import numpy as np
import matplotlib as plt
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import gc 
import ctypes
import scvi

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
adata = sc.read_h5ad('/home/sharedFolder/int_fresh_start/descartes_ebru/Macs_from_LuCa_Projected_macs.h5ad')

In [4]:
count_matrix = sc.read_csv('/home/sharedFolder/int_fresh_start/descartes_ebru/samples_50_count_matrix.csv')

In [5]:
adata.obs["sample"]

001C_AAACCTGTCAACACCA-0    Adams_Kaminski_2020_001C
001C_AAACGGGAGGCTCATT-0    Adams_Kaminski_2020_001C
001C_AAACGGGGTATAATGG-0    Adams_Kaminski_2020_001C
001C_AAACGGGTCACAAACC-0    Adams_Kaminski_2020_001C
001C_AAAGATGAGTGCTGCC-0    Adams_Kaminski_2020_001C
                                     ...           
TTTGTCAAGCTGTCTA-1-38-8       Leader_Merad_2021_414
TTTGTCAAGGATATAC-1-38-8       Leader_Merad_2021_414
TTTGTCAAGTACGCGA-1-38-8       Leader_Merad_2021_414
TTTGTCACATCTATGG-1-38-8       Leader_Merad_2021_414
TTTGTCAGTGTTGGGA-1-38-8       Leader_Merad_2021_414
Name: sample, Length: 177939, dtype: category
Categories (414, object): ['Adams_Kaminski_2020_001C', 'Adams_Kaminski_2020_002C', 'Adams_Kaminski_2020_003C', 'Adams_Kaminski_2020_022C-a', ..., 'Zilionis_Klein_2019_p6t1', 'Zilionis_Klein_2019_p6t2', 'Zilionis_Klein_2019_p7t1', 'Zilionis_Klein_2019_p7t2']

In [6]:
count_matrix.obs_names

Index(['Adams_Kaminski_2020_001C', 'Adams_Kaminski_2020_002C',
       'Adams_Kaminski_2020_003C', 'Adams_Kaminski_2020_022C-a',
       'Adams_Kaminski_2020_022C-b', 'Adams_Kaminski_2020_034C',
       'Adams_Kaminski_2020_065C', 'Adams_Kaminski_2020_081C',
       'Adams_Kaminski_2020_084C', 'Adams_Kaminski_2020_092C',
       ...
       'Zilionis_Klein_2019_p3t3', 'Zilionis_Klein_2019_p4t1',
       'Zilionis_Klein_2019_p4t2', 'Zilionis_Klein_2019_p4t3',
       'Zilionis_Klein_2019_p5t1', 'Zilionis_Klein_2019_p5t2',
       'Zilionis_Klein_2019_p6t1', 'Zilionis_Klein_2019_p6t2',
       'Zilionis_Klein_2019_p7t1', 'Zilionis_Klein_2019_p7t2'],
      dtype='object', length=305)

In [7]:
# Subset koşulu: sadece belirtilen kategorilerdeki hücreleri seç
subset_condition = adata.obs['sample'].isin(count_matrix.obs_names)

# Anndata objesini subset yapma
adata_subset = adata[subset_condition].copy()

In [9]:
adata_subset.obs['TAMs'] = adata_subset.obs['Projection'].apply(lambda x: 'LA_TAMs' if x == 'LA_TAMs' else 'other_TAMs')

In [10]:
adata_subset.obs

Unnamed: 0,sample,uicc_stage,ever_smoker,age,donor_id,origin,dataset,ann_fine,cell_type_predicted,doublet_status,...,IFN_TAMs,Reg_TAMs,Inflam_TAMs,LA_TAMs,Angio_TAMs,RTM_TAMs,Prolif_TAMs,Subtype,Projection,TAMs
001C_AAACCTGTCAACACCA-0,Adams_Kaminski_2020_001C,non-cancer,no,22.0,Adams_Kaminski_2020_001C,normal,Adams_Kaminski_2020,Macrophage alveolar,Macrophage,singlet,...,0.383643,0.533944,0.308520,0.583037,0.433892,0.555335,0.364967,LA_TAMs,RTM_TAMs,other_TAMs
001C_AAACGGGAGGCTCATT-0,Adams_Kaminski_2020_001C,non-cancer,no,22.0,Adams_Kaminski_2020_001C,normal,Adams_Kaminski_2020,Macrophage,Macrophage,singlet,...,0.507515,0.386151,0.377544,0.579373,0.551071,0.497579,0.278150,LA_TAMs,Angio_TAMs,other_TAMs
001C_AAACGGGGTATAATGG-0,Adams_Kaminski_2020_001C,non-cancer,no,22.0,Adams_Kaminski_2020_001C,normal,Adams_Kaminski_2020,Macrophage alveolar,Macrophage,singlet,...,0.420621,0.479085,0.289331,0.468865,0.471209,0.519101,0.262251,RTM_TAMs,RTM_TAMs,other_TAMs
001C_AAACGGGTCACAAACC-0,Adams_Kaminski_2020_001C,non-cancer,no,22.0,Adams_Kaminski_2020_001C,normal,Adams_Kaminski_2020,Macrophage,Macrophage,singlet,...,0.343250,0.503152,0.267958,0.628695,0.457143,0.437045,0.394659,LA_TAMs,RTM_TAMs,other_TAMs
001C_AAAGATGAGTGCTGCC-0,Adams_Kaminski_2020_001C,non-cancer,no,22.0,Adams_Kaminski_2020_001C,normal,Adams_Kaminski_2020,Macrophage alveolar,Macrophage,singlet,...,0.372888,0.574821,0.290999,0.525428,0.410846,0.463959,0.291077,Reg_TAMs,RTM_TAMs,other_TAMs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGCTGTCTA-1-38-8,Leader_Merad_2021_414,II,no,64.0,Leader_Merad_2021_729,tumor_primary,Leader_Merad_2021_10x_3p_v2_beads,Macrophage,,singlet,...,0.542869,0.559937,0.458438,0.673661,0.525954,0.489744,0.481909,LA_TAMs,RTM_TAMs,other_TAMs
TTTGTCAAGGATATAC-1-38-8,Leader_Merad_2021_414,II,no,64.0,Leader_Merad_2021_729,tumor_primary,Leader_Merad_2021_10x_3p_v2_beads,Macrophage,,singlet,...,0.499064,0.641741,0.531002,0.647044,0.651234,0.581644,0.422489,Angio_TAMs,Angio_TAMs,other_TAMs
TTTGTCAAGTACGCGA-1-38-8,Leader_Merad_2021_414,II,no,64.0,Leader_Merad_2021_729,tumor_primary,Leader_Merad_2021_10x_3p_v2_beads,Macrophage,,singlet,...,0.524469,0.460086,0.555955,0.507163,0.513380,0.436031,0.325137,Inflam_TAMs,Angio_TAMs,other_TAMs
TTTGTCACATCTATGG-1-38-8,Leader_Merad_2021_414,II,no,64.0,Leader_Merad_2021_729,tumor_primary,Leader_Merad_2021_10x_3p_v2_beads,Macrophage,,singlet,...,0.563514,0.562356,0.549811,0.615895,0.546339,0.490394,0.429672,LA_TAMs,Reg_TAMs,other_TAMs


## Pseudo-bulk

In [11]:
#libraries
#If not loaded already:
!pip install scanpy
!pip install pandas
!pip install matplotlib
!pip install pydeseq2



In [12]:
!pip install openpyxl



In [13]:
import scanpy as sc
import pandas as pd
import random
import numpy as np
import matplotlib as plt
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import gc 
import ctypes

In [14]:
def make_pseudobulk(adata, sample_colname, cell_type_colname, metacols):
    '''
    To create an an data with pseudo-bulk gene expression profiles for the samples and cell_types given.
    The function also carries metadata columns from the obs. metacols: as a list of col names.
    This is how to use:
        ps_adata = make_pseudobulk(hlca, sample_colname = 'sample', cell_type_colname = 'cell_type', metacols= ['patient', 'disease', 'tissue', 'study', 'sex'])
    '''
    ps_list = []
    for s in adata.obs[sample_colname].unique():
        adata_sub = adata[adata.obs[sample_colname] == s]
        
        for ct in adata_sub.obs[cell_type_colname].unique():
            adata_sub_ct = adata_sub[adata_sub.obs[cell_type_colname] == ct]
            
            adata_rep = sc.AnnData(X = adata_sub_ct.X.sum(axis = 0),
                                   var = adata_sub_ct.var[[]])
            samp_name=adata_sub.obs[sample_colname].iloc[0]
            cell= adata_sub_ct.obs[cell_type_colname].iloc[0]
            adata_rep.obs_names = [f"{samp_name}{cell}"]
            adata_rep.obs[cell_type_colname] = adata_sub_ct.obs[cell_type_colname].iloc[0]
            adata_rep.obs[metacols] = adata_sub_ct.obs[metacols].iloc[0]
            ps_list.append(adata_rep)

    ps_adata = sc.concat(ps_list)
    
    return ps_adata

In [15]:
ps_adata = make_pseudobulk(adata_subset, sample_colname = 'sample', cell_type_colname = 'Projection', metacols= ['uicc_stage','origin','donor_id', 'disease', 'tissue', 'study', 'tumor_stage','Subtype','TAMs'])


In [19]:
ps_adata.obs

Unnamed: 0,Projection,uicc_stage,origin,donor_id,disease,tissue,study,tumor_stage,Subtype,TAMs
Adams_Kaminski_2020_001CRTM_TAMs,RTM_TAMs,non-cancer,normal,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,non-cancer,LA_TAMs,other_TAMs
Adams_Kaminski_2020_001CAngio_TAMs,Angio_TAMs,non-cancer,normal,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,non-cancer,LA_TAMs,other_TAMs
Adams_Kaminski_2020_001CReg_TAMs,Reg_TAMs,non-cancer,normal,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,non-cancer,Reg_TAMs,other_TAMs
Adams_Kaminski_2020_001CLA_TAMs,LA_TAMs,non-cancer,normal,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,non-cancer,LA_TAMs,LA_TAMs
Adams_Kaminski_2020_001CProlif_TAMs,Prolif_TAMs,non-cancer,normal,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,non-cancer,LA_TAMs,other_TAMs
...,...,...,...,...,...,...,...,...,...,...
Leader_Merad_2021_414Inflam_TAMs,Inflam_TAMs,II,tumor_primary,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,early,Angio_TAMs,other_TAMs
Leader_Merad_2021_414Reg_TAMs,Reg_TAMs,II,tumor_primary,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,early,Angio_TAMs,other_TAMs
Leader_Merad_2021_414IFN_TAMs,IFN_TAMs,II,tumor_primary,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,early,IFN_TAMs,other_TAMs
Leader_Merad_2021_414Prolif_TAMs,Prolif_TAMs,II,tumor_primary,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,early,Angio_TAMs,other_TAMs


In [18]:
import scanpy as sc

# Örnek veriyi oku (örneğin)
# adata = sc.read_h5ad('filename.h5ad')  # Kendi verini bu şekilde okuyabilirsin

# Satır isimlerini al (.obs_names)
obs_names = ps_adata.obs_names

# Çıkarmak istediğin karakter gruplarını belirle
remove_suffixes = ['Int.Node.3', 'Int.Node.4', 'Int.Node.5']

# Satır isimlerinin bu karakter gruplarıyla bitip bitmediğini kontrol et
to_remove = obs_names.str.endswith(tuple(remove_suffixes))

# Çıkarılacak satırları veri setinden çıkarmak için tersini al
filtered_adata = ps_adata[~to_remove].copy()

# Sonucu kontrol et
print(filtered_adata)

AnnData object with n_obs × n_vars = 1930 × 17797
    obs: 'Projection', 'uicc_stage', 'origin', 'donor_id', 'disease', 'tissue', 'study', 'tumor_stage', 'Subtype', 'TAMs'


In [20]:
filtered_adata.obs

Unnamed: 0,Projection,uicc_stage,origin,donor_id,disease,tissue,study,tumor_stage,Subtype,TAMs
Adams_Kaminski_2020_001CRTM_TAMs,RTM_TAMs,non-cancer,normal,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,non-cancer,LA_TAMs,other_TAMs
Adams_Kaminski_2020_001CAngio_TAMs,Angio_TAMs,non-cancer,normal,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,non-cancer,LA_TAMs,other_TAMs
Adams_Kaminski_2020_001CReg_TAMs,Reg_TAMs,non-cancer,normal,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,non-cancer,Reg_TAMs,other_TAMs
Adams_Kaminski_2020_001CLA_TAMs,LA_TAMs,non-cancer,normal,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,non-cancer,LA_TAMs,LA_TAMs
Adams_Kaminski_2020_001CProlif_TAMs,Prolif_TAMs,non-cancer,normal,Adams_Kaminski_2020_001C,normal,lung,Adams_Kaminski_2020,non-cancer,LA_TAMs,other_TAMs
...,...,...,...,...,...,...,...,...,...,...
Leader_Merad_2021_414LA_TAMs,LA_TAMs,II,tumor_primary,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,early,LA_TAMs,LA_TAMs
Leader_Merad_2021_414Inflam_TAMs,Inflam_TAMs,II,tumor_primary,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,early,Angio_TAMs,other_TAMs
Leader_Merad_2021_414Reg_TAMs,Reg_TAMs,II,tumor_primary,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,early,Angio_TAMs,other_TAMs
Leader_Merad_2021_414IFN_TAMs,IFN_TAMs,II,tumor_primary,Leader_Merad_2021_729,lung adenocarcinoma,lung,Leader_Merad_2021,early,IFN_TAMs,other_TAMs


In [22]:
import scanpy as sc
import numpy as np

# Eğer X bir numpy matrix ise bunu numpy ndarray'e dönüştür
if isinstance(filtered_adata.X, np.matrix):
    filtered_adata.X = np.array(filtered_adata.X)

# H5AD dosyasını kaydet
filtered_adata.write('pseudobulk_data.h5ad')


## DEG Analysis

In [38]:
import scanpy as sc
import pandas as pd
import random
import numpy as np
import matplotlib as plt
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

In [39]:
import scipy.sparse

In [25]:
import pandas as pd
import numpy as np

# Hücre isimleri (her gözlemin adı)
cell_names = filtered_adata.obs_names

# Gen isimleri (her özelliğin adı, genler)
gene_names = filtered_adata.var_names

# X matrisini (count verilerini) al ve numpy array'e çevir
counts_matrix = filtered_adata.X.toarray() if scipy.sparse.issparse(filtered_adata.X) else filtered_adata.X

# Bunu pandas DataFrame olarak düzenleyelim
df_counts = pd.DataFrame(counts_matrix, index=cell_names, columns=gene_names)

# İlk birkaç satırı görelim
print(df_counts.head())

                                     ENSG00000121410  ENSG00000268895  \
Adams_Kaminski_2020_001CRTM_TAMs           10.684656              0.0   
Adams_Kaminski_2020_001CAngio_TAMs          0.000000              0.0   
Adams_Kaminski_2020_001CReg_TAMs            4.059573              0.0   
Adams_Kaminski_2020_001CLA_TAMs             1.036954              0.0   
Adams_Kaminski_2020_001CProlif_TAMs         0.881562              0.0   

                                     ENSG00000175899  ENSG00000245105  \
Adams_Kaminski_2020_001CRTM_TAMs           18.892044         0.000000   
Adams_Kaminski_2020_001CAngio_TAMs          0.000000         0.000000   
Adams_Kaminski_2020_001CReg_TAMs           15.684467         0.422638   
Adams_Kaminski_2020_001CLA_TAMs             6.067989         0.000000   
Adams_Kaminski_2020_001CProlif_TAMs         0.000000         0.000000   

                                     ENSG00000166535  ENSG00000128274  \
Adams_Kaminski_2020_001CRTM_TAMs            3.897

In [26]:
counts_df = pd.DataFrame(counts_matrix, index=cell_names, columns=gene_names)

metadata = pd.DataFrame(filtered_adata.obs)

print(counts_df)
print(metadata)

                                     ENSG00000121410  ENSG00000268895  \
Adams_Kaminski_2020_001CRTM_TAMs           10.684656         0.000000   
Adams_Kaminski_2020_001CAngio_TAMs          0.000000         0.000000   
Adams_Kaminski_2020_001CReg_TAMs            4.059573         0.000000   
Adams_Kaminski_2020_001CLA_TAMs             1.036954         0.000000   
Adams_Kaminski_2020_001CProlif_TAMs         0.881562         0.000000   
...                                              ...              ...   
Leader_Merad_2021_414LA_TAMs               11.553024         1.428158   
Leader_Merad_2021_414Inflam_TAMs            0.738252         0.000000   
Leader_Merad_2021_414Reg_TAMs               4.607873         0.897492   
Leader_Merad_2021_414IFN_TAMs               0.342176         0.000000   
Leader_Merad_2021_414Prolif_TAMs            0.278708         0.000000   

                                     ENSG00000175899  ENSG00000245105  \
Adams_Kaminski_2020_001CRTM_TAMs           18.8920

In [52]:

counts_df = counts_df.loc[:, (counts_df != 0).any()]  # Sadece herhangi bir 0 olmayan değeri olan kolonları tutar
counts_df = counts_df.replace(0, 1e-6)
counts_df = counts_df.astype(int)

In [53]:
print(counts_df)

                                     ENSG00000121410  ENSG00000268895  \
Adams_Kaminski_2020_001CRTM_TAMs                  10                0   
Adams_Kaminski_2020_001CAngio_TAMs                 0                0   
Adams_Kaminski_2020_001CReg_TAMs                   4                0   
Adams_Kaminski_2020_001CLA_TAMs                    1                0   
Adams_Kaminski_2020_001CProlif_TAMs                0                0   
...                                              ...              ...   
Leader_Merad_2021_414LA_TAMs                      11                1   
Leader_Merad_2021_414Inflam_TAMs                   0                0   
Leader_Merad_2021_414Reg_TAMs                      4                0   
Leader_Merad_2021_414IFN_TAMs                      0                0   
Leader_Merad_2021_414Prolif_TAMs                   0                0   

                                     ENSG00000175899  ENSG00000245105  \
Adams_Kaminski_2020_001CRTM_TAMs                  

In [62]:
(counts_df == 0).all(axis=1).value_counts()


False    1930
Name: count, dtype: int64

In [51]:
#inference = DefaultInference(n_cpus=8)
dds = DeseqDataSet(
    counts=counts_df,
    metadata=metadata,
    design_factors="TAMs",
    refit_cooks=True,
    #inference= inference
    #n_cpus=8, # n_cpus can be specified here or in the inference object
)

ValueError: The count matrix should only contain integers.

In [48]:
dds.deseq2()


Fitting size factors...
  self.fit_size_factors()
Fitting dispersions...
... done in 5.90 seconds.

Fitting MAP dispersions...
... done in 6.44 seconds.



KeyboardInterrupt: 

In [None]:
print(dds)

Statistical Analysis with dds;

In [36]:
stat_res = DeseqStats(dds)

AssertionError: Please provide a fitted DeseqDataSet by first running the `deseq2` method.