# 1. NSCLC sex-stratified single-cell (SC) analysis

## Data selection, pseudobulk aggregation, and creation of the input for DESeq2

In [1]:
import warnings
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
#import tensorflow as tf
#import altair as alt
#import pertpy as pt
#from pandas.api.types import is_categorical_dtype
#from pandas import CategoricalDtype

In [2]:
import decoupler as dc

In [3]:
# Project root; every input and output location used in this notebook is derived from it.
path = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp"
# .h5ad file that is read into `adata` in the next cell.
input_path = path + "/data/local.h5ad"

In [None]:
# Load the single-cell AnnData object from the configured input file.
adata = sc.read_h5ad(input_path)

# Per-gene chromosome table (CSV); it is joined onto the gene annotation further down
# to split genes into X, Y and remaining chromosomes.
chrom_table_file = f"{path}/out/007_re_analysis/tables/input/adata_var_nsclc_chrom.csv"
nsclc_chrom = pd.read_csv(chrom_table_file)

In [None]:
adata

In [None]:
adata.obs.disease.value_counts()

In [None]:
adata.obs.tumor_stage.value_counts()

In [None]:
adata.obs.sex.value_counts()

In [None]:
adata.obs.origin.value_counts()

### 01. Select only male/female donors and tumor_primary/normal_adjacent samples

In [None]:
# One boolean mask per criterion, combined and applied in a single subsetting step.
excluded_diseases = ["normal", "chronic obstructive pulmonary disease"]
keep_disease = ~adata.obs["disease"].isin(excluded_diseases)  # exclude normal and COPD samples
keep_staged = adata.obs["tumor_stage"].notna()  # exclude samples without tumor stage
keep_sex = adata.obs["sex"].isin(["male", "female"])  # keep only male / female donors
adata = adata[keep_disease & keep_staged & keep_sex]

In [None]:
# Keep only cells from primary tumor or adjacent normal tissue.
adata = adata[adata.obs["origin"].isin(["tumor_primary", "normal_adjacent"])]

In [None]:
adata.obs.disease.value_counts()

In [None]:
adata.obs.tumor_stage.value_counts()

In [None]:
adata.obs.sex.value_counts()

In [None]:
adata.obs.origin.value_counts()

### 02. CELL TYPE RE-CLASSIFICATION

In [None]:
adata.obs.cell_type.value_counts()

In [None]:
adata.obs.cell_type_major.value_counts()

In [None]:
# Cell-type labels (values of adata.obs["cell_type"]) retained for the analysis.
# NOTE(review): both 'neutrophil' and 'neutrophils' are listed — confirm which
# spelling actually occurs in the atlas; the other entry is a harmless no-op.
cells_of_interest = [
    'alveolar macrophage',
    'neutrophil',
    'macrophage',
    'malignant cell',
    'B cell',
    'regulatory T cell',
    'CD4-positive, alpha-beta T cell',
    'CD8-positive, alpha-beta T cell',
    'neutrophils',
    'epithelial cell of lung',
    'multi-ciliated epithelial cell',
]

# Take an explicit copy instead of a view: later cells assign to
# subset_adata.obs / subset_adata.var, and writing to an AnnData view raises an
# ImplicitModificationWarning and copies behind the scenes anyway.
subset_adata = adata[adata.obs["cell_type"].isin(cells_of_interest)].copy()


In [None]:
set(subset_adata.obs.cell_type)

In [None]:
set(subset_adata.obs.cell_type_major)

In [None]:
# Collapse fine-grained labels into the coarser groups used downstream.
cell_type_renames = {
    'epithelial cell of lung': 'epithelial cell',
    'multi-ciliated epithelial cell': 'epithelial cell',
    'alveolar macrophage': 'macrophage',
    'CD4-positive, alpha-beta T cell': 'CD4+ T cell',
    'CD8-positive, alpha-beta T cell': 'CD8+ T cell',
}

# Relabel on plain strings and re-cast to category afterwards. Calling
# Series.replace directly on a categorical column in a way that changes its
# categories is deprecated in recent pandas; going through str works on every
# version and also drops categories that no longer occur in the subset.
subset_adata.obs["cell_type"] = (
    subset_adata.obs["cell_type"]
    .astype(str)
    .replace(cell_type_renames)
    .astype("category")
)

In [None]:
set(subset_adata.obs.cell_type)

In [None]:
subset_adata

In [None]:
adata

Number of cells: 
- Original adata 1283972
- Adata after filtering for disease, sex, origin, tumor stage  887157
- Subset adata for cells of interest and renaming 638804

- The chromosome of each gene was retrieved in `00_get_chromosome`; that annotation is appended to `subset_adata.var` below.

- Compute DEGs (male vs female).
  Split the genes into three AnnData objects: autosomal_adata, xchrom_adata, ychrom_adata.
- Create a bar plot comparing the number of DEGs in TUMOR vs NORMAL.

### 03.Get X, Y and autosomal genes

In [None]:
# Index the chromosome table by gene_id and keep gene_id as a regular column too.
# The guard makes the cell safe to re-run: without it, the positional
# `.iloc[:, 1:]` would strip one more real column on every execution.
if nsclc_chrom.index.name != "gene_id":
    nsclc_chrom = (
        nsclc_chrom
        .rename(columns={"ensembl_gene_id": "gene_id"})
        # Drop the first column by position — presumably the row-number column
        # written by an earlier to_csv; TODO confirm against the CSV header.
        .iloc[:, 1:]
        .set_index("gene_id")
        # Re-add gene_id as the last column (same layout as before).
        .assign(gene_id=lambda table: table.index)
    )

In [None]:
subset_adata.var["gene_id"] = subset_adata.var_names

# Annotation columns only: gene_id already exists in var, and concatenating it
# again would create a duplicated column name.
chrom_annotation = nsclc_chrom.drop(columns=["gene_id"], errors="ignore")
if not chrom_annotation.index.is_unique:
    raise ValueError(
        "nsclc_chrom contains duplicated gene IDs; cannot align it to subset_adata.var"
    )

# Left join on the gene index: keeps every gene of subset_adata in its original
# order. An outer pd.concat(axis=1) may reorder or extend the index, and AnnData
# adopts the index of the assigned frame as var_names, which would silently
# mislabel genes. Annotation columns left over from a previous run of this cell
# are dropped first so the cell can be re-executed.
subset_adata_var_chrom = (
    subset_adata.var
    .drop(columns=chrom_annotation.columns, errors="ignore")
    .join(chrom_annotation, how="left")
)
subset_adata.var = subset_adata_var_chrom

In [None]:
# Cast chromosome names to plain strings so the 'X' / 'Y' comparisons below work on str values.
# NOTE(review): astype(str) turns missing annotations into the literal string "nan";
# such genes then fall into the "neither X nor Y" group — confirm this is intended.
subset_adata.var["chromosome_name"]  = subset_adata.var["chromosome_name"].astype(str)

In [None]:
# Boolean gene mask: True for genes annotated on chromosome X.
subset_indices = subset_adata.var['chromosome_name'].eq('X')

# Restrict the gene axis to the X-linked genes.
xchrom_adata = subset_adata[:, subset_indices]

In [None]:
# Boolean gene mask: True for genes annotated on chromosome Y.
subset_indices = subset_adata.var['chromosome_name'].eq('Y')

# Restrict the gene axis to the Y-linked genes.
ychrom_adata = subset_adata[:, subset_indices]

In [None]:
# Boolean gene mask: True for every gene that is on neither X nor Y.
# Note that this keeps any chromosome label other than 'X' / 'Y', not only 1-22.
subset_indices = ~subset_adata.var['chromosome_name'].isin(['X', 'Y'])

# Restrict the gene axis to the non-sex-chromosome genes.
autosomal_adata = subset_adata[:, subset_indices]

In [None]:
#xchrom_adata.var.to_csv(f"{path}/out/007_re_analysis/tables/input/xchrom_adata_var.csv")
#ychrom_adata.var.to_csv(f"{path}/out/007_re_analysis/tables/input/ychrom_adata_var.csv")
#autosomal_adata.var.to_csv(f"{path}/out/007_re_analysis/tables/input/autosomal_adata_var.csv")

In [42]:
xchrom_adata.shape

(638804, 618)

In [43]:
ychrom_adata.shape

(638804, 16)

In [41]:
autosomal_adata.shape

(638804, 17177)

### 04.Compute pseudobulk

In [45]:
# Get pseudo-bulk profile
pdata = dc.get_pseudobulk(
    subset_adata,
    sample_col='sample',
    groups_col='donor_id',
    layer='count',
    min_cells=0,
    min_counts=0
)

In [81]:
# Column-wise concatenation of the pseudobulk gene table and the chromosome table,
# aligned on their indexes.
pdata_var_concat = pd.concat(objs=[pdata.var, nsclc_chrom], axis="columns")

In [82]:
pdata_var_concat

Unnamed: 0_level_0,ensembl_gene_id_version,chromosome_name,start_position,end_position,gene_id
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000419,ENSG00000000419.14,20,50934867.0,50959140.0,ENSG00000000419
ENSG00000000457,ENSG00000000457.14,1,169849631.0,169894267.0,ENSG00000000457
ENSG00000000460,ENSG00000000460.17,1,169662007.0,169854080.0,ENSG00000000460
ENSG00000000938,ENSG00000000938.13,1,27612064.0,27635185.0,ENSG00000000938
ENSG00000000971,ENSG00000000971.17,1,196651754.0,196752476.0,ENSG00000000971
...,...,...,...,...,...
ENSG00000286522,ENSG00000286522.2,6,26031589.0,26032099.0,ENSG00000286522
ENSG00000287080,ENSG00000287080.2,6,26045384.0,26045869.0,ENSG00000287080
ENSG00000287151,ENSG00000287151.1,2,131647990.0,131767404.0,ENSG00000287151
ENSG00000288649,ENSG00000288649.2,20,33666943.0,33668525.0,ENSG00000288649


In [53]:
# Give these two sample-metadata columns explicit dtypes.
obs_dtypes = {"age": "float64", "is_primary_data": bool}
for obs_column, target_dtype in obs_dtypes.items():
    pdata.obs[obs_column] = pdata.obs[obs_column].astype(target_dtype)


In [8]:
# Cache of the pseudobulk object. A single path is used for both writing and
# reading (previously the commented-out write and the read pointed to different
# folders, so the file being read was never produced by this notebook).
# - cache present: load it (same behaviour as before);
# - cache absent:  store the pseudobulk object computed above so that later runs
#                  can start from this cell.
pdata_path = Path(f"{path}/out/007_re_analysis/tables/input/pdata.h5ad")
if pdata_path.exists():
    pdata = sc.read_h5ad(pdata_path)
else:
    pdata.write_h5ad(pdata_path)

In [9]:
pdata

AnnData object with n_obs × n_vars = 289 × 9304
    obs: 'sample', 'uicc_stage', 'ever_smoker', 'age', 'donor_id', 'origin', 'dataset', 'doublet_status', 'tumor_stage', 'EGFR_mutation', 'TP53_mutation', 'ALK_mutation', 'BRAF_mutation', 'ERBB2_mutation', 'KRAS_mutation', 'ROS_mutation', 'origin_fine', 'study', 'platform', 'suspension_type', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'

In [11]:
# Preserve the raw pseudobulk counts in their own layer. Store a copy, not a
# reference to pdata.X, so any later in-place transformation of X cannot alter
# the saved raw counts.
pdata.layers["counts"] = pdata.X.copy()

In [12]:
# Total-count normalisation without touching pdata.X: with inplace=False the result
# is returned, and the normalised matrix is read from its "X" entry below.
# NOTE(review): target_sum=None relies on scanpy's default scaling target — confirm
# that this is the intended normalisation for pseudobulk samples.
scales_counts = sc.pp.normalize_total(pdata, target_sum=None, inplace=False)
# log1p transform
pdata.layers["log1p_norm"] = sc.pp.log1p(scales_counts["X"], copy=True)

In [13]:
pdata

AnnData object with n_obs × n_vars = 289 × 9304
    obs: 'sample', 'uicc_stage', 'ever_smoker', 'age', 'donor_id', 'origin', 'dataset', 'doublet_status', 'tumor_stage', 'EGFR_mutation', 'TP53_mutation', 'ALK_mutation', 'BRAF_mutation', 'ERBB2_mutation', 'KRAS_mutation', 'ROS_mutation', 'origin_fine', 'study', 'platform', 'suspension_type', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    layers: 'counts', 'log1p_norm'

In [5]:
### Samplesheet
samplesheet = pdata.obs.copy()
samplesheet["sample"]=samplesheet.index
samplesheet.reset_index(inplace=True)

In [5]:
### Samplesheet
samplesheet = pdata_normal.obs.copy()
samplesheet["sample"]=samplesheet.index
samplesheet.reset_index(inplace=True)

In [6]:
# Write the samplesheet twice: comma-separated (.csv) and tab-separated (.tsv).
samplesheet_stem = f"{path}/out/007_re_analysis/tables/input/samplesheet"
for extension, separator in ((".csv", ","), (".tsv", "\t")):
    samplesheet.to_csv(samplesheet_stem + extension, index=False, sep=separator)

In [8]:
# Pseudobulk matrix as a DataFrame, transposed so that genes are rows and samples are columns.
bulk_df = pdata.to_df().transpose()

In [17]:
### Raw counts
bulk_df = pdata.to_df().T
#bulk_df["gene_id"]=bulk_df.index
bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")

  bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")


In [18]:
pdata.to_df()

gene_id,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001461,ENSG00000001497,...,ENSG00000279170,ENSG00000280789,ENSG00000281103,ENSG00000281649,ENSG00000282851,ENSG00000284194,ENSG00000284770,ENSG00000285077,ENSG00000288701,ENSG00000288722
Goveia_Carmeliet_2020_patient_40_normal_adjacent_Goveia_Carmeliet_2020_patient_40,695.0,0.0,0.0,2622.0,0.0,687.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Goveia_Carmeliet_2020_patient_40_tumor_primary_Goveia_Carmeliet_2020_patient_40,343.0,0.0,0.0,628.0,0.0,288.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Goveia_Carmeliet_2020_patient_41_normal_adjacent_Goveia_Carmeliet_2020_patient_41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Goveia_Carmeliet_2020_patient_41_tumor_primary_Goveia_Carmeliet_2020_patient_41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Goveia_Carmeliet_2020_patient_42_normal_adjacent_Goveia_Carmeliet_2020_patient_42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zilionis_Klein_2019_p5t2_Zilionis_Klein_2019_patient_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zilionis_Klein_2019_p6t1_Zilionis_Klein_2019_patient_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zilionis_Klein_2019_p6t2_Zilionis_Klein_2019_patient_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zilionis_Klein_2019_p7t1_Zilionis_Klein_2019_patient_7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Export the current `bulk_df` as the count matrix.
# NOTE(review): `bulk_df` is reassigned by several cells (raw counts and log1p-normalised
# values); make sure the raw-count cell was the last one executed before running this.
bulk_df.to_csv(f"{path}/out/007_re_analysis/tables/input/counts.csv")    

In [45]:
### log1p_norm counts
pdata_log1p_norm = pd.DataFrame(pdata.layers["log1p_norm"], 
    pdata.to_df().index, 
    pdata.to_df().columns)
bulk_df = pdata_log1p_norm.T
#bulk_df["gene_id"]=bulk_df.index
bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")
bulk_df.to_csv(f"{path}/out/007_re_analysis/tables/input/log1p_norm_counts.csv")    

  bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")


### 05. Create separate counts and samplesheets for tumor and normal samples for DESeq2

In [46]:
# Split the pseudobulk samples by tissue of origin.
origin_labels = pdata.obs["origin"]
pdata_normal = pdata[origin_labels == "normal_adjacent"]
pdata_tumor = pdata[origin_labels == "tumor_primary"]

In [47]:
pdata_normal

View of AnnData object with n_obs × n_vars = 111 × 9304
    obs: 'sample', 'uicc_stage', 'ever_smoker', 'age', 'donor_id', 'origin', 'dataset', 'doublet_status', 'tumor_stage', 'EGFR_mutation', 'TP53_mutation', 'ALK_mutation', 'BRAF_mutation', 'ERBB2_mutation', 'KRAS_mutation', 'ROS_mutation', 'origin_fine', 'study', 'platform', 'suspension_type', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    layers: 'counts', 'log1p_norm'

In [48]:
# Export, separately for normal and tumor samples, the samplesheet (csv + tsv),
# the raw count matrix and the log1p-normalised matrix.
pdata_list = [pdata_normal, pdata_tumor]
pdata_list_name = ["normal", "tumor"]

out_dir = f"{path}/out/007_re_analysis/tables/input"

for name, dataframe in zip(pdata_list_name, pdata_list):
    print(name)

    # Samplesheet: `sample` holds the pseudobulk row identifier.
    samplesheet = dataframe.obs.copy()
    samplesheet["sample"] = samplesheet.index
    samplesheet = samplesheet.reset_index()

    samplesheet.to_csv(f"{out_dir}/samplesheet_{name}.csv", index=False)
    samplesheet.to_csv(f"{out_dir}/samplesheet_{name}.tsv", index=False, sep="\t")

    # Sanitised sample names shared by both matrices: '.', '-' and ' ' become '_'.
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, so "[.-]" would match nothing.
    sample_names = (
        dataframe.obs_names
        .str.replace("[.-]", "_", regex=True)
        .str.replace(" ", "_", regex=False)
    )

    ### Raw counts (genes as rows, samples as columns)
    bulk_df = dataframe.to_df().T
    bulk_df.columns = sample_names
    bulk_df.to_csv(f"{out_dir}/counts_{name}.csv")

    ### log1p_norm counts (genes as rows, samples as columns)
    pdata_log1p_norm = pd.DataFrame(
        dataframe.layers["log1p_norm"],
        index=dataframe.obs_names,
        columns=dataframe.var_names,
    )
    bulk_df = pdata_log1p_norm.T
    bulk_df.columns = sample_names
    bulk_df.to_csv(f"{out_dir}/log1p_norm_counts_{name}.csv")

normal


  bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")
  bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")


tumor


  bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")
  bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")
