# 1.NSCLC Sex-stratified SC Analysis

## Data selection, pseudobulk, create input for DESEQ2

In [1]:
import warnings
import numpy as np
import pandas as pd 
import scanpy as sc
#import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
#import altair as alt
#import pertpy as pt
#from pandas.api.types import is_categorical_dtype
#from pandas import CategoricalDtype

In [2]:
import decoupler as dc

In [3]:
#Set result directory for figures
resDir = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/011_analysis_paired_remove_xy/figures/"
sc.settings.figdir = resDir

In [4]:
dc.__version__

'1.5.0'

In [5]:
path = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp"
input_path    = f"{path}/data/local.h5ad"

In [None]:
adata = sc.read_h5ad(input_path) 

In [None]:
adata.obs.dataset.value_counts()

In [None]:
nsclc_chrom = pd.read_csv(f"{path}/out/007_re_analysis/tables/input/adata_var_nsclc_chrom.csv")

In [None]:
adata

In [None]:
adata.obs.disease.value_counts()

In [None]:
adata.obs.tumor_stage.value_counts()

In [None]:
adata.obs.sex.value_counts()

In [None]:
adata.obs.origin.value_counts()

In [None]:
adata.obs.dataset.value_counts()

### 01.SELECT ONLY male, female & tumor, normal_adjacent

In [None]:
adata = adata[(adata.obs["disease"] != "normal") & (adata.obs["disease"] != "chronic obstructive pulmonary disease")] #exclude normal and COPD samples

In [None]:
adata = adata[adata.obs["tumor_stage"].notna()] #exclude samples without tumor stage

In [None]:
adata = adata[(adata.obs["sex"]=="male")| (adata.obs["sex"]=="female")]

In [None]:
adata = adata[(adata.obs["origin"]=="tumor_primary")| (adata.obs["origin"]=="normal_adjacent")]

In [None]:
adata.obs.disease.value_counts()

In [None]:
adata.obs.tumor_stage.value_counts()

In [None]:
adata.obs.sex.value_counts()

In [None]:
adata.obs.origin.value_counts()

In [None]:
adata.obs.dataset.value_counts()

### 02.CELL TYPE RE-CLASSIFICATION

In [None]:
adata.obs.cell_type.value_counts()

In [None]:
adata.obs.cell_type_major.value_counts()

In [None]:
cells_of_interest=['alveolar macrophage', 'neutrophil','dendritic cell',
 'macrophage','malignant cell',
 'B cell',
  'regulatory T cell',
'CD4-positive, alpha-beta T cell',
 'CD8-positive, alpha-beta T cell',
 'neutrophils','epithelial cell of lung','multi-ciliated epithelial cell' ]
subset_adata = adata[adata.obs["cell_type"].isin(cells_of_interest)] 


In [None]:
set(subset_adata.obs.cell_type)

In [None]:
set(subset_adata.obs.cell_type_major)

In [None]:
subset_adata.obs["cell_type"]= subset_adata.obs["cell_type"].replace(['epithelial cell of lung','multi-ciliated epithelial cell',], 'epithelial cell')
subset_adata.obs["cell_type"]= subset_adata.obs["cell_type"].replace(['alveolar macrophage'], 'macrophage')
subset_adata.obs["cell_type"]= subset_adata.obs["cell_type"].replace(['CD4-positive, alpha-beta T cell'], 'CD4+ T cell')
subset_adata.obs["cell_type"]= subset_adata.obs["cell_type"].replace(['CD8-positive, alpha-beta T cell'], 'CD8+ T cell')

In [None]:
set(subset_adata.obs.cell_type)

In [None]:
subset_adata

In [None]:
subset_adata.obs.disease.value_counts()

In [None]:
subset_adata.obs.dataset.value_counts()

In [None]:
subset_adata.obs.dataset.value_counts()

## PAIRED SAMPLES

In [None]:
df_grouped = subset_adata.obs.groupby(["origin","donor_id"]).count()

In [None]:
df_grouped.reset_index(inplace= True)

In [None]:
df_grouped_normal = df_grouped[df_grouped["origin"]=="normal_adjacent"]

In [None]:
df_grouped_normal = df_grouped_normal[df_grouped_normal['sample'] != 0]

In [None]:
df_grouped_tumor = df_grouped[df_grouped["origin"]=="tumor_primary"]

In [None]:
df_grouped_tumor = df_grouped_tumor[df_grouped_tumor['sample'] != 0]

In [None]:
df_grouped_tumor

In [None]:
common_ids =list(set(df_grouped_normal.donor_id)& set(df_grouped_tumor.donor_id))

In [None]:
len(common_ids)

In [None]:
paired_adata = subset_adata[subset_adata.obs['donor_id'].isin(common_ids)].copy()

In [None]:
paired_adata

In [None]:
df = paired_adata.obs#
origin_counts = df.groupby('origin')['donor_id'].nunique().reset_index()
origin_counts.columns = ['origin', 'count']

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(data=origin_counts, x='origin', y='count', palette='viridis')
plt.xlabel('Origin')
plt.ylabel('Number of Unique Donor IDs')
plt.title('Number of Unique Donor IDs per Origin')
plt.show()

In [None]:
df = paired_adata.obs#
origin_counts = df.groupby('disease')['donor_id'].nunique().reset_index()
origin_counts.columns = ['disease', 'count']

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(data=origin_counts, x='disease', y='count', palette='viridis')
plt.xlabel('disease')
plt.ylabel('Number of Unique Donor IDs')
plt.title('Number of Unique Donor IDs per disease')
plt.show()

In [None]:
paired_adata.obs.disease.value_counts()

In [None]:
paired_adata.obs.tumor_stage.value_counts()

In [None]:
paired_adata.obs.sex.value_counts()

In [None]:
paired_adata.obs.origin.value_counts()

In [None]:
paired_adata.obs.dataset.value_counts()

In [None]:
paired_adata

## Create the log1p_norm layer

In [None]:
# Build a "log1p_norm" layer from a copy of X; both calls below are given
# layer="log1p_norm".
# NOTE(review): this assumes X holds raw counts — TODO confirm.
paired_adata.layers["log1p_norm"] = paired_adata.X.copy()
# Scale every cell in the layer to a total of 1e6
sc.pp.normalize_total(paired_adata, target_sum=1e6, layer="log1p_norm")
# NOTE(review): base=6 is an unusual logarithm base — confirm it is intended
# and not a leftover of the 1e6 target above.
sc.pp.log1p(paired_adata, base=6, layer="log1p_norm")

In [None]:
paired_adata

## Checking for XIST & RPS4Y1 in male and female

In [None]:
paired_adata.var_names = paired_adata.var.feature_name

In [None]:
# Now, generate the plot and save it
sc.pl.violin(paired_adata, ["RPS4Y1", "XIST"], use_raw=False, groupby="sex", layer="log1p_norm")#, save="_xy_check.png")

In [None]:
rps4y1_cells = paired_adata[:, paired_adata.var_names == "RPS4Y1"].X > 0
# Subset the adata object to only those cells
rps4y1_paired_adata_subset = paired_adata[rps4y1_cells, :]

In [None]:
rps4y1_paired_adata_subset.obs.sex.value_counts()

In [None]:
xist_cells = paired_adata[:, paired_adata.var_names == "XIST"].X > 0
# Subset the adata object to only those cells
xist_paired_adata_subset = paired_adata[xist_cells, :]

In [None]:
xist_paired_adata_subset.obs.sex.value_counts()

In [None]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [None]:
rps4y1_paired_adata_subset.obs.groupby(["sex","donor_id"]).count()

In [None]:
xist_paired_adata_subset.obs.groupby(["sex","donor_id"]).count()

These patients express X- and Y-chromosome-related genes opposite to what their sex label indicates.

* rps4y1 "Lambrechts_Thienpont_2018_6653_8", "Leader_Merad_2021_406"
* xist "Lambrechts_Thienpont_2018_6653_6", "Leader_Merad_2021_581"
- Option 1. Relabel as opposite (male & female)
- Option 2. Remove to avoid mistakes




## Remove these donor id 

In [None]:
paired_adata_clean = paired_adata[~paired_adata.obs["donor_id"].isin(["Lambrechts_Thienpont_2018_6653_8", "Leader_Merad_2021_406", "Lambrechts_Thienpont_2018_6653_6", "Leader_Merad_2021_581"])]

In [None]:
paired_adata

In [None]:
paired_adata_clean

In [None]:
rps4y1_cells = paired_adata_clean[:, paired_adata_clean.var_names == "RPS4Y1"].X > 0

# Subset the adata object to only those cells
rps4y1_paired_adata_subset = paired_adata_clean[rps4y1_cells, :]

In [None]:
rps4y1_paired_adata_subset.obs.sex.value_counts()

In [None]:
xist_cells = paired_adata_clean[:, paired_adata_clean.var_names == "XIST"].X > 0

# Subset the adata object to only those cells
xist_paired_adata_subset = paired_adata_clean[xist_cells, :]

In [None]:
xist_paired_adata_subset.obs.sex.value_counts()

In [None]:
rps4y1_paired_adata_subset.obs.groupby(["sex","donor_id"]).count()

In [None]:
xist_paired_adata_subset.obs.groupby(["sex","donor_id"]).count()

In [None]:
# Now, generate the plot and save it
sc.pl.violin(paired_adata_clean, ["RPS4Y1", "XIST"], use_raw=False, groupby="sex", layer="log1p_norm", save="_xy_check.png")

## Check X and Y chromosome related genes

In [None]:
x_genes =  ["XIST","MECP2","DMD","FMR1","G6PD"]
y_genes = ["SRY","TSPY","DAZ","RPS4Y1","RPS4Y2","ZFY","UTY"]

In [None]:
sc.pl.violin(paired_adata_clean, x_genes, use_raw=False, groupby="sex", layer="log1p_norm")

In [None]:
sc.pl.violin(paired_adata_clean, y_genes, use_raw=False, groupby="sex", layer="log1p_norm")

## Remove XIST & RPS4Y1 genes 

Not sure

SRY – Sex-determining region Y: Critical for male sex determination.
TSPY – Testis-specific protein Y-encoded: Expressed in the testis.
DAZ – Deleted in azoospermia: Involved in spermatogenesis.
RPS4Y1 – Ribosomal protein S4, Y-linked 1: Y-linked variant of the ribosomal protein S4.
RPS4Y2 – Ribosomal protein S4, Y-linked 2.
ZFY – Zinc finger protein, Y-linked.
UTY – Ubiquitously transcribed tetratricopeptide repeat containing Y-linked.

XIST – X-inactive specific transcript: Involved in X chromosome inactivation in females.
MECP2 – Methyl CpG binding protein 2: Mutations can cause Rett syndrome.
DMD – Dystrophin: Mutations cause Duchenne muscular dystrophy.
FMR1 – Fragile X mental retardation 1: Associated with Fragile X syndrome.
G6PD – Glucose-6-phosphate dehydrogenase: Mutations cause G6PD deficiency, leading to hemolytic anemia.

In [None]:
#genes_to_remove = ['XIST', 'RPS4Y1']
#paired_adata_clean = paired_adata_clean[:, ~paired_adata_clean.var_names.isin(genes_to_remove)].copy()

## SUMMARY 
- We used the sex annotation that was given in the respective datasets.
- After removing the four donors whose XIST/RPS4Y1 expression contradicted their label, these annotations were confirmed by our analysis, as no Y genes were detected in the female cohort.

### 04.Compute pseudobulk

In [None]:
path = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp/out/011_analysis_paired_remove_xy/pseudobulk/"

In [None]:
# Get pseudo-bulk profile
pdata = dc.get_pseudobulk(
    paired_adata_clean,
    sample_col='sample',
    groups_col='donor_id',
    layer='count',
    min_cells=0,
    min_counts=0
)

In [None]:
pdata.obs["age"] = pdata.obs["age"].astype("float64")
pdata.obs["is_primary_data"] = pdata.obs["is_primary_data"].astype(bool)


In [None]:
#pdata.write_h5ad(f"{path}/out/010_analysis_paired_include_guon/tables/input/pdata.h5ad")

In [None]:
#pdata = sc.read_h5ad(f"{path}/out/008_analysis_paired_luad/tables/input/pdata.h5ad")

In [None]:
# Show the pseudobulk AnnData summary
pdata

In [None]:
# Per-sample pseudobulk QC plot, grouped by dataset and by platform
dc.plot_psbulk_samples(pdata, groupby=['dataset', 'platform'], figsize=(12, 4))

In [None]:
# Disabled: drop the Guo_Zhang_2018 dataset (Smart-seq)
#pdata = pdata[pdata.obs["dataset"]!="Guo_Zhang_2018"] ## SMART SEQ

In [None]:
#pdata

In [None]:
#dc.plot_psbulk_samples(pdata, groupby=['dataset', 'platform'], figsize=(12, 4))

In [None]:
# Gene-filter diagnostic with low thresholds (min_count=10, min_total_count=15)
dc.plot_filter_by_expr(pdata, group='sample', min_count=10, min_total_count=15)

In [None]:
# Same diagnostic with the much stricter thresholds used for the actual filter
# in the next step (min_count=5000, min_total_count=10000)
dc.plot_filter_by_expr(pdata, group='sample', large_n=100,min_prop=0.9, min_count=5000, min_total_count=10000)

I set this threshold for the DS analysis to avoid having zero-inflated data.

In [None]:
# Obtain genes that pass the thresholds 
genes = dc.filter_by_expr(pdata, group='sample', min_count=5000, min_total_count=10000)

# Filter by these genes
pdata = pdata[:, genes].copy()
pdata

In [None]:
pdata.layers["counts"] = pdata.X

In [None]:
scales_counts = sc.pp.normalize_total(pdata, target_sum=None, inplace=False)
# log1p transform
pdata.layers["log1p_norm"] = sc.pp.log1p(scales_counts["X"], copy=True)

In [None]:
pdata

In [None]:
### Samplesheet
samplesheet = pdata.obs.copy()
samplesheet["sample"]=samplesheet.index
samplesheet.reset_index(inplace=True)

In [None]:
samplesheet.to_csv(f"{path}samplesheet.csv", index=False)
samplesheet.to_csv(f"{path}samplesheet.tsv", index=False, sep = "\t")

In [None]:
bulk_df = pdata.to_df().T

In [None]:
### Raw counts
bulk_df = pdata.to_df().T
#bulk_df["gene_id"]=bulk_df.index
bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")

In [None]:
bulk_df.to_csv(f"{path}counts.csv")    

In [None]:
### log1p_norm counts
pdata_log1p_norm = pd.DataFrame(pdata.layers["log1p_norm"], 
    pdata.to_df().index, 
    pdata.to_df().columns)
bulk_df = pdata_log1p_norm.T
#bulk_df["gene_id"]=bulk_df.index
bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")
bulk_df.to_csv(f"{path}log1p_norm_counts.csv")    

### 05.Create separate counts and samplesheets for tumor and normal for DESeq2

In [None]:
pdata_normal = pdata[pdata.obs["origin"]=="normal_adjacent"]
pdata_tumor = pdata[pdata.obs["origin"]=="tumor_primary"]

In [None]:
pdata_normal

In [None]:
pdata_list = [pdata_normal,pdata_tumor]
pdata_list_name = ["normal","tumor"]

for index, dataframe in enumerate(pdata_list):
    print(pdata_list_name[index])
    # Perform the operations
    samplesheet = dataframe.obs.copy()
    samplesheet["sample"] = samplesheet.index
    samplesheet.reset_index(inplace=True)
  
    samplesheet.to_csv(f"{path}samplesheet_{pdata_list_name[index]}.csv", index=False)
    samplesheet.to_csv(f"{path}samplesheet_{pdata_list_name[index]}.tsv", index=False, sep="\t")


    ### Raw counts
    bulk_df = dataframe.to_df().T
    #bulk_df["gene_id"]=bulk_df.index
    bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")
    bulk_df.to_csv(f"{path}counts_{pdata_list_name[index]}.csv")  
    ### log1p_norm counts
    pdata_log1p_norm = pd.DataFrame(dataframe.layers["log1p_norm"], 
        dataframe.to_df().index, 
        dataframe.to_df().columns)
    bulk_df = pdata_log1p_norm.T
    #bulk_df["gene_id"]=bulk_df.index
    bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")
    bulk_df.to_csv(f"{path}/log1p_norm_counts_{pdata_list_name[index]}.csv")    