# 1.NSCLC Sex-stratified SC Analysis

## Data selection, pseudobulk, create input for DESEQ2

In [None]:
import warnings
import numpy as np
import pandas as pd 
import scanpy as sc
#import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
#import altair as alt
#import pertpy as pt
#from pandas.api.types import is_categorical_dtype
#from pandas import CategoricalDtype

In [None]:
import decoupler as dc

In [None]:
# Display the version string of the imported decoupler package.
dc.__version__

In [None]:
# Project root directory; the input file and every output location are built from it.
path = "/data/projects/2023/LCBiome/nsclc_gender_atlas_tmp"
# AnnData file that is loaded as the starting object.
input_path = path + "/data/local.h5ad"

In [None]:
# Load the AnnData object that the rest of the notebook filters and aggregates.
adata = sc.read_h5ad(filename=input_path)

In [None]:
# Number of observations per dataset in the freshly loaded object.
adata.obs["dataset"].value_counts()

In [None]:
# Table read from the 007_re_analysis outputs; in the visible cells it is only
# referenced by a commented-out concat further down.
chrom_table_path = f"{path}/out/007_re_analysis/tables/input/adata_var_nsclc_chrom.csv"
nsclc_chrom = pd.read_csv(chrom_table_path)

In [None]:
# Display the loaded object's summary.
adata

In [None]:
# Observations per disease label before filtering.
adata.obs["disease"].value_counts()

In [None]:
# Observations per tumor stage before filtering.
adata.obs["tumor_stage"].value_counts()

In [None]:
# Observations per sex label before filtering.
adata.obs["sex"].value_counts()

In [None]:
# Observations per sample origin before filtering.
adata.obs["origin"].value_counts()

In [None]:
# Observations per dataset, shown again right before the filtering section.
adata.obs["dataset"].value_counts()

### 01.SELECT ONLY male, female & tumor, normal_adjacent

In [None]:
# Drop observations whose disease label is "normal" or COPD.
excluded_diseases = ["normal", "chronic obstructive pulmonary disease"]
adata = adata[~adata.obs["disease"].isin(excluded_diseases)]

In [None]:
# Keep only observations that have a tumor stage recorded.
has_tumor_stage = ~adata.obs["tumor_stage"].isna()
adata = adata[has_tumor_stage]

In [None]:
# Keep only observations labelled male or female.
adata = adata[adata.obs["sex"].isin(["male", "female"])]

In [None]:
# Keep only primary-tumor and adjacent-normal observations.
adata = adata[adata.obs["origin"].isin(["tumor_primary", "normal_adjacent"])]

In [None]:
# Observations per disease label after the filters above.
adata.obs["disease"].value_counts()

In [None]:
# Observations per tumor stage after the filters above.
adata.obs["tumor_stage"].value_counts()

In [None]:
# Observations per sex label after the filters above.
adata.obs["sex"].value_counts()

In [None]:
# Observations per sample origin after the filters above.
adata.obs["origin"].value_counts()

In [None]:
# Observations per dataset after the filters above.
adata.obs["dataset"].value_counts()

### 02. CELL TYPE RE-CLASSIFICATION

In [None]:
# Observations per fine-grained cell-type label.
adata.obs["cell_type"].value_counts()

In [None]:
# Observations per major cell-type label.
adata.obs["cell_type_major"].value_counts()

In [None]:
# Original `cell_type` labels retained for the analysis; some of them are
# merged into coarser labels in a later cell.
# NOTE(review): both 'neutrophil' and 'neutrophils' are listed, but the later
# relabelling does not merge them -- confirm whether 'neutrophils' occurs in
# the data and, if so, whether it should be folded into 'neutrophil'.
cells_of_interest = [
    'alveolar macrophage',
    'neutrophil',
    'macrophage',
    'malignant cell',
    'B cell',
    'regulatory T cell',
    'CD4-positive, alpha-beta T cell',
    'CD8-positive, alpha-beta T cell',
    'neutrophils',
    'epithelial cell of lung',
    'multi-ciliated epithelial cell',
]
# .copy() turns the subset into a standalone AnnData instead of a view of
# `adata`: the relabelling further down assigns to `.obs`, and writing into a
# view makes anndata copy implicitly and emit an ImplicitModificationWarning.
subset_adata = adata[adata.obs["cell_type"].isin(cells_of_interest)].copy()


In [None]:
# Distinct cell-type labels present in the subset before relabelling.
set(subset_adata.obs["cell_type"])

In [None]:
# Distinct major cell-type labels present in the subset.
set(subset_adata.obs["cell_type_major"])

In [None]:
# Collapse fine-grained cell-type labels into the coarser labels used
# downstream. Labels that are not keys of the mapping are kept unchanged.
cell_type_relabel = {
    'epithelial cell of lung': 'epithelial cell',
    'multi-ciliated epithelial cell': 'epithelial cell',
    'alveolar macrophage': 'macrophage',
    'CD4-positive, alpha-beta T cell': 'CD4+ T cell',
    'CD8-positive, alpha-beta T cell': 'CD8+ T cell',
}

# Assigning to `.obs` of a view triggers an implicit copy plus a warning in
# anndata; make the copy explicit when needed.
if subset_adata.is_view:
    subset_adata = subset_adata.copy()

# Relabel in a single pass and rebuild the categorical explicitly. Merging
# categories via Series.replace on a categorical column is deprecated in
# recent pandas, and the four chained replace calls each rewrote the column.
subset_adata.obs["cell_type"] = pd.Categorical(
    [cell_type_relabel.get(label, label) for label in subset_adata.obs["cell_type"]]
)

In [None]:
# Distinct cell-type labels after relabelling.
set(subset_adata.obs["cell_type"])

In [None]:
# Display the summary of the cell-type subset.
subset_adata

In [None]:
# Observations per disease label in the cell-type subset.
subset_adata.obs["disease"].value_counts()

In [None]:
# Observations per dataset in the cell-type subset.
subset_adata.obs["dataset"].value_counts()

In [None]:
# Same per-dataset tally as the previous cell (repeated in the notebook).
subset_adata.obs["dataset"].value_counts()

## PAIRED SAMPLES REMOVE NSCLC 

In [None]:
# Remove observations whose disease label is the generic NSCLC term.
is_generic_nsclc = subset_adata.obs["disease"] == "non-small cell lung carcinoma"
subset_adata = subset_adata[~is_generic_nsclc]

In [None]:
# Observations per dataset after removing the generic NSCLC label.
subset_adata.obs["dataset"].value_counts()

In [None]:
# Remove observations labelled squamous cell lung carcinoma.
# NOTE(review): the notebook's first summary says LUAD and LUSC are both kept,
# yet this line drops the squamous label -- confirm which one is intended.
is_squamous = subset_adata.obs["disease"] == "squamous cell lung carcinoma"
subset_adata = subset_adata[~is_squamous]

In [None]:
# Observations per dataset after removing the squamous label.
subset_adata.obs["dataset"].value_counts()

In [None]:
# Non-null counts of every obs column for each (origin, donor_id) pair; the
# 'sample' column of this table is what the following cells test against zero.
pairing_keys = ["origin", "donor_id"]
df_grouped = subset_adata.obs.groupby(by=pairing_keys).count()

In [None]:
# Turn the (origin, donor_id) index levels back into ordinary columns.
df_grouped = df_grouped.reset_index()

In [None]:
# Rows of the grouped table that belong to adjacent-normal tissue.
df_grouped_normal = df_grouped.loc[df_grouped["origin"] == "normal_adjacent"]

In [None]:
# Discard donors whose 'sample' count for this origin is zero.
df_grouped_normal = df_grouped_normal.loc[df_grouped_normal["sample"] != 0]

In [None]:
# Rows of the grouped table that belong to primary tumor tissue.
df_grouped_tumor = df_grouped.loc[df_grouped["origin"] == "tumor_primary"]

In [None]:
# Discard donors whose 'sample' count for this origin is zero.
df_grouped_tumor = df_grouped_tumor.loc[df_grouped_tumor["sample"] != 0]

In [None]:
# Distinct donor ids that have tumor observations.
list(set(df_grouped_tumor["donor_id"]))

In [None]:
# Donors represented in both the adjacent-normal and the tumor tables.
donors_with_normal = set(df_grouped_normal["donor_id"])
donors_with_tumor = set(df_grouped_tumor["donor_id"])
common_ids = list(donors_with_normal & donors_with_tumor)

In [None]:
# Number of donors that have both origins.
len(common_ids)

In [None]:
# Restrict to the paired donors and materialise the result as its own object.
is_paired_donor = subset_adata.obs["donor_id"].isin(common_ids)
paired_adata = subset_adata[is_paired_donor].copy()

In [None]:
# Number of distinct donors per origin, as a two-column table for plotting.
df = paired_adata.obs
origin_counts = (
    df.groupby('origin')['donor_id']
    .nunique()
    .reset_index()
    .set_axis(['origin', 'count'], axis=1)
)

In [None]:
# Bar chart of the distinct-donor count for each origin.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=origin_counts, x='origin', y='count', palette='viridis', ax=ax)
ax.set_xlabel('Origin')
ax.set_ylabel('Number of Unique Donor IDs')
ax.set_title('Number of Unique Donor IDs per Origin')
plt.show()

In [None]:
# Number of distinct donors per disease label. The result reuses the name
# `origin_counts` because the plotting cell that follows reads that variable.
df = paired_adata.obs
origin_counts = (
    df.groupby('disease')['donor_id']
    .nunique()
    .reset_index()
    .set_axis(['disease', 'count'], axis=1)
)

In [None]:
# Bar chart of the distinct-donor count for each disease label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=origin_counts, x='disease', y='count', palette='viridis', ax=ax)
ax.set_xlabel('disease')
ax.set_ylabel('Number of Unique Donor IDs')
ax.set_title('Number of Unique Donor IDs per disease')
plt.show()

## SUMMARY 
- Filtered for male, female & tumor, normal_adjacent
- Filtered and reclassified cells of interest: 'B cell', 'CD4+ T cell', 'CD8+ T cell', 'epithelial cell', 'macrophage', 'malignant cell', 'neutrophil', 'regulatory T cell'
- Filtered to exclude samples coming from NSCLC
- Filtered for PAIRED SAMPLES (82 donor_id that have normal_adjacent and tumor_primary  samples)
- KEEP LUAD AND LUSC

### 04.Compute pseudobulk

In [None]:
# Build pseudo-bulk profiles from the paired object with decoupler.
# Both minimum thresholds are passed as 0 here; gene-level filtering is done
# in a later cell.
# NOTE(review): `groups_col` is 'donor_id', so the cell-type labels prepared
# earlier are not used for aggregation -- confirm this is intended.
# NOTE(review): the layer read here is named 'count', whereas a later cell
# writes a layer named 'counts' -- confirm the name matches the input file.
pseudobulk_options = {
    "sample_col": 'sample',
    "groups_col": 'donor_id',
    "layer": 'count',
    "min_cells": 0,
    "min_counts": 0,
}
pdata = dc.get_pseudobulk(paired_adata, **pseudobulk_options)

In [None]:
#pdata_var_concat =pd.concat([pdata.var, nsclc_chrom],axis=1)

In [None]:
#pdata_var_concat

In [None]:
# Cast two metadata columns to explicit dtypes.
# NOTE(review): astype(bool) maps every non-empty string to True -- verify
# that `is_primary_data` does not hold the strings "True"/"False".
for obs_column, target_dtype in (("age", "float64"), ("is_primary_data", bool)):
    pdata.obs[obs_column] = pdata.obs[obs_column].astype(target_dtype)


In [None]:
#pdata.write_h5ad(f"{path}/out/010_analysis_paired_include_guon/tables/input/pdata.h5ad")

In [None]:
#pdata = sc.read_h5ad(f"{path}/out/008_analysis_paired_luad/tables/input/pdata.h5ad")

In [None]:
# Display the summary of the pseudo-bulk object.
pdata

In [None]:
# Pseudo-bulk sample QC plot, grouped by dataset and by platform.
qc_groupings = ['dataset', 'platform']
dc.plot_psbulk_samples(pdata, groupby=qc_groupings, figsize=(12, 4))

In [None]:
#pdata = pdata[pdata.obs["dataset"]!="Guo_Zhang_2018"] ## SMART SEQ 

In [None]:
#pdata

In [None]:
#dc.plot_psbulk_samples(pdata, groupby=['dataset', 'platform'], figsize=(12, 4))

In [None]:
# Preview the gene filter with low thresholds.
dc.plot_filter_by_expr(
    pdata,
    group='sample',
    min_count=10,
    min_total_count=15,
)

In [None]:
# Preview the gene filter with the stricter thresholds.
dc.plot_filter_by_expr(
    pdata,
    group='sample',
    large_n=100,
    min_prop=0.9,
    min_count=5000,
    min_total_count=10000,
)

I set this threshold for the DS analysis to avoid having zero-inflated data.

In [None]:
# Genes that pass the expression thresholds.
# NOTE(review): the preview cell before this one also passes large_n=100 and
# min_prop=0.9, which are not passed here -- confirm the applied filter is the
# one that was inspected.
genes = dc.filter_by_expr(
    pdata,
    group='sample',
    min_count=5000,
    min_total_count=10000,
)

# Keep only those genes and detach the result from the unfiltered object.
pdata = pdata[:, genes].copy()
pdata

In [None]:
# Keep the pseudo-bulk counts in their own layer. The copy prevents the layer
# from sharing memory with `pdata.X`, so a later in-place operation on `X`
# cannot silently change the stored counts.
pdata.layers["counts"] = pdata.X.copy()

In [None]:
# Library-size normalisation computed out of place (`pdata.X` is not modified);
# the returned mapping holds the normalised matrix under "X".
scales_counts = sc.pp.normalize_total(pdata, inplace=False, target_sum=None)
normalised_matrix = scales_counts["X"]
# log1p of the normalised matrix, stored as a separate layer.
pdata.layers["log1p_norm"] = sc.pp.log1p(normalised_matrix, copy=True)

In [None]:
# Display the pseudo-bulk object again, now with the added layers.
pdata

In [None]:
### Samplesheet
# One row per pseudo-bulk profile: the obs table with its index written into
# the 'sample' column, then the index itself moved into a regular column.
samplesheet = pdata.obs.assign(sample=pdata.obs.index).reset_index()

In [None]:
# Write the samplesheet twice: comma-separated and tab-separated.
samplesheet_stem = f"{path}/out/010_analysis_paired_include_guon/tables/input/samplesheet"
samplesheet.to_csv(samplesheet_stem + ".csv", index=False)
samplesheet.to_csv(samplesheet_stem + ".tsv", index=False, sep="\t")

In [None]:
# Genes-by-samples table of `pdata.X` (the next cell rebuilds the same table).
bulk_df = pdata.to_df().transpose()

In [None]:
### Raw counts
# Genes-by-samples table of `pdata.X`.
bulk_df = pdata.to_df().T
#bulk_df["gene_id"]=bulk_df.index
# Replace '.', '-' and spaces in the sample names with '_'.
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, so "[.-]" would never match and dots/dashes
# would be left in place.
bulk_df.columns = bulk_df.columns.str.replace(r"[.\- ]", "_", regex=True)

In [None]:
# Write the raw-count table (index included) for the downstream analysis.
counts_csv = f"{path}/out/010_analysis_paired_include_guon/tables/input/counts.csv"
bulk_df.to_csv(counts_csv)

In [None]:
### log1p_norm counts
# Samples-by-genes frame of the "log1p_norm" layer, read directly through
# `to_df(layer=...)` instead of building two throwaway frames for the labels.
pdata_log1p_norm = pdata.to_df(layer="log1p_norm")
bulk_df = pdata_log1p_norm.T
#bulk_df["gene_id"]=bulk_df.index
# Replace '.', '-' and spaces in the sample names with '_'.
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, so "[.-]" would never match.
bulk_df.columns = bulk_df.columns.str.replace(r"[.\- ]", "_", regex=True)
bulk_df.to_csv(f"{path}/out/010_analysis_paired_include_guon/tables/input/log1p_norm_counts.csv")

### 0.4 Create separate counts and samplesheets for tumor and normal as DESeq2 input

In [None]:
# Split the pseudo-bulk profiles by tissue origin.
profile_origin = pdata.obs["origin"]
pdata_normal = pdata[profile_origin == "normal_adjacent"]
pdata_tumor = pdata[profile_origin == "tumor_primary"]

In [None]:
# Display the summary of the adjacent-normal subset.
pdata_normal

In [None]:
# Write a samplesheet, a raw-count table and a log1p-normalised table for each
# origin-specific subset. File names carry the subset label as a suffix.
pdata_list = [pdata_normal, pdata_tumor]
pdata_list_name = ["normal", "tumor"]
output_dir = f"{path}/out/010_analysis_paired_include_guon/tables/input"

for subset_name, dataframe in zip(pdata_list_name, pdata_list):
    print(subset_name)

    # Samplesheet: obs table with its index written into the 'sample' column.
    samplesheet = dataframe.obs.copy()
    samplesheet["sample"] = samplesheet.index
    samplesheet.reset_index(inplace=True)

    samplesheet.to_csv(f"{output_dir}/samplesheet_{subset_name}.csv", index=False)
    samplesheet.to_csv(f"{output_dir}/samplesheet_{subset_name}.tsv", index=False, sep="\t")

    ### Raw counts
    bulk_df = dataframe.to_df().T
    #bulk_df["gene_id"]=bulk_df.index
    # regex=True is required: since pandas 2.0 `str.replace` matches literally
    # by default, so the character class would otherwise never match.
    bulk_df.columns = bulk_df.columns.str.replace(r"[.\- ]", "_", regex=True)
    bulk_df.to_csv(f"{output_dir}/counts_{subset_name}.csv")

    ### log1p_norm counts
    pdata_log1p_norm = dataframe.to_df(layer="log1p_norm")
    bulk_df = pdata_log1p_norm.T
    #bulk_df["gene_id"]=bulk_df.index
    bulk_df.columns = bulk_df.columns.str.replace(r"[.\- ]", "_", regex=True)
    bulk_df.to_csv(f"{output_dir}/log1p_norm_counts_{subset_name}.csv")

## SUMMARY 
- Filtered for male, female & tumor, normal_adjacent
- Filtered and reclassified cells of interest: 'B cell', 'CD4+ T cell', 'CD8+ T cell', 'epithelial cell', 'macrophage', 'malignant cell', 'neutrophil', 'regulatory T cell'
- Filtered to exclude samples coming from NSCLC
- Filtered for PAIRED SAMPLES (82 donor_id that have normal_adjacent and tumor_primary  samples)
 