# LIANA tumor vs normal core atlas

## Libraries

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
import decoupler as dc

# import liana
import liana as li
from liana.method import singlecellsignalr, connectome, cellphonedb, natmi, logfc, cellchat, geometric_mean

import sc_atlas_helpers as ah
from scanpy_helper_submodule import scanpy_helpers as sh

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Define paths

In [2]:
# Core atlas: AnnData object used by the pseudobulk section below
adata = sc.read_h5ad(
    "/data/projects/2022/CRCA/results/v1/artifacts/build_atlas/adata_tumor_strat_PB.h5ad"
)

## Define comparison: tumor vs normal

In [3]:
# Analysis configuration; the trailing comments name the other value the
# original author noted for each setting.
comparison = "tumor_normal"  # or: immune_type
subset = "core_atlas"  # or: neutrophil_subclusters

In [4]:
# Cell type of interest and number of top ligands, used in plot titles / file names
cell_type_oi, n_top_ligands = "Cancer cell", 30

In [5]:
# Results directory: fixed project root plus the configured subset and comparison
resDir = (
    "/data/projects/2022/CRCA/results/v0.1/crc-atlas-dataset/latest/ds_analyses/liana_cell2cell"
    f"/{subset}/{comparison}"
)

In [6]:
# Display the resolved results directory as a sanity check
resDir

'/data/projects/2022/CRCA/results/v0.1/crc-atlas-dataset/latest/ds_analyses/liana_cell2cell/core_atlas/tumor_normal'

In [7]:
# Build the plot title and the output file stem for the selected comparison.
# The whitespace-free cell-type tag lives in its own variable so that the
# configured `cell_type_oi` label is not overwritten as a side effect.
if comparison == "immune_type":
    title_plot = f"Patient stratification: {cell_type_oi}, top {n_top_ligands} DE ligands"
    save_name_plot = f"patient_stratification_{cell_type_oi}_top_{n_top_ligands}_DE_ligands"
elif comparison == "tumor_normal":
    # "tumor_normal" -> perturbation "TUMOR", baseline "NORMAL"
    comparison_parts = comparison.upper().split("_")
    perturbation, baseline = comparison_parts[0], comparison_parts[1]
    title_plot = f"{perturbation} vs {baseline}: {cell_type_oi}, top {n_top_ligands} DE ligands"
    cell_type_tag = cell_type_oi.replace(" ", "")
    save_name_plot = f"{perturbation}_vs_{baseline}_{cell_type_tag}_top_{n_top_ligands}_DE_ligands"
else:
    # Fail fast instead of leaving title_plot / perturbation / baseline undefined
    raise ValueError(f"Unsupported comparison: {comparison!r}")

## Pseudobulk

### Comparison: tumor vs normal

In [8]:
## Keep only tumor / normal samples and only Epithelial / Cancer cell observations.
## The final `.copy()` materialises the subset: without it `adata` is a view, and
## adding an `.obs` column to a view triggers an implicit copy with a warning.
adata = adata[adata.obs["sample_type"].isin(["tumor", "normal"])]
adata = adata[adata.obs["cell_type"].isin(["Epithelial", "Cancer cell"])].copy()


In [9]:
# Assign one shared label to every remaining cell so that epithelial and
# cancer cells are handled as a single group downstream.
adata.obs["cell_type_new"] =  "epithelial_cancer"



In [10]:
# Sanity check: distinct labels present in the new column
set(adata.obs["cell_type_new"])

{'epithelial_cancer'}

In [11]:
# Keep an independent copy of the filtered object before the paired-sample subsetting
adata_original = adata.copy()

In [12]:
## Keep paired samples only: patients contributing at least two distinct sample types
def _has_paired_samples(patient_obs):
    """Return True when one patient's rows span two or more sample types."""
    return len(set(patient_obs["sample_type"])) >= 2


filtered_indices = adata.obs.groupby("patient_id").filter(_has_paired_samples).index
adata = adata[filtered_indices]  # paired data



In [None]:
# Pseudobulk: sum the `counts` layer per sample within each `cell_type_new` group
groups_col = "cell_type_new"  # merged epithelial / cancer label
sample_col = "sample_id"
layer = "counts"


def _pseudobulk_group(group):
    """Build the per-sample pseudobulk object for one `groups_col` value."""
    return dc.get_pseudobulk(
        adata[adata.obs[groups_col] == group],
        sample_col=sample_col,
        groups_col=[groups_col, "cell_type", "patient_id", "dataset"],
        layer=layer,
        mode="sum",
        min_prop=0.05,
        min_cells=10,
        min_counts=500,
        min_smpls=10,
    )


# List of (group name, pseudobulk object) pairs
pseudobulk = [(group, _pseudobulk_group(group)) for group in adata.obs[groups_col].unique()]

In [None]:
## Write the DESeq2 inputs for every pseudobulk group:
## sample sheet (colData), gene annotation (rowData) and count matrix.
pseudobulk_dir = Path(resDir) / "epithelial_cancer" / "02_pseudobulk"
# to_csv does not create missing directories, so make sure the target exists
pseudobulk_dir.mkdir(parents=True, exist_ok=True)

for group, pdata in pseudobulk:
    group = group.replace(" ", "_")
    n_samples = pdata.obs["sample_id"].nunique()
    if n_samples <= 5:
        # Skip only this group; the remaining groups are still exported
        print(f"Cell type {group} has only {n_samples} samples (more than 5 required), skipping")
        continue

    pdata.var_names.name = "gene_id"

    # Sample sheet; the index is named after the `sample_col` DESeq2 parameter
    colData = pdata.obs
    colData.index.name = "sample_col"
    colData.to_csv(pseudobulk_dir / f"{group}_colData.csv")

    # Gene annotation columns passed on to DESeq2
    rowData = pdata.var[["Geneid", "GeneSymbol", "Chromosome", "Class", "Length"]]
    rowData.to_csv(pseudobulk_dir / f"{group}_rowData.csv")

    # Count matrix with genes in rows and samples in columns
    count_mat = pdata.to_df().T
    count_mat.index.name = "gene_id"
    count_mat.to_csv(pseudobulk_dir / f"{group}_count_mat.csv")


In [None]:
# Directory the pseudobulk CSV files are written to (displayed for reference)
"/data/projects/2022/CRCA/results/v0.1/crc-atlas-dataset/latest/ds_analyses/liana_cell2cell/core_atlas/tumor_normal/epithelial_cancer/02_pseudobulk"

In [None]:
# Merge count matrices to combine Epithelial & Cancer cell across sample types tumor & normal
#df_tumor = pd.read_csv(os.path.join(resDir,'/epithelial_cancer/pseudobulk/', f"{perturbation}_count_mat.csv"))
#df_normal = pd.read_csv(os.path.join(resDir,'/epithelial_cancer/pseudobulk/', f"{baseline}_count_mat.csv"))

#df_normal_tumor = df_normal.merge(df_tumor, how='left', on='gene_id')


## Merge samplesheet (colData)
#col_normal = pd.read_csv(os.path.join(resDir,'/epithelial_cancer/pseudobulk/',  f"{perturbation}_colData.csv"))
#col_tumor = pd.read_csv(os.path.join(resDir,'/epithelial_cancer/pseudobulk/',  f"{perturbation}_tumor_colData.csv"))
#col_normal_tumor = pd.concat([col_normal, col_tumor])

In [None]:
# Write into csv merged files for DESEQ2 comparison

#col_normal_tumor.to_csv(f"{resDir}/epithelial_cancer/pseudobulk/merged_colData.csv")
#df_normal_tumor.to_csv(f"{resDir}/epithelial_cancer/pseudobulk/count_mat.csv", index = False)

In [None]:
#Check if nan in count matrix 
#df_normal_tumor.isna().sum().sum()

In [None]:
#df_normal_tumor.shape

## TUMOR vs NORMAL

DESeq2 script: "/data/scratch/kvalem/projects/2022/differential_gene_expression/bin/03_DESeq2_DGEA_studio.R"

### Parameters for DESeq2
- input: colData, count_mat, rowData
- covariate_formula = "patient_id +"
- sample_col="sample_col" 
- cond_col="sample_type"
- sum2zero=FALSE 
- c1="tumor" 
- c2="normal"
- cpus=8

In [17]:
# Directory holding the DESeq2 output
deseq2_path_prefix = (
    "/data/projects/2022/CRCA/results/v0.1/crc-atlas-dataset/latest/ds_analyses"
    "/liana_cell2cell/core_atlas/tumor_normal/epithelial_cancer/03_deseq2"
)

In [18]:
# Name of the DESeq2 result table inside `deseq2_path_prefix`
file_name_deseq2_out = (
    "Cancer_Epithelial_tumor_vs_normal_DESeq2_result.tsv"
)

In [19]:
# Load the DESeq2 result table, apply the helper's FDR correction and rename
# the `comparison` column to `group`.
# NOTE(review): fillna(1) replaces missing values in *every* column, not only
# in the p-value columns - confirm this is intended.
de_res = sh.util.fdr_correction(
    pd.read_csv(f"{deseq2_path_prefix}/{file_name_deseq2_out}", sep="\t").fillna(1)
).rename(columns={"comparison": "group"})

## LIANA - rank aggregate

### NEUTROPHILS

In [None]:
# Run rank_aggregate for neutrophil
#li.mt.rank_aggregate(adata_n, groupby='cell_type', expr_prop=0.1,resource_name='consensus',  verbose=True,key_added='rank_aggregate', layer = "log1p_norm", use_raw = False)

In [None]:
#adata.write_h5ad("/data/projects/2022/CRCA/results/v0.1/crc-atlas-dataset/latest/ds_analyses/liana_cell2cell/neutrophil_subclusters/adata_rank_agregate_neutrophil.h5ad")

In [None]:
# rank agregate for neutrophils
#adata_n = sc.read_h5ad(f"/data/projects/2022/CRCA/results/v0.1/crc-atlas-dataset/latest/ds_analyses/liana_cell2cell/neutrophil_subclusters/adata_rank_agregate_neutrophil.h5ad") 

###  CORE ATLAS 

In [None]:
# Run rank_aggregate
#li.mt.rank_aggregate(adata, groupby='cell_type', expr_prop=0.1,resource_name='consensus',  verbose=True,key_added='rank_aggregate', layer = "log1p_norm", use_raw = False)

In [None]:
#adata.write_h5ad("/data/projects/2022/CRCA/results/v0.1/crc-atlas-dataset/latest/ds_analyses/liana_cell2cell/core_atlas/adata_rank_agregate.h5ad")

In [20]:
# Load the stored rank-aggregate result for the core atlas.
# Note: this rebinds `adata`, replacing the object used in the pseudobulk section.
adata = sc.read_h5ad(
    "/data/projects/2022/CRCA/results/v0.1/crc-atlas-dataset/latest/ds_analyses/liana_cell2cell/core_atlas/adata_rank_agregate.h5ad"
)

In [21]:
# Re-set the cell type of interest and the number of top ligands for the plots below
cell_type_oi, n_top_ligands = "Cancer cell", 30

In [22]:
# Immune cell-type labels used to filter the interaction table
immune_cells = [
    "B cell",
    "DC mature",
    "Macrophage",
    "Mast cell",
    "Monocyte",
    "NK",
    "Neutrophil",
    "Plasma cell",
    "T cell CD4",
    "T cell CD8",
    "T cell regulatory",
    "cDC",
    "pDC",
]

In [23]:
# Same immune cell-type labels, preceded by the cancer cell label
immune_cells_cancer = [
    "Cancer cell",
    "B cell",
    "DC mature",
    "Macrophage",
    "Mast cell",
    "Monocyte",
    "NK",
    "Neutrophil",
    "Plasma cell",
    "T cell CD4",
    "T cell CD8",
    "T cell regulatory",
    "cDC",
    "pDC",
]

In [24]:
# Rank-aggregate table stored in `adata.uns`, restricted to interactions with
# specificity_rank <= 0.01. May be further filtered or modified.
cpdb_res = adata.uns["rank_aggregate"].query("specificity_rank <= 0.01")

In [25]:
# Rename the LIANA ligand / receptor columns to source / target gene symbols
cpdb_res = cpdb_res.rename(
    columns={
        "ligand_complex": "source_genesymbol",
        "receptor_complex": "target_genesymbol",
    }
)

In [26]:
# Wrap the filtered interaction table and the AnnData object in the
# scanpy_helpers `CpdbAnalysis` class.
# NOTE(review): the original comment says the class computes pseudobulk and
# cell fractions; presumably pseudobulk is built per `patient_id` - confirm
# against the helper implementation.
cpdba = sh.cell2cell.CpdbAnalysis(
    cpdb_res,
    adata,
    pseudobulk_group_by=["patient_id"],
    cell_type_column="cell_type"
)

In [37]:
# Combine the interactions with the DESeq2 results, using a 0.1 p-value cut-off
cpdb_sig_int = cpdba.significant_interactions(de_res, max_pvalue=0.1)

In [38]:
## This is the input for the CIRCOS PLOT.
## The path is derived from `resDir` so it follows the configured subset /
## comparison instead of duplicating the absolute path.
cpdb_sig_int.to_csv(f"{resDir}/epithelial_cancer.csv")

In [39]:
# Keep only rows whose `cell_type` is one of the immune cell types
cpdb_sig_int = cpdb_sig_int[cpdb_sig_int["cell_type"].isin(immune_cells)]

In [40]:
# Ligands ordered by FDR (smallest first). The number kept comes from the
# configured `n_top_ligands` rather than a hard-coded 30, and `.head()` makes
# the positional selection explicit.
top_genes = (
    cpdb_sig_int.loc[:, ["source_genesymbol", "fdr"]]
    .drop_duplicates()
    .sort_values("fdr")["source_genesymbol"]
    .head(n_top_ligands)
    .tolist()
)

In [44]:
# Title of the heatmap
title_plot = (
    f"{perturbation} vs {baseline}: "
    f"{cell_type_oi}, FDR<0.1"
)

In [50]:
# File stem for the saved heatmap. Spaces are stripped from the cell-type label
# so the output file names contain no whitespace, matching the convention of
# the save name built at the top of the notebook.
save_name_plot = f"{perturbation}_vs_{baseline}_{cell_type_oi.replace(' ', '')}_fdr_0.1"

In [49]:
# Plot only the interactions whose ligand is among the selected top genes
heatmap = cpdba.plot_result(
    cpdb_sig_int[cpdb_sig_int["source_genesymbol"].isin(top_genes)],
    title=title_plot,
    aggregate=False,
    cluster="heatmap",
    label_limit=110,
)
heatmap


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [51]:
# Save the heatmap as PNG, SVG and PDF under the figures directory
for ext in ("png", "svg", "pdf"):
    heatmap.save(f'{resDir}/figures/{save_name_plot}.{ext}')

## CIRCOS PLOT 

## This is input for CIRCOS PLOT 
input = "/data/projects/2022/CRCA/results/v0.1/crc-atlas-dataset/latest/ds_analyses/liana_cell2cell/core_atlas/tumor_normal/epithelial_cancer.csv"

Circos plot script: "/data/scratch/kvalem//projects/2022/crc-atlas/analyses/05_Liana/circosplot.Rmd"

In [52]:
# Display the results directory where the figures were written
resDir

'/data/projects/2022/CRCA/results/v0.1/crc-atlas-dataset/latest/ds_analyses/liana_cell2cell/core_atlas/tumor_normal'