# LIANA tumor vs normal core atlas v0

## Libraries

In [1]:
import numpy as  np
import pandas as pd
import scanpy as sc
import decoupler as dc
import liana as li
from liana.method import singlecellsignalr, connectome, cellphonedb, natmi, logfc, cellchat, geometric_mean
import sc_atlas_helpers as ah
from tqdm.auto import tqdm
import contextlib
import os
import statsmodels.stats.multitest
import numpy as np
from anndata import AnnData
import scipy.sparse

  from .autonotebook import tqdm as notebook_tqdm


## Define variables, paths and comparison tumor vs normal

In [2]:
# Analysis configuration.
# `comparison` encodes "<perturbation>_<baseline>"; it is split on "_" in the next cell.
comparison = "tumor_normal"
# Label of the atlas subset this notebook refers to.
subset = "core_atlas"
# Cell type of interest and number of top DE ligands, both used in the plot title / file name.
cell_type_oi = "Cancer cell"
n_top_ligands = 30

In [3]:
# Derive the upper-case group labels from the "<perturbation>_<baseline>" comparison name.
comparison_parts = comparison.split("_")
perturbation = comparison_parts[0].upper()
baseline = comparison_parts[1].upper()

# The title is built first so that it keeps the human-readable cell-type name (with spaces).
title_plot = f"{perturbation} vs {baseline}: {cell_type_oi}, top {n_top_ligands} DE ligands"

# From here on `cell_type_oi` holds the space-free form that goes into the file name.
cell_type_oi = cell_type_oi.replace(" ", "")
save_name_plot = f"{perturbation}_vs_{baseline}_{cell_type_oi}_top_{n_top_ligands}_DE_ligands"

In [4]:
# Both locations live under the same versioned results tree.
_results_root = "/data/projects/2022/CRCA/results/v1/"
# Directory containing the prepared input artifacts.
dataDir = _results_root + "downstream_analyses/Prepare_de_analysis/artifacts/"
# Directory that receives the h5ad output of this notebook.
resDir = _results_root + "final/liana_cell2cell/h5ads/updated/"

In [5]:
# Echo the output directory so the target location is visible in the notebook.
resDir

'/data/projects/2022/CRCA/results/v1/final/liana_cell2cell/h5ads/updated/'

In [6]:
# Load the tumor/normal AnnData object from the artifacts directory configured in `dataDir`.
adata = sc.read_h5ad(os.path.join(dataDir, "paired_tumor_normal-adata.h5ad"))

In [7]:
# Display the AnnData summary (dimensions and available annotation columns).
adata

AnnData object with n_obs × n_vars = 2332515 × 28476
    obs: 'dataset', 'medical_condition', 'cancer_type', 'sample_id', 'sample_type', 'tumor_source', 'replicate', 'sample_tissue', 'anatomic_region', 'anatomic_location', 'tumor_stage', 'tumor_stage_TNM', 'tumor_stage_TNM_T', 'tumor_stage_TNM_N', 'tumor_stage_TNM_M', 'tumor_size', 'tumor_dimensions', 'tumor_grade', 'histological_type', 'microsatellite_status', 'mismatch_repair_deficiency_status', 'MLH1_promoter_methylation_status', 'MLH1_status', 'KRAS_status', 'BRAF_status', 'APC_status', 'TP53_status', 'PIK3CA_status', 'SMAD4_status', 'NRAS_status', 'MSH6_status', 'FBXW7_status', 'NOTCH1_status', 'MSH2_status', 'PMS2_status', 'POLE_status', 'ERBB2_status', 'STK11_status', 'HER2_status', 'CTNNB1_status', 'BRAS_status', 'patient_id', 'sex', 'age', 'ethnicity', 'treatment_status_before_resection', 'treatment_drug', 'treatment_response', 'RECIST', 'platform', 'platform_fine', 'cellranger_version', 'reference_genome', 'matrix_type', 'enr

In [8]:
# Disabled: optional backup of the unmodified object before the label edits below.
# NOTE(review): presumably left off to avoid duplicating the ~2.3M-observation object in memory — confirm.
#adata_original = adata.copy()

In [9]:
# Number of observations per sample type (tumor vs normal).
adata.obs.sample_type.value_counts()

sample_type
tumor     1557400
normal     775115
Name: count, dtype: int64

In [10]:
# Number of observations per coarse cell-type label.
adata.obs.cell_type_coarse.value_counts()

cell_type_coarse
Plasma_cell             362084
T_cell_CD8              355772
Cancer_non_stem_like    323487
T_cell_CD4              264011
Cancer_stem_like        251557
B_cell                  171676
Fibroblast              133488
Macrophage               86680
Endothelial_cell         73950
Granulocyte              72035
Treg                     62290
Pericyte                 44792
Monocyte                 44221
Dendritic_cell           27209
gamma_delta              19471
NK                       14523
Schwann_cell             13397
Tuft                      7040
ILC                       2541
Enteroendocrine           1909
NKT                        382
Name: count, dtype: int64

In [11]:
# Number of observations per intermediate-resolution cell-type label.
adata.obs.cell_type_middle.value_counts()

cell_type_middle
Cancer cell              386113
Plasma cell              362084
CD8                      355772
CD4                      264011
B cell                   171676
Fibroblast               133488
Epithelial cell          114527
Macrophage                86680
Endothelial cell          73950
Treg                      62290
Pericyte                  44792
Monocyte                  44221
Goblet                    38764
Epithelial progenitor     35640
Neutrophil                34652
Mast cell                 29639
Dendritic cell            27209
gamma-delta               19471
NK                        14523
Schwann cell              13397
Eosinophil                 7744
Tuft                       7040
ILC                        2541
Enteroendocrine            1909
NKT                         382
Name: count, dtype: int64

In [12]:
# Number of observations per fine-resolution cell-type label (the column later used as LIANA grouping).
adata.obs.cell_type_fine.value_counts()

cell_type_fine
CD8                       349274
Plasma IgA                289016
CD4                       259607
Cancer Colonocyte-like    137590
Cancer TA-like            119040
B cell activated          101749
Colonocyte                 99415
Cancer Crypt-like          96877
Macrophage                 82153
Fibroblast S3              75590
Treg                       62290
Plasma IgG                 60684
Pericyte                   44792
Goblet                     38764
Neutrophil                 34274
B cell naive               34015
Endothelial arterial       33982
Endothelial venous         33713
Monocyte classical         32405
Cancer Goblet-like         30802
Mast cell                  29639
Fibroblast S1              29469
Fibroblast S2              28429
TA progenitor              26512
GC B cell                  23381
gamma-delta                19471
Colonocyte BEST4           15112
NK                         14523
Schwann cell               13397
B cell memory              1

In [13]:
# Distinct fine cell-type labels before the cancer subtypes are merged.
set(adata.obs.cell_type_fine)


{'B cell activated',
 'B cell memory',
 'B cell naive',
 'CD4',
 'CD4 cycling',
 'CD8',
 'CD8 cycling',
 'Cancer BEST4',
 'Cancer Colonocyte-like',
 'Cancer Crypt-like',
 'Cancer Goblet-like',
 'Cancer TA-like',
 'Colonocyte',
 'Colonocyte BEST4',
 'Crypt cell',
 'DC mature',
 'DC3',
 'Endothelial arterial',
 'Endothelial lymphatic',
 'Endothelial venous',
 'Enteroendocrine',
 'Eosinophil',
 'Fibroblast S1',
 'Fibroblast S2',
 'Fibroblast S3',
 'GC B cell',
 'Goblet',
 'Granulocyte progenitor',
 'ILC',
 'Macrophage',
 'Macrophage cycling',
 'Mast cell',
 'Monocyte classical',
 'Monocyte non-classical',
 'Myeloid progenitor',
 'NK',
 'NKT',
 'Neutrophil',
 'Pericyte',
 'Plasma IgA',
 'Plasma IgG',
 'Plasma IgM',
 'Plasmablast',
 'Schwann cell',
 'TA progenitor',
 'Treg',
 'Tuft',
 'cDC1',
 'cDC2',
 'gamma-delta',
 'pDC'}

In [14]:
# Collapse all cancer subtypes of `cell_type_fine` into the single label "Cancer cell".
# The result is assigned back to the column on purpose: calling
# `.replace(..., inplace=True)` on `adata.obs[...]` is a chained in-place update, which
# pandas warns about and which no longer modifies `adata.obs` under Copy-on-Write.
cancer_subtype_to_label = {
    'Cancer BEST4': 'Cancer cell',
    'Cancer Colonocyte-like': 'Cancer cell',
    'Cancer Crypt-like': 'Cancer cell',
    'Cancer Goblet-like': 'Cancer cell',
    'Cancer TA-like': 'Cancer cell'
}
adata.obs['cell_type_fine'] = adata.obs['cell_type_fine'].replace(cancer_subtype_to_label)

In [15]:
# Distinct fine cell-type labels after the merge; the cancer subtypes should now appear as "Cancer cell".
set(adata.obs.cell_type_fine)


{'B cell activated',
 'B cell memory',
 'B cell naive',
 'CD4',
 'CD4 cycling',
 'CD8',
 'CD8 cycling',
 'Cancer cell',
 'Colonocyte',
 'Colonocyte BEST4',
 'Crypt cell',
 'DC mature',
 'DC3',
 'Endothelial arterial',
 'Endothelial lymphatic',
 'Endothelial venous',
 'Enteroendocrine',
 'Eosinophil',
 'Fibroblast S1',
 'Fibroblast S2',
 'Fibroblast S3',
 'GC B cell',
 'Goblet',
 'Granulocyte progenitor',
 'ILC',
 'Macrophage',
 'Macrophage cycling',
 'Mast cell',
 'Monocyte classical',
 'Monocyte non-classical',
 'Myeloid progenitor',
 'NK',
 'NKT',
 'Neutrophil',
 'Pericyte',
 'Plasma IgA',
 'Plasma IgG',
 'Plasma IgM',
 'Plasmablast',
 'Schwann cell',
 'TA progenitor',
 'Treg',
 'Tuft',
 'cDC1',
 'cDC2',
 'gamma-delta',
 'pDC'}

## Pseudobulk

In [16]:
## Keep only observations whose sample_type is tumor or normal.
# `.copy()` materialises the subset as an independent AnnData object instead of a view.
# Later cells modify the object in place (new `.obs` column, new layer); doing that on a
# view makes AnnData silently convert it into a copy and emit a warning, so the copy is
# made explicitly here.
adata = adata[adata.obs.sample_type.isin(["tumor","normal"])].copy()

In [17]:
# Add a constant column: every observation gets the value "epithelial_cancer".
# NOTE(review): `cell_type_new` is not read by any cell visible in this notebook section —
# confirm it is needed downstream, and that a single label for all cell types is intended.
adata.obs["cell_type_new"] =  "epithelial_cancer"



In [18]:
## Filter adata to have paired samples only
# Disabled: this cell does not run. If re-enabled, it would keep only patients that have
# at least two distinct sample_type values.
# NOTE(review): the input file is named "paired_tumor_normal-adata.h5ad", so pairing was
# presumably done upstream — confirm before deleting this cell.
#filtered_indices = adata.obs.groupby('patient_id').filter(lambda x: len(set(x['sample_type'])) >= 2).index
#adata = adata[filtered_indices] #paired data

In [19]:
# Pseudobulk
# Disabled: this cell does not run. If re-enabled, it would build one decoupler pseudobulk
# object per sample_type value (summing the "counts" layer per sample_id) and collect them
# as (group, pseudobulk) tuples in `pseudobulk`.
#groups_col ="sample_type" # tumor vs normal 
#sample_col="sample_id" 
#layer="counts"
#pseudobulk = [
#    (
#        group,
#        dc.get_pseudobulk(
#            adata[adata.obs[groups_col] == group],
#            sample_col=sample_col,
#            groups_col=[groups_col,"cell_type_fine","patient_id","dataset"],
#            layer=layer,
#            mode="sum",
#            min_prop=0.05,
#            min_cells=10,
#            min_counts=500,
#            min_smpls=10,
#        ),
#    )
#    for group in adata.obs[groups_col].unique()
#]

In [20]:
## Create count matrix and samplesheet for each sample_type: tumor & normal
# Disabled: this cell does not run and depends on `pseudobulk` from the (also disabled)
# previous cell. If re-enabled, it would write colData, rowData and the count matrix as CSV
# files under f"{resDir}/02_pseudobulk/" for every group.
# NOTE(review): `break` stops the loop at the first group with <= 5 samples, so later groups
# are never written; `continue` may be what was intended. The message also says "Cell type"
# although `group` iterates over sample_type values.
#for group, pdata in pseudobulk:
#    group = group.replace(" ","_")
#    if pdata.obs["sample_id"].nunique() <= 5:
#        print(f"Cell type {group} does not have samples in all groups")
#        break
#    else:
#        pdata.var_names.name = "gene_id"
#
#        colData = pdata.obs
#        colData.index.name = "sample_col"
#
#        colData.to_csv(f"{resDir}/02_pseudobulk/{group}_colData.csv")
#        rowData = pdata.var[["Geneid", "GeneSymbol", "Chromosome", "Class", "Length"]]
#        rowData.to_csv(f"{resDir}/02_pseudobulk/{group}_rowData.csv")
#        count_mat = pdata.to_df().T
#        count_mat.index.name = "gene_id"
#        count_mat.to_csv(f"{resDir}/02_pseudobulk/{group}_count_mat.csv")

## LIANA – rank aggregate

In [None]:
# Build a separate "log1p_norm" layer from a copy of `adata.X`; the two scanpy calls below
# are given `layer="log1p_norm"` and therefore operate on that layer.
# NOTE(review): assumes `adata.X` holds raw counts at this point — TODO confirm.
adata.layers["log1p_norm"] = adata.X.copy()
# Scale every observation in the layer to a total of 1e6.
sc.pp.normalize_total(adata, target_sum=1e6, layer="log1p_norm")
# log(1 + x) transform of the layer using logarithm base 6.
# NOTE(review): base=6 is an unusual choice (scanpy uses the natural log when no base is
# given) — confirm it is intended; any step that reverses this log-transform has to be told
# the same base.
sc.pp.log1p(adata, base=6, layer="log1p_norm")

In [None]:
# Run LIANA's rank aggregation over the `cell_type_fine` groups with the consensus
# ligand-receptor resource, reading expression from the "log1p_norm" layer (not `.raw`)
# and storing the result under the key "rank_aggregate".
# `base=6` matches the `sc.pp.log1p(..., base=6)` call that produced the layer: LIANA uses
# `base` to reverse the log-transformation (needed for the logFC score) and otherwise
# assumes natural-log data, which would not match this layer.
li.mt.rank_aggregate(
    adata,
    groupby='cell_type_fine',
    resource_name='consensus',
    expr_prop=0.1,
    base=6,
    layer="log1p_norm",
    use_raw=False,
    key_added='rank_aggregate',
    n_jobs=6,
    verbose=True,
)

In [None]:
# Save adata together with the new rank-aggregate information.
# The output directory is created first so that a missing folder cannot make the write fail
# after the long-running aggregation step. `os.path.join` avoids the doubled slash that
# f"{resDir}/..." produces because `resDir` already ends with "/".
# The file name (including its "agregate" spelling) is kept unchanged so existing consumers
# of this file still find it.
os.makedirs(resDir, exist_ok=True)
adata.write_h5ad(os.path.join(resDir, "adata_rank_agregate.h5ad"))