# Notebook to preprocess used datasets (ESCC, CRC, breast, LUAD)

In [None]:
import os
import sys
import scanpy as sc
import numpy as np
import pandas as pd
from tqdm import tqdm
# Extend the module search path with the parent directory so that `constants`
# can be imported from there. (`sys` itself has no `append`; the search path
# list is `sys.path`.)
sys.path.append('..')
from constants import BASE_PATH_DATA

sc.settings.verbosity = 0             # verbosity: errors (0), warnings (1), info (2), hints (3)


In [None]:
from scipy.sparse import diags, issparse

def shifted_transformation(adata, y0=1):
    """
    Size-factor normalize `adata.X` and apply the shifted logarithm log(x / s + y0).

    From Twitter post https://twitter.com/Sanbomics/status/1647654042749874177?s=20
    Referring to publication by Ahlmann-Eltze & Huber.

    Ahlmann-Eltze, C., Huber, W. Comparison of transformations for single-cell RNA-seq data.
    Nat Methods (2023). https://doi.org/10.1038/s41592-023-01814-1

    The size factor of a cell is its total count divided by the mean total count
    over all cells. `adata` is modified in place (`X` and `uns['log1p']`) and
    also returned.

    Parameters
    ----------
    adata : object exposing `X` (scipy sparse or dense, cells x genes) and a
        dict-like `uns`.
    y0 : pseudo-count added before taking the logarithm.

    Notes
    -----
    For sparse `X` only the stored entries are transformed, so implicit zeros
    stay 0. This equals log(0 + y0) only for the default `y0=1`.
    """
    # np.asarray(...).ravel() handles np.matrix (sparse matrices) as well as
    # plain ndarrays (dense input / sparse arrays), unlike the `.A1` attribute.
    counts_per_cell = np.asarray(adata.X.sum(axis=1)).ravel()
    target_sum = np.mean(counts_per_cell)
    print(f'Mean shift logarithm normalization with normalization target count {target_sum}')
    size_factors = counts_per_cell / target_sum
    # Cells without any counts have nothing to rescale; a factor of 1 avoids
    # dividing by zero (which would turn explicitly stored zeros into NaN).
    size_factors[size_factors == 0] = 1

    if issparse(adata.X):
        # Left-multiplying with a diagonal matrix scales every row (cell).
        adata.X = diags(1 / size_factors).dot(adata.X)
        adata.X.data = np.log(adata.X.data + y0)
    else:
        adata.X = np.log(np.asarray(adata.X) / size_factors[:, None] + y0)
    # Same marker that scanpy's log1p leaves behind (natural logarithm).
    adata.uns["log1p"] = {"base": None}

    return adata

In [None]:
def filtergenes(adata, pct=0.01):
    """
    Drop genes that are expressed in too few cells.

    A gene is kept when the number of cells with a stored (non-zero) entry for
    it is at least `pct` times the number of cells (1% by default). A short
    summary of how many genes are removed is printed.

    Parameters
    ----------
    adata : cells x genes object whose `X` provides `getnnz` (scipy sparse).
    pct : minimal fraction of cells in which a gene must be present.

    Returns
    -------
    A copy of `adata` restricted to the retained genes.
    """
    nr_cells, nr_genes = adata.shape
    cells_per_gene = adata.X.getnnz(axis=0)
    # Negative margin: the gene is present in fewer cells than required.
    margin = cells_per_gene - nr_cells * pct

    nr_dropped = np.sum(margin < 0)
    pct_dropped = np.round(nr_dropped / nr_genes * 100, decimals=2)
    print(f'Filtering {nr_dropped} of {nr_genes} genes'
          f'({pct_dropped}%).')

    keep_mask = margin >= 0
    return adata[:, keep_mask].copy()

In [None]:
def preprocess_data(adata, filter_genes=True, shift_method='mean'):
    """
    Optionally filter genes, then normalize and log-transform `adata`.

    Parameters
    ----------
    adata : annotated data object to preprocess.
    filter_genes : if True, run `filtergenes` first.
    shift_method : normalization variant:
        'mean'   -> `shifted_transformation` (mean total count as target),
        'median' -> `sc.pp.normalize_total` with its default target + `sc.pp.log1p`,
        'CP10k'  -> `sc.pp.normalize_total` with `target_sum=1e4` + `sc.pp.log1p`.

    Returns
    -------
    The preprocessed object.

    Raises
    ------
    ValueError
        If `shift_method` is none of the values above.
    """
    if filter_genes:
        # Genes are filtered relative to the number of cells in the data, so
        # the filter has to be re-applied whenever cells were removed before.
        adata = filtergenes(adata)

    if shift_method == 'mean':
        return shifted_transformation(adata)

    if shift_method not in ('median', 'CP10k'):
        raise ValueError('Unknown shift transformation method! Can choose between mean, median, CP10k.')

    # Both remaining variants only differ in the normalization target.
    if shift_method == 'CP10k':
        sc.pp.normalize_total(adata, target_sum=1e4)
    else:
        sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    adata.uns['log1p']['base'] = None

    return adata

In [None]:
def preprocess_dataset(adata, filter_genes=True, shift_method='mean', sample_based=False, sample_col='sample_id'):
    """
    Preprocess a complete dataset, either as a whole or sample by sample.

    Steps:
      1. Remove cells whose `obs.malignant_key` is 'undecided'.
      2. Keep the current `X` in `layers["counts"]`.
      3. Run `preprocess_data` on the entire dataset, or on every sample
         separately followed by concatenating the samples again.
      4. Cast the `var` columns `mt` and `cnv_called` to bool.

    Parameters
    ----------
    adata : annotated data object with `obs.malignant_key`, `var.mt` and
        `var.cnv_called`.
    filter_genes, shift_method : forwarded to `preprocess_data`.
    sample_based : if True, preprocess each group of `sample_col` on its own.
    sample_col : `obs` column identifying the samples.

    Returns
    -------
    The preprocessed dataset.

    Notes
    -----
    In sample-based mode, samples are concatenated with an outer join over the
    genes, so the result holds every gene retained in at least one sample.
    NOTE(review): the concatenation does not merge `.uns` by default, so
    `uns['log1p']` set per sample is presumably absent from the sample-based
    result - confirm whether downstream code relies on it.
    """
    # print input configuration
    print(f'filter_genes={filter_genes}, shift_method={shift_method}, sample_based={sample_based}')
    # remove cells that were undecided in malignancy from CanSig pipeline
    adata = adata[adata.obs.malignant_key != 'undecided', :].copy()
    adata.obs['malignant_key'] = adata.obs['malignant_key'].astype('category')

    # Store a copy: the layer would otherwise share its buffer with `X`, and
    # in-place normalization / log1p would silently overwrite the raw values.
    adata.layers["counts"] = adata.X.copy()

    if sample_based:
        # observed=True skips categories of `sample_col` without any cell.
        adatas = {
            sample: adata[sample_obs.index, ].copy()
            for sample, sample_obs in adata.obs.groupby(sample_col, observed=True)
        }
        del adata
        for key, curr_adata in adatas.items():
            adatas[key] = preprocess_data(curr_adata, filter_genes, shift_method)

        adata = sc.concat(list(adatas.values()), join='outer', merge='first')
        del adatas
    else:
        adata = preprocess_data(adata, filter_genes, shift_method)

    adata.var['mt'] = adata.var['mt'].astype(bool)
    adata.var['cnv_called'] = adata.var['cnv_called'].astype(bool)

    return adata

### Global variables

In [None]:
# Normalization method, passed to `preprocess_dataset` as `shift_method`.
# Valid values: 'mean', 'median' or 'CP10k'.
norm_method = 'CP10k'
# True: preprocess every sample separately; False: computation on entire dataset.
sample_based = True

# Suffix of the output file names that identifies the chosen configuration.
_method_tags = {'mean': '', 'median': '_med', 'CP10k': '_cp10k'}
if norm_method not in _method_tags:
    # Fail here instead of writing to the 'mean' file name and failing later
    # during preprocessing.
    raise ValueError(f"Unknown norm_method {norm_method!r}! Can choose between mean, median, CP10k.")
appendix = _method_tags[norm_method] + ('_per_sid' if sample_based else '')

print(appendix)

##  ESCC
The esophageal dataset published by Zhang et al. [1] was first preprocessed with the CanSig pipeline [2]. We further filter genes (a gene needs to be expressed in at least 1% of the cells) and normalize the data with the shifted logarithm as described by Ahlmann-Eltze and Huber [3] and implemented in this post [4].   


[1] Zhang, X., Peng, L., Luo, Y. et al. Dissecting esophageal squamous-cell carcinoma ecosystem by single-cell transcriptomic analysis. Nat Commun 12, 5291 (2021). https://doi.org/10.1038/s41467-021-25539-x

[2] CanSig: Discovering de novo shared transcriptional programs in single cancer cells
Josephine Yates, Florian Barkmann, Paweł Czyż, Marc Glettig, Frederieke Lohmann, Richard von der Horst, Elia Saquand, Nicolas Volken, Agnieszka Kraft, Valentina Boeva, bioRxiv 2022.04.14.488324; doi: https://doi.org/10.1101/2022.04.14.488324 

[3] Ahlmann-Eltze, C., Huber, W. Comparison of transformations for single-cell RNA-seq data. Nat Methods (2023). https://doi.org/10.1038/s41592-023-01814-1

[4] https://twitter.com/Sanbomics/status/1647654042749874177?s=20

In [None]:
# Input file (CanSig-processed ESCC data) and output file; the output name
# carries the configuration suffix `appendix`.
fn_escc = os.path.join(BASE_PATH_DATA, 'cansig_processed/escc.h5ad')
output_escc = os.path.join(BASE_PATH_DATA, f'preprocessed/pp_escc{appendix}.h5ad')
# Show the resulting output path.
output_escc

In [None]:
# Load the ESCC dataset from the h5ad input file.
escc_adata = sc.read_h5ad(fn_escc)

In [None]:
# Number of cells per malignancy label before preprocessing.
escc_adata.obs.malignant_key.value_counts()

In [None]:
# Display the summary of the loaded object.
escc_adata

In [None]:
# Preprocess with the globally configured normalization method and mode.
escc_adata = preprocess_dataset(escc_adata, shift_method=norm_method, sample_based=sample_based)

In [None]:
# Store the preprocessed ESCC dataset at the output path.
escc_adata.write(output_escc)

In [None]:
# Remove the dataset and its paths from the namespace (presumably to release
# memory before the next dataset is loaded).
del escc_adata, fn_escc, output_escc

##  CRC
The colorectal cancer dataset published by Pelka et al. [1] was first preprocessed with the CanSig pipeline [2]. We further filter genes (a gene needs to be expressed in at least 1% of the cells) and normalize the data with the shifted logarithm as described by Ahlmann-Eltze and Huber [3] and implemented in this post [4].  


[1] Karin Pelka, Matan Hofree, Jonathan H. Chen, Siranush Sarkizova, Joshua D. Pirl, Vjola Jorgji, Alborz Bejnood, Danielle Dionne, William H. Ge, Katherine H. Xu, Sherry X. Chao, Daniel R. Zollinger, David J. Lieb, Jason W. Reeves, Christopher A. Fuhrman, Margaret L. Hoang, Toni Delorey, Lan T. Nguyen, Julia Waldman, Max Klapholz, Isaac Wakiro, Ofir Cohen, Julian Albers, Christopher S. Smillie, Michael S. Cuoco, Jingyi Wu, Mei-ju Su, Jason Yeung, Brinda Vijaykumar, Angela M. Magnuson, Natasha Asinovski, Tabea Moll, Max N. Goder-Reiser, Anise S. Applebaum, Lauren K. Brais, Laura K. DelloStritto, Sarah L. Denning, Susannah T. Phillips, Emma K. Hill, Julia K. Meehan, Dennie T. Frederick, Tatyana Sharova, Abhay Kanodia, Ellen Z. Todres, Judit Jané-Valbuena, Moshe Biton, Benjamin Izar, Conner D. Lambden, Thomas E. Clancy, Ronald Bleday, Nelya Melnitchouk, Jennifer Irani, Hiroko Kunitake, David L. Berger, Amitabh Srivastava, Jason L. Hornick, Shuji Ogino, Asaf Rotem, Sébastien Vigneau, Bruce E. Johnson, Ryan B. Corcoran, Arlene H. Sharpe, Vijay K. Kuchroo, Kimmie Ng, Marios Giannakis, Linda T. Nieman, Genevieve M. Boland, Andrew J. Aguirre, Ana C. Anderson, Orit Rozenblatt-Rosen, Aviv Regev, Nir Hacohen,
Spatially organized multicellular immune hubs in human colorectal cancer, Cell, Volume 184, Issue 18, 2021, Pages 4734-4752.e20, ISSN 0092-8674, https://doi.org/10.1016/j.cell.2021.08.003 (https://www.sciencedirect.com/science/article/pii/S0092867421009454)

[2] CanSig: Discovering de novo shared transcriptional programs in single cancer cells
Josephine Yates, Florian Barkmann, Paweł Czyż, Marc Glettig, Frederieke Lohmann, Richard von der Horst, Elia Saquand, Nicolas Volken, Agnieszka Kraft, Valentina Boeva, bioRxiv 2022.04.14.488324; doi: https://doi.org/10.1101/2022.04.14.488324 

[3] Ahlmann-Eltze, C., Huber, W. Comparison of transformations for single-cell RNA-seq data. Nat Methods (2023). https://doi.org/10.1038/s41592-023-01814-1

[4] https://twitter.com/Sanbomics/status/1647654042749874177?s=20

In [None]:
# Input file (CanSig-processed CRC data) and output file; the output name
# carries the configuration suffix `appendix`.
fn_crc = os.path.join(BASE_PATH_DATA, 'cansig_processed/crc.h5ad')
output_crc = os.path.join(BASE_PATH_DATA, f'preprocessed/pp_crc{appendix}.h5ad')

In [None]:
# Load the CRC dataset from the h5ad input file.
crc_adata = sc.read_h5ad(fn_crc)

In [None]:
# Number of cells per malignancy label before preprocessing.
crc_adata.obs.malignant_key.value_counts()

In [None]:
# Preprocess with the globally configured normalization method and mode.
crc_adata = preprocess_dataset(crc_adata, shift_method=norm_method, sample_based=sample_based)

In [None]:
# Store the preprocessed CRC dataset at the output path.
crc_adata.write(output_crc)

In [None]:
# Remove the dataset and its paths from the namespace (presumably to release
# memory before the next dataset is loaded).
del crc_adata, fn_crc, output_crc

##  Breast cancer
The breast cancer dataset published by Wu et al. [1] was first preprocessed with the CanSig pipeline [2]. We further filter genes (a gene needs to be expressed in at least 1% of the cells) and normalize the data with the shifted logarithm as described by Ahlmann-Eltze and Huber [3] and implemented in this post [4].   


[1] Wu, S.Z., Al-Eryani, G., Roden, D.L. et al. A single-cell and spatially resolved atlas of human breast cancers. Nat Genet 53, 1334–1347 (2021). https://doi.org/10.1038/s41588-021-00911-1

[2] CanSig: Discovering de novo shared transcriptional programs in single cancer cells
Josephine Yates, Florian Barkmann, Paweł Czyż, Marc Glettig, Frederieke Lohmann, Richard von der Horst, Elia Saquand, Nicolas Volken, Agnieszka Kraft, Valentina Boeva, bioRxiv 2022.04.14.488324; doi: https://doi.org/10.1101/2022.04.14.488324 

[3] Ahlmann-Eltze, C., Huber, W. Comparison of transformations for single-cell RNA-seq data. Nat Methods (2023). https://doi.org/10.1038/s41592-023-01814-1

[4] https://twitter.com/Sanbomics/status/1647654042749874177?s=20

In [None]:
# Input file (CanSig-processed breast cancer data) and output file; the output
# name carries the configuration suffix `appendix`.
fn_breast = os.path.join(BASE_PATH_DATA, 'cansig_processed/breast.h5ad')
output_breast = os.path.join(BASE_PATH_DATA, f'preprocessed/pp_breast{appendix}.h5ad')

In [None]:
# Load the breast cancer dataset from the h5ad input file.
breast_adata = sc.read_h5ad(fn_breast)

In [None]:
# Number of cells per malignancy label before preprocessing.
breast_adata.obs.malignant_key.value_counts()

In [None]:
# Preprocess with the globally configured normalization method and mode.
breast_adata = preprocess_dataset(breast_adata, shift_method=norm_method, sample_based=sample_based)

In [None]:
# Store the preprocessed breast cancer dataset at the output path.
breast_adata.write(output_breast)

In [None]:
# Remove the dataset and its paths from the namespace (presumably to release
# memory before the next dataset is loaded).
del breast_adata, fn_breast, output_breast

##  LUAD
The lung adenocarcinoma dataset published by Kim et al. [1] was first preprocessed with the CanSig pipeline [2]. We further filter genes (a gene needs to be expressed in at least 1% of the cells) and normalize the data with the shifted logarithm as described by Ahlmann-Eltze and Huber [3] and implemented in this post [4].   


[1] Kim, N., Kim, H.K., Lee, K. et al. Single-cell RNA sequencing demonstrates the molecular and cellular reprogramming of metastatic lung adenocarcinoma. Nat Commun 11, 2285 (2020). https://doi.org/10.1038/s41467-020-16164-1

[2] CanSig: Discovering de novo shared transcriptional programs in single cancer cells
Josephine Yates, Florian Barkmann, Paweł Czyż, Marc Glettig, Frederieke Lohmann, Richard von der Horst, Elia Saquand, Nicolas Volken, Agnieszka Kraft, Valentina Boeva, bioRxiv 2022.04.14.488324; doi: https://doi.org/10.1101/2022.04.14.488324 

[3] Ahlmann-Eltze, C., Huber, W. Comparison of transformations for single-cell RNA-seq data. Nat Methods (2023). https://doi.org/10.1038/s41592-023-01814-1

[4] https://twitter.com/Sanbomics/status/1647654042749874177?s=20

In [None]:
# Input file (CanSig-processed LUAD data) and output file; the output name
# carries the configuration suffix `appendix`.
fn_luad = os.path.join(BASE_PATH_DATA, 'cansig_processed/luad.h5ad')
output_luad = os.path.join(BASE_PATH_DATA, f'preprocessed/pp_luad{appendix}.h5ad')

In [None]:
# Load the LUAD dataset from the h5ad input file.
luad_adata = sc.read_h5ad(fn_luad)

In [None]:
# Number of cells per malignancy label before preprocessing.
luad_adata.obs.malignant_key.value_counts()

In [None]:
# Number of cells per value of the `dataset` annotation.
luad_adata.obs.dataset.value_counts()

In [None]:
# Keep only the cells annotated with dataset 'Kim_Lee_2020'.
luad_adata = luad_adata[luad_adata.obs.dataset=='Kim_Lee_2020'].copy()

In [None]:
# Preprocess with the globally configured normalization method and mode.
luad_adata = preprocess_dataset(luad_adata, shift_method=norm_method, sample_based=sample_based)

In [None]:
# Cast two further LUAD gene annotations to bool.
# NOTE(review): presumably needed so the columns can be written to h5ad - confirm.
luad_adata.var.mito = luad_adata.var.mito.astype(bool)
luad_adata.var.feature_is_filtered = luad_adata.var.feature_is_filtered.astype(bool)

In [None]:
# Store the preprocessed LUAD dataset at the output path.
luad_adata.write(output_luad)

In [None]:
# Remove the dataset and its paths from the namespace (presumably to release memory).
del luad_adata, fn_luad, output_luad