### This notebook subsets the ovarian, skin, and lung cancer datasets to their malignant cells only

In [None]:
from pathlib import Path

import pandas as pd
import scanpy as sc

from constants import BASE_PATH_RAW_CANCER, BASE_PATH_DATA, BASE_PATH_ANNOT_CANCER, BASE_PATH_CANSIG_PP_CANCER
from scipy import sparse

In [None]:
# Toggle read by the `if SAVE:` cells below: when True, each filtered AnnData
# object is written to disk as an .h5ad file.
SAVE = True

#### Ovarian cancer dataset
Vázquez-García, I., Uhlitz, F., Ceglia, N. et al. Ovarian cancer mutational processes drive site-specific immune evasion. Nature 612, 778–786 (2022). https://doi.org/10.1038/s41586-022-05496-1

In [None]:
# Read the ovarian dataset from the raw-data directory and show its shape.
ovarian_h5ad_path = Path(BASE_PATH_RAW_CANCER) / 'OV_Vazquez_10X.h5ad'
adata = sc.read_h5ad(ovarian_h5ad_path)
adata.shape

In [None]:
# Keep only the cells whose `cell_type` annotation equals 'Ovarian.cancer.cell'.
is_cancer_cell = adata.obs['cell_type'] == 'Ovarian.cancer.cell'
adata = adata[is_cancer_cell]
adata.shape

In [None]:
# Number of cells per cluster label (missing labels counted too), sorted by label.
cluster_label_counts = adata.obs['cluster_label'].value_counts(dropna=False)
cluster_label_counts.sort_index()

In [None]:
# Discard cells that have no cluster label.
has_cluster_label = adata.obs['cluster_label'].notna()
adata = adata[has_cluster_label]
adata.shape

In [None]:
# Remove every cell whose cluster label starts with 'Ciliated.cell' and
# materialise the result as an independent copy.
is_ciliated = adata.obs['cluster_label'].str.startswith('Ciliated.cell')
adata = adata[~is_ciliated].copy()
adata.shape

In [None]:
# Store the expression matrix of the filtered ovarian object as a SciPy CSR sparse matrix.
adata.X = sparse.csr_matrix(adata.X)

In [None]:
# Write the malignant-only ovarian subset next to the raw data when saving is enabled.
if SAVE:
    ovarian_out_path = Path(BASE_PATH_RAW_CANCER) / 'ovarian_malignant.h5ad'
    adata.write(ovarian_out_path)

#### Ji et al. 2020 skin dataset
Ji AL, Rubin AJ, Thrane K, Jiang S, Reynolds DL, Meyers RM, Guo MG, George BM, Mollbrink A, Bergenstråhle J, Larsson L, Bai Y, Zhu B, Bhaduri A, Meyers JM, Rovira-Clavé X, Hollmig ST, Aasi SZ, Nolan GP, Lundeberg J, Khavari PA. Multimodal Analysis of Composition and Spatial Architecture in Human Squamous Cell Carcinoma. Cell. 2020 Jul 23;182(2):497-514.e22. doi: 10.1016/j.cell.2020.05.039. Epub 2020 Jun 23. Erratum in: Cell. 2020 Sep 17;182(6):1661-1662. doi: 10.1016/j.cell.2020.08.043. PMID: 32579974; PMCID: PMC7391009.

In [None]:
def load_skin():
    """Load the Ji et al. 2020 skin dataset as an AnnData object.

    Reads the tab-delimited counts file and the tab-delimited metadata file
    from the ``Ji_et_al_2020`` folder under ``BASE_PATH_RAW_CANCER``,
    transposes the counts, copies every metadata column into ``.obs``, and
    drops the first two variables.

    Returns:
        A copy of the transposed AnnData without its first two variables and
        with the metadata columns attached to ``.obs``.
    """
    dataset_dir = Path(BASE_PATH_RAW_CANCER) / 'Ji_et_al_2020'

    counts = sc.read(dataset_dir / 'GSE144236_cSCC_counts.txt', delimiter='\t')
    annotations = pd.read_table(
        dataset_dir / 'GSE144236_patient_metadata_new.txt', delimiter='\t')

    transposed = counts.transpose()
    # NOTE(review): this assignment relies on the index of `annotations` lining up
    # with the index of `.obs` -- confirm the metadata file is indexed by cell name.
    transposed.obs[annotations.columns.to_list()] = annotations.copy()
    # NOTE(review): the first two variables are dropped; presumably they are
    # non-gene rows of the counts file -- confirm against the raw data.
    return transposed[:, 2:].copy()

In [None]:
# Build the skin dataset with `load_skin()`; this rebinds `adata`, replacing the
# ovarian object used in the cells above.
adata = load_skin()

In [None]:
# Keep only the cells whose `level2_celltype` is one of the listed labels.
selected_celltypes = ['TSK', 'Tumor_KC_Basal', 'Tumor_KC_Cyc', 'Tumor_KC_Diff']
is_selected_celltype = adata.obs['level2_celltype'].isin(selected_celltypes)
adata = adata[is_selected_celltype].copy()
adata.shape

In [None]:
# Store the expression matrix of the filtered skin object as a SciPy CSR sparse matrix.
adata.X = sparse.csr_matrix(adata.X)

In [None]:
# Write the malignant-only skin subset next to the raw data when saving is enabled.
if SAVE:
    skin_out_path = Path(BASE_PATH_RAW_CANCER) / 'skin_malignant.h5ad'
    adata.write(skin_out_path)

#### Kim et al. lung dataset
Kim, N., Kim, H.K., Lee, K. et al. Single-cell RNA sequencing demonstrates the molecular and cellular reprogramming of metastatic lung adenocarcinoma. Nat Commun 11, 2285 (2020). https://doi.org/10.1038/s41467-020-16164-1

In [None]:
# Read the Kim et al. lung dataset; this rebinds `adata` to the lung object.
lung_h5ad_path = Path(BASE_PATH_DATA) / 'data_from_florian' / 'data' / 'kim_lung.h5ad'
adata = sc.read_h5ad(lung_h5ad_path)

In [None]:
# Index labels of the lung object's `.obs` as a plain list; used below to pick
# the matching rows out of the cell annotation table.
samples_in_adata = adata.obs.index.to_list()

In [None]:
# Load the cell annotation table, rewrite '_' as '-' in its 'Index' column, use
# that column as the row index, and keep the rows listed in `samples_in_adata`.
annotation_path = Path(BASE_PATH_ANNOT_CANCER) / 'luad_kim' / "GSE131907_Lung_Cancer_cell_annotation.txt"
cell_labels = (
    pd.read_table(annotation_path)
    .assign(Index=lambda table: table['Index'].str.replace('_', '-'))
    .set_index('Index')
    .loc[samples_in_adata]
)

In [None]:
# Column names: `y_true_col` is the annotation column holding the cell subtype
# label; `sample_col` names the sample column.
y_true_col, sample_col = 'Cell_subtype', 'sample'

In [None]:
# Attach the lower-cased subtype label from the annotation table to `.obs`.
lowercase_subtypes = cell_labels[y_true_col].str.lower()
adata.obs[y_true_col] = lowercase_subtypes

In [None]:
# Keep only the cells whose subtype label starts with 'ts'; cells without a label
# are excluded through `na=False`. The boolean mask is applied to `adata` directly,
# as in the other filtering cells of this notebook, instead of first selecting the
# `.obs` rows and then indexing `adata` by their index labels.
is_ts_subtype = adata.obs[y_true_col].str.startswith('ts', na=False)
adata = adata[is_ts_subtype].copy()

In [None]:
# Number of cells per subtype label in the current `adata`.
adata.obs[y_true_col].value_counts()

In [None]:
# Write the filtered lung subset to the CanSig-preprocessed directory when saving is enabled.
if SAVE:
    lung_out_path = Path(BASE_PATH_CANSIG_PP_CANCER) / 'luad_kim_malignant.h5ad'
    adata.write(lung_out_path)