In [None]:
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np



  from .autonotebook import tqdm as notebook_tqdm


### Dataset Preparation: Lung Cancer Cells from Chan et al., 2021
This notebook processes a single-cell RNA-seq dataset from Chan et al., 2021. The goal is to extract malignant cells from primary tumor samples and reformat metadata to match the structure used in other datasets for downstream integration.

https://www.cell.com/cancer-cell/fulltext/S1535-6108(21)00497-9#sec-4-5-6-2

In [None]:
# Load UMI count matrix in sparse format (default COO)
matrix = scipy.io.mmread("./Data_Chan2021_Lung/Exp_data_UMIcounts.mtx")

# Load gene list
genes = open("./Data_Chan2021_Lung/Genes.txt").read().splitlines()

# Load cell-level metadata
cells_metadata = pd.read_csv("./Data_Chan2021_Lung/Cells.csv", index_col=0)

# Load sample-level metadata
samples_metadata = pd.read_csv("./Data_Chan2021_Lung/Samples.csv", index_col=0)


In [None]:
# Create AnnData object
adata = sc.AnnData(X=matrix.T)

# Add gene names
adata.var_names = genes

# Add cell metadata
adata.obs = cells_metadata

# Merge sample metadata based on the "sample" column
adata.obs = adata.obs.merge(samples_metadata, how="left", on="sample")




 Форматы разреженных матриц в `scipy.sparse` (такие как `COO`, `CSR` и `CSC`) имеют разные преимущества и используются для разных задач. Вот краткое объяснение, когда и какой формат лучше использовать:

- **Для фильтрации данных** (например, выбор подмножества клеток или генов) лучше использовать **CSR** (если фильтрация идет по строкам) или **CSC** (если фильтрация идет по столбцам).
- **Для вычислений** (например, PCA, кластеризация или другие матричные операции) также лучше использовать **CSR** или **CSC**, так как они оптимизированы для таких задач.
- **Для создания или загрузки данных** можно использовать **COO**, а затем преобразовать в **CSR** или **CSC** для дальнейшей работы.


Мы загрузили данные в формате COO при сборке объекта, но надо перевести в CSR

In [None]:
# Convert sparse matrix to CSR format for compatibility and efficiency
adata.X = adata.X.tocsr()
print(type(adata.X))

<class 'scipy.sparse._csr.csr_matrix'>


In [None]:
# Check for normalization or log-transformation (values should be raw)
print(adata.X.min(), adata.X.max())

0 17013


We intentionally skip normalization and log-transformation at this stage,
because raw counts are required for integration with other datasets.
Since gene sets differ between datasets, it's preferable to merge them first
and perform normalization afterward, once the gene space is unified.

In [24]:
adata.obs.head()

Unnamed: 0,sample,patient_x,cell_type,cell_subtype,complexity,umap1,umap2,g1s_score,g2m_score,cell_cycle_phase,...,technology,n_cells,patient_y,cancer_type,sample_type,source_y,treatment_y,procedure_y,sample_primary_met,diagnosis_recurrence
0,RU1215,RU1215,Malignant,SCLC-N,3719,-21.2406,15.0462,0.0188,-0.043,Not cycling,...,10x,3843,RU1215,Small Cell Lung Cancer,Pleural Fluid,pleural_effusion,Naive,Thoracentesis,Metastasis,
1,RU1057_T,RU1057,Endothelial,Endothelial,2293,9.8559,26.6865,0.0323,0.0548,Not cycling,...,10x,2413,RU1057,Lung Adenocarcinoma,Tumor,lung,Naive,Resection,Primary,
2,RU1152,RU1152,Malignant,SCLC-A,5093,-7.9765,-34.9137,0.3126,0.1707,Not cycling,...,10x,1926,RU1152,Small Cell Lung Cancer,Lymph Node,LN,Naive,Biopsy,Metastasis,
3,PleuralEffusion,PleuralEffusion,T_cell,T_cell,1011,15.4183,-11.1212,0.1049,-0.0144,Not cycling,...,10x,2068,PleuralEffusion,Small Cell Lung Cancer,Pleural Fluid,pleural_effusion,"Platinum Doublet,Immunotherapy,TMZ,Other chemo...",Thoracentesis,Metastasis,
4,RU1128,RU1128,T_cell,T_cell,1386,15.3358,-1.6981,-0.0452,0.0633,Not cycling,...,10x,1373,RU1128,Lung Adenocarcinoma,Tumor,lung,Naive,Resection,Primary,


Check cell types and sample types

In [None]:
adata.obs["cell_subtype"].unique()

array(['SCLC-N', 'Endothelial', 'SCLC-A', 'T_cell', 'SCLC-P', 'Basal',
       'Macrophage', 'Dendritic', 'Ciliated', 'Ionocyte', 'Mast',
       'B_cell', 'Fibroblast', 'AE1', 'AEP', 'Neuroendocrine',
       'Hepatocyte', 'Plasma', 'NSCLC', 'Mucinous', 'Club', 'Neutrophil',
       'Tuft'], dtype=object)

In [26]:
adata.obs["cell_type"].unique()

array(['Malignant', 'Endothelial', 'T_cell', 'Epithelial', 'Macrophage',
       'Dendritic', 'Mast', 'B_cell', 'Fibroblast', 'Plasma',
       'Neutrophil'], dtype=object)

In [None]:
adata.obs["sample_type"].unique()

array(['Pleural Fluid', 'Tumor', 'Lymph Node'], dtype=object)

In [None]:
# Select only malignant cells from primary tumor samples
adata_tumor_cells = adata[(adata.obs["cell_type"] == "Malignant") & (adata.obs["sample_type"] == "Tumor")]

In [30]:
adata_tumor_cells.obs["cell_type"].unique()

array(['Malignant'], dtype=object)

In [31]:
adata_tumor_cells.obs["cell_subtype"].unique() 

# все, остались только нужные нам клетки

array(['SCLC-N', 'SCLC-A', 'NSCLC', 'SCLC-P'], dtype=object)

In [32]:
adata_tumor_cells.obs.head()

Unnamed: 0,sample,patient_x,cell_type,cell_subtype,complexity,umap1,umap2,g1s_score,g2m_score,cell_cycle_phase,...,technology,n_cells,patient_y,cancer_type,sample_type,source_y,treatment_y,procedure_y,sample_primary_met,diagnosis_recurrence
7,RU1181B,RU1181,Malignant,SCLC-N,7295,-12.8784,-20.7759,1.3639,0.0213,G1/S,...,10x,1871,RU1181B,Small Cell Lung Cancer,Tumor,lung,Platinum Doublet,Biopsy,Primary,
8,RU1108a_RPMI,RU1108a,Malignant,SCLC-A,4009,-1.6692,-7.1275,-0.1177,0.0123,Not cycling,...,10x,3852,RU1108a,Small Cell Lung Cancer,Tumor,lung,"Platinum Doublet,PARP inhibitor,TMZ",Resection,Primary,Recurrence
16,RU1145,RU1145,Malignant,SCLC-A,5168,-26.7587,-14.5602,1.147,1.9605,G2/M,...,10x,3866,RU1145,Small Cell Lung Cancer,Tumor,lung,Naive,Resection,Primary,
21,RU1108a_Bambanker,RU1108a,Malignant,SCLC-A,3349,-4.1274,-3.9164,-0.0899,0.0263,Not cycling,...,10x,2773,RU1108a,Small Cell Lung Cancer,Tumor,lung,"Platinum Doublet,PARP inhibitor,TMZ",Resection,Primary,Recurrence
23,RU1066,RU1066,Malignant,SCLC-A,3780,-9.8286,-11.7148,1.8767,0.1344,G1/S,...,10x,3121,RU1066,Small Cell Lung Cancer,Tumor,lung,Naive,Resection,Primary,


In [33]:
adata_tumor_cells.obs["sample_type"].unique()

array(['Tumor'], dtype=object)

In [None]:
# Select required columns for standard metadata
required_columns = ['sample', 'patient_x', 'cell_subtype', "cancer_type"]
adata_tumor_cells.obs = adata_tumor_cells.obs[required_columns]

AnnData expects .obs.index to contain strings, but got values like:
    [7, 8, 16, 21, 23]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


In [None]:
# Construct standardized metadata structure
obs_df = pd.DataFrame(adata_tumor_cells.obs)

obs_df['Dataset'] = 'LUNG_SCLC_LUAD_Chan_2021'
obs_df['Organ_origin'] = 'Lung cancer'
obs_df['Sample'] = obs_df['sample']
obs_df['Patient'] = obs_df['patient_x']
obs_df['Tissue'] = "Tumor"
obs_df['Cancer type'] = obs_df['cancer_type']
obs_df['cnv_status'] = 'tumor'
obs_df['Celltype'] = obs_df['cell_subtype']

In [37]:
adata_tumor_cells.obs = obs_df

AnnData expects .obs.index to contain strings, but got values like:
    [7, 8, 16, 21, 23]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


In [None]:
# Reorder columns
desired_order = ['Dataset', 'Organ_origin', 'Sample', 'Patient', 'Tissue', 'Cancer type', 'cnv_status', 'Celltype']
adata_tumor_cells.obs = adata_tumor_cells.obs[desired_order]
adata_tumor_cells.obs

AnnData expects .obs.index to contain strings, but got values like:
    [7, 8, 16, 21, 23]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


Unnamed: 0,Dataset,Organ_origin,Sample,Patient,Tissue,Cancer type,cnv_status,Celltype
7,LUNG_SCLC_LUAD_Chan_2021,Lung cancer,RU1181B,RU1181,Tumor,Small Cell Lung Cancer,tumor,SCLC-N
8,LUNG_SCLC_LUAD_Chan_2021,Lung cancer,RU1108a_RPMI,RU1108a,Tumor,Small Cell Lung Cancer,tumor,SCLC-A
16,LUNG_SCLC_LUAD_Chan_2021,Lung cancer,RU1145,RU1145,Tumor,Small Cell Lung Cancer,tumor,SCLC-A
21,LUNG_SCLC_LUAD_Chan_2021,Lung cancer,RU1108a_Bambanker,RU1108a,Tumor,Small Cell Lung Cancer,tumor,SCLC-A
23,LUNG_SCLC_LUAD_Chan_2021,Lung cancer,RU1066,RU1066,Tumor,Small Cell Lung Cancer,tumor,SCLC-A
...,...,...,...,...,...,...,...,...
86641,LUNG_SCLC_LUAD_Chan_2021,Lung cancer,RU1108a_Bambanker,RU1108a,Tumor,Small Cell Lung Cancer,tumor,SCLC-A
86643,LUNG_SCLC_LUAD_Chan_2021,Lung cancer,RU1108a_Bambanker_Frozen,RU1108a,Tumor,Small Cell Lung Cancer,tumor,SCLC-A
86653,LUNG_SCLC_LUAD_Chan_2021,Lung cancer,RU1293A,RU1293A,Tumor,Small Cell Lung Cancer,tumor,SCLC-N
86659,LUNG_SCLC_LUAD_Chan_2021,Lung cancer,RU1108a_Bambanker_Frozen,RU1108a,Tumor,Small Cell Lung Cancer,tumor,SCLC-A


In [None]:
# Clear unnecessary AnnData layers
adata_tumor_cells.uns.clear()
adata_tumor_cells

AnnData object with n_obs × n_vars = 32591 × 26036
    obs: 'Dataset', 'Organ_origin', 'Sample', 'Patient', 'Tissue', 'Cancer type', 'cnv_status', 'Celltype'

In [None]:
# Save processed dataset to .h5ad
adata_tumor_cells.write("LUNG_SCLC_LUAD_Chan_2021_filtered.h5ad")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[key] = c
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[key] = c
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[key] = c
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:/