In [None]:
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import harmonypy as hm

### Dataset Preparation: LUAD and LUSC Tumor Cells from GSE148071
This notebook processes a single-cell RNA-seq dataset of lung tumor cells from GSE148071. The goal is to extract LUAD and LUSC cells, clean up the metadata, harmonize it with other datasets, and save the result in a standardized format for integration.


In [None]:
# Read the annotated GSE148071 .h5ad file and label every cell with the
# dataset it came from.
# NOTE(review): the input path is absolute and machine-specific; consider
# moving it to a configurable data directory.
GSE148071_H5AD = "/home/patskanivan/data/GSE148071_tumor_cells_cancer_type_annotated_240425.h5ad"
GSE148071_LABEL = "LUNG_cancer_GSE148071"

adata_lung = sc.read_h5ad(GSE148071_H5AD)
adata_lung.obs["dataset"] = GSE148071_LABEL



##### Check normalization status by inspecting value range
To understand the preprocessing status of the dataset:
1. Print the minimum and maximum values in `adata.X`
2. Attempt to reverse the log-normalization and evaluate total expression per cell

In [3]:
# Value range of the expression matrix; only the two scalars are kept so no
# extra reference to the matrix itself is held.
x_min, x_max = adata_lung.X.min(), adata_lung.X.max()
print(x_min, x_max)

0.0 8.814539


In [4]:
# Dimensions of the loaded object as (n_obs, n_vars).
(adata_lung.n_obs, adata_lung.n_vars)

(50099, 29527)

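Next, retain only the columns of `adata_lung.obs` that are needed to bring this dataset into the same format as the other datasets.
The harmonized format needs a donor ID (donor_id), a sample ID (sample), a cell type (cell_type), and a study label (study, used as a batch indicator). This dataset has no columns with those exact names, so they are taken from `Sample`, `Cancer_type`, and `Dataset`/`dataset`.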

In [7]:
# Show the AnnData summary to see which obs/var/uns/obsm/varm/obsp fields exist.
adata_lung

AnnData object with n_obs × n_vars = 50099 × 29527
    obs: 'Sample', 'Dataset', 'leiden', 'celltypist_majority_label', 'cnv_leiden', 'cnv_score', 'cnv_score_highlight', 'cnv_score_binary', 'cnv_status', 'Cancer_type', 'dataset'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'gene_name', 'chromosome', 'start', 'end'
    uns: 'Cancer_type_colors', 'Sample_colors', 'celltypist_majority_label_colors', 'cnv', 'cnv_leiden', 'cnv_leiden_colors', 'cnv_neighbors', 'cnv_score_binary_colors', 'cnv_status_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_cnv', 'X_cnv_pca', 'X_cnv_umap', 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    obsp: 'cnv_neighbors_connectivities', 'cnv_neighbors_distances', 'connectivities', 'distances'

In [8]:
# Preview the four metadata columns that will be kept.
preview_columns = ["Sample", "Dataset", "Cancer_type", "dataset"]
adata_lung.obs.loc[:, preview_columns]

Unnamed: 0,Sample,Dataset,Cancer_type,dataset
0,GSM4453576_P1,GSE148071,LUSC,LUNG_cancer_GSE148071
1,GSM4453576_P1,GSE148071,LUSC,LUNG_cancer_GSE148071
2,GSM4453576_P1,GSE148071,LUSC,LUNG_cancer_GSE148071
3,GSM4453576_P1,GSE148071,LUSC,LUNG_cancer_GSE148071
4,GSM4453576_P1,GSE148071,LUSC,LUNG_cancer_GSE148071
...,...,...,...,...
50094,GSM4453617_P42,GSE148071,NSCLC,LUNG_cancer_GSE148071
50095,GSM4453617_P42,GSE148071,NSCLC,LUNG_cancer_GSE148071
50096,GSM4453617_P42,GSE148071,NSCLC,LUNG_cancer_GSE148071
50097,GSM4453617_P42,GSE148071,NSCLC,LUNG_cancer_GSE148071


In [None]:
# Drop every obs column except the four that are used further down.
kept_obs_columns = ["Sample", "Dataset", "Cancer_type", "dataset"]
adata_lung.obs = adata_lung.obs.loc[:, kept_obs_columns]

In [None]:
# Keep only the cells whose Cancer_type is LUAD or LUSC. The trailing .copy()
# turns the subset into an independent AnnData object.
is_luad_or_lusc = adata_lung.obs["Cancer_type"].isin(["LUAD", "LUSC"])
adata_lung = adata_lung[is_luad_or_lusc].copy()

In [None]:
# Sanity check after filtering: cells per cancer type, then the first rows
# of the metadata table.
cancer_type_counts = adata_lung.obs["Cancer_type"].value_counts()
print(cancer_type_counts)
print(adata_lung.obs.head())

Cancer_type
LUSC    39190
LUAD    10748
Name: count, dtype: int64
          Sample    Dataset Cancer_type                dataset
0  GSM4453576_P1  GSE148071        LUSC  LUNG_cancer_GSE148071
1  GSM4453576_P1  GSE148071        LUSC  LUNG_cancer_GSE148071
2  GSM4453576_P1  GSE148071        LUSC  LUNG_cancer_GSE148071
3  GSM4453576_P1  GSE148071        LUSC  LUNG_cancer_GSE148071
4  GSM4453576_P1  GSE148071        LUSC  LUNG_cancer_GSE148071


In [None]:
# Distinct sample IDs left after the LUAD/LUSC filter (40 when this was run).
sample_column = adata_lung.obs["Sample"]
sample_column.unique()

['GSM4453576_P1', 'GSM4453577_P2', 'GSM4453578_P3', 'GSM4453579_P4', 'GSM4453580_P5', ..., 'GSM4453612_P37', 'GSM4453613_P38', 'GSM4453614_P39', 'GSM4453615_P40', 'GSM4453616_P41']
Length: 40
Categories (40, object): ['GSM4453576_P1', 'GSM4453577_P2', 'GSM4453578_P3', 'GSM4453579_P4', ..., 'GSM4453613_P38', 'GSM4453614_P39', 'GSM4453615_P40', 'GSM4453616_P41']

In [14]:
# Display the filtered cell metadata before it is rewritten.
adata_lung.obs

Unnamed: 0,Sample,Dataset,Cancer_type,dataset
0,GSM4453576_P1,GSE148071,LUSC,LUNG_cancer_GSE148071
1,GSM4453576_P1,GSE148071,LUSC,LUNG_cancer_GSE148071
2,GSM4453576_P1,GSE148071,LUSC,LUNG_cancer_GSE148071
3,GSM4453576_P1,GSE148071,LUSC,LUNG_cancer_GSE148071
4,GSM4453576_P1,GSE148071,LUSC,LUNG_cancer_GSE148071
...,...,...,...,...
50025,GSM4453616_P41,GSE148071,LUSC,LUNG_cancer_GSE148071
50026,GSM4453616_P41,GSE148071,LUSC,LUNG_cancer_GSE148071
50027,GSM4453616_P41,GSE148071,LUSC,LUNG_cancer_GSE148071
50028,GSM4453616_P41,GSE148071,LUSC,LUNG_cancer_GSE148071


In [None]:
# Build the harmonised per-cell metadata table.
# An explicit copy is taken because pd.DataFrame(existing_frame) does not
# guarantee independent data: depending on the pandas version, columns added
# to the wrapper can show up in adata_lung.obs as well.
obs_df = adata_lung.obs.copy()

# Scalar assignments broadcast the same value to every cell.
obs_df['Dataset'] = 'LUNG_GSE148071_LUAD_LUSC_filtered_epithelial'
obs_df['Organ_origin'] = 'Lung'
# 'Sample' is kept unchanged (the former self-assignment was a no-op).
# NOTE(review): sample IDs are reused as patient IDs; confirm that each
# sample corresponds to exactly one patient.
obs_df['Patient'] = obs_df['Sample']
obs_df['Tissue'] = "Tumor"
obs_df['Cancer type'] = 'Lung Cancer'
obs_df['cnv_status'] = 'tumor'
# The LUAD/LUSC label from 'Cancer_type' is stored as the cell-type column.
obs_df['Celltype'] = obs_df['Cancer_type']

In [16]:
# Replace the AnnData cell metadata with the obs_df table.
adata_lung.obs = obs_df

In [None]:
# Restrict obs to the harmonised columns (this drops 'Cancer_type' and
# 'dataset'), then display the result.
harmonised_columns = [
    "Sample",
    "Dataset",
    "Cancer type",
    "Organ_origin",
    "Patient",
    "Tissue",
    "cnv_status",
    "Celltype",
]
adata_lung.obs = adata_lung.obs.loc[:, harmonised_columns]
adata_lung.obs

Unnamed: 0,Sample,Dataset,Cancer type,Organ_origin,Patient,Tissue,cnv_status,Celltype
0,GSM4453576_P1,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453576_P1,Tumor,tumor,LUSC
1,GSM4453576_P1,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453576_P1,Tumor,tumor,LUSC
2,GSM4453576_P1,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453576_P1,Tumor,tumor,LUSC
3,GSM4453576_P1,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453576_P1,Tumor,tumor,LUSC
4,GSM4453576_P1,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453576_P1,Tumor,tumor,LUSC
...,...,...,...,...,...,...,...,...
50025,GSM4453616_P41,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453616_P41,Tumor,tumor,LUSC
50026,GSM4453616_P41,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453616_P41,Tumor,tumor,LUSC
50027,GSM4453616_P41,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453616_P41,Tumor,tumor,LUSC
50028,GSM4453616_P41,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453616_P41,Tumor,tumor,LUSC


In [18]:
# Display the harmonised metadata table once more before reordering columns.
adata_lung.obs

Unnamed: 0,Sample,Dataset,Cancer type,Organ_origin,Patient,Tissue,cnv_status,Celltype
0,GSM4453576_P1,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453576_P1,Tumor,tumor,LUSC
1,GSM4453576_P1,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453576_P1,Tumor,tumor,LUSC
2,GSM4453576_P1,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453576_P1,Tumor,tumor,LUSC
3,GSM4453576_P1,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453576_P1,Tumor,tumor,LUSC
4,GSM4453576_P1,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453576_P1,Tumor,tumor,LUSC
...,...,...,...,...,...,...,...,...
50025,GSM4453616_P41,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453616_P41,Tumor,tumor,LUSC
50026,GSM4453616_P41,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453616_P41,Tumor,tumor,LUSC
50027,GSM4453616_P41,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453616_P41,Tumor,tumor,LUSC
50028,GSM4453616_P41,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung Cancer,Lung,GSM4453616_P41,Tumor,tumor,LUSC


In [None]:
# Reorder the metadata columns into their final layout and display the table.
desired_order = [
    'Dataset',
    'Organ_origin',
    'Sample',
    'Patient',
    'Tissue',
    'Cancer type',
    'cnv_status',
    'Celltype',
]
adata_lung.obs = adata_lung.obs.loc[:, desired_order]
adata_lung.obs

Unnamed: 0,Dataset,Organ_origin,Sample,Patient,Tissue,Cancer type,cnv_status,Celltype
0,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung,GSM4453576_P1,GSM4453576_P1,Tumor,Lung Cancer,tumor,LUSC
1,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung,GSM4453576_P1,GSM4453576_P1,Tumor,Lung Cancer,tumor,LUSC
2,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung,GSM4453576_P1,GSM4453576_P1,Tumor,Lung Cancer,tumor,LUSC
3,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung,GSM4453576_P1,GSM4453576_P1,Tumor,Lung Cancer,tumor,LUSC
4,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung,GSM4453576_P1,GSM4453576_P1,Tumor,Lung Cancer,tumor,LUSC
...,...,...,...,...,...,...,...,...
50025,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung,GSM4453616_P41,GSM4453616_P41,Tumor,Lung Cancer,tumor,LUSC
50026,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung,GSM4453616_P41,GSM4453616_P41,Tumor,Lung Cancer,tumor,LUSC
50027,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung,GSM4453616_P41,GSM4453616_P41,Tumor,Lung Cancer,tumor,LUSC
50028,LUNG_GSE148071_LUAD_LUSC_filtered_epithelial,Lung,GSM4453616_P41,GSM4453616_P41,Tumor,Lung Cancer,tumor,LUSC


In [20]:
# Show the AnnData summary: obs now holds only the harmonised columns, while
# var/uns/obsm/varm/obsp still carry the original analysis results.
adata_lung

AnnData object with n_obs × n_vars = 49938 × 29527
    obs: 'Dataset', 'Organ_origin', 'Sample', 'Patient', 'Tissue', 'Cancer type', 'cnv_status', 'Celltype'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'gene_name', 'chromosome', 'start', 'end'
    uns: 'Cancer_type_colors', 'Sample_colors', 'celltypist_majority_label_colors', 'cnv', 'cnv_leiden', 'cnv_leiden_colors', 'cnv_neighbors', 'cnv_score_binary_colors', 'cnv_status_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_cnv', 'X_cnv_pca', 'X_cnv_umap', 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    obsp: 'cnv_neighbors_connectivities', 'cnv_neighbors_distances', 'connectivities', 'distances'

In [None]:
# Empty the obsm, obsp, varm and uns containers so that only X, obs and var
# remain on the object (.layers is not touched here).
for container in (adata_lung.obsm, adata_lung.obsp, adata_lung.varm, adata_lung.uns):
    container.clear()

In [22]:
# Remove all var annotation columns; the var index itself is kept.
adata_lung.var = adata_lung.var.drop(columns=adata_lung.var.columns)

In [23]:
# Confirm that only the obs columns remain on the object.
adata_lung

AnnData object with n_obs × n_vars = 49938 × 29527
    obs: 'Dataset', 'Organ_origin', 'Sample', 'Patient', 'Tissue', 'Cancer type', 'cnv_status', 'Celltype'

In [24]:
# Dimensions after filtering and clean-up as (n_obs, n_vars).
(adata_lung.n_obs, adata_lung.n_vars)

(49938, 29527)

In [25]:
# Value range of X just before the log transform is reversed.
log_min, log_max = adata_lung.X.min(), adata_lung.X.max()
print(log_min, log_max)

0.0 8.814539


In [None]:
# Undo the log1p transform: expm1 is its inverse, so X returns to the
# pre-log scale.
# NOTE(review): presumably these are normalised values rather than raw
# counts; confirm before merging.
# NOTE: this cell is not idempotent; running it again applies expm1 twice.
adata_lung.X = np.expm1(adata_lung.X)
linear_min, linear_max = adata_lung.X.min(), adata_lung.X.max()
print(linear_min, linear_max)

0.0 6730.404


In [None]:
# Write the cleaned object to disk in .h5ad format for the later merge step.
output_h5ad = "../data/LUNG_GSE148071_LUAD_LUSC_ready_to_merge.h5ad"
adata_lung.write_h5ad(output_h5ad)