In [1]:
import os
import pandas as pd
import scanpy as sc
import anndata
import gc
import scipy.io
from scipy.sparse import vstack

## Lung

#### 1. Data_Bischoff2021_Lung

In [2]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Lung/Data_Bischoff2021_Lung"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [3]:
adata

AnnData object with n_obs × n_vars = 120961 × 33514
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'source', 'technology', 'n_cells', 'patient_y', 'cancer_type', 'sample_type'

In [6]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score',
            'mp_top_score', 'mp_top', 'mp_assignment', 
            'patient_y', 'sample_type'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [14]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,source,technology,n_cells,cancer_type,study,category
0,AAACCCAAGACGACGT-1_1,p018_N,p018,,,519,,Normal,10x,11173,Lung Adenocarcinoma,Bischoff2021_Lung,Lung
1,AAACCCAAGCACTCTA-1_2,p018_N,p018,Macrophage,Alveolar_Macrophages1,618,,Normal,10x,11173,Lung Adenocarcinoma,Bischoff2021_Lung,Lung
2,AAACCCAAGCTGAAGC-1_3,p018_N,p018,Macrophage,Alveolar_Macrophages1,660,,Normal,10x,11173,Lung Adenocarcinoma,Bischoff2021_Lung,Lung
3,AAACCCACAACTTGGT-1_4,p018_N,p018,Macrophage,Alveolar_Macrophages1,843,,Normal,10x,11173,Lung Adenocarcinoma,Bischoff2021_Lung,Lung
4,AAACCCACAAGAATAC-1_5,p018_N,p018,Macrophage,Alveolar_Macrophages1,1060,Not cycling,Normal,10x,11173,Lung Adenocarcinoma,Bischoff2021_Lung,Lung
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120956,TTTGGTTAGAGTGTTA-1_120957,p034_T,p034,Macrophage,CD14_Macrophages2,5310,G1/S,Tumor,10x,4480,Lung Adenocarcinoma,Bischoff2021_Lung,Lung
120957,TTTGGTTGTCCCTCAT-1_120958,p034_T,p034,T_cell,T_CD8_1,1648,Not cycling,Tumor,10x,4480,Lung Adenocarcinoma,Bischoff2021_Lung,Lung
120958,TTTGTTGCAGTAGAAT-1_120959,p034_T,p034,T_cell,T_CD8_1,1801,Not cycling,Tumor,10x,4480,Lung Adenocarcinoma,Bischoff2021_Lung,Lung
120959,TTTGTTGTCAAAGGAT-1_120960,p034_T,p034,T_cell,T_CD8_1,1499,Not cycling,Tumor,10x,4480,Lung Adenocarcinoma,Bischoff2021_Lung,Lung


In [10]:
adata.obs['study'] = 'Bischoff2021_Lung'

In [13]:
adata.obs['category'] = 'Lung'

In [15]:
output_path = "/home/ubuntu/Downloads/Data_Lung/Data_Bischoff2021_Lung.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Lung/Data_Bischoff2021_Lung.h5ad


#### 2.Data_Chan2021_Lung

In [47]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Lung/Data_Chan2021_Lung"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [48]:
adata

AnnData object with n_obs × n_vars = 86662 × 26036
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'source_x', 'treatment_x', 'procedure_x', 'disease', 'technology', 'n_cells', 'patient_y', 'cancer_type', 'sample_type', 'source_y', 'treatment_y', 'procedure_y', 'sample_primary_met', 'diagnosis_recurrence'

In [49]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'procedure_x', 'disease','diagnosis_recurrence',
            'sample_type', 'source_y', 'treatment_y', 'procedure_y', 'patient_y',
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [19]:
adata.obs['procedure_x'].value_counts()

procedure_x
Resection        46870
Biopsy           33881
Thoracentesis     5911
Name: count, dtype: int64

In [50]:
adata.obs

Unnamed: 0,cell_name,sample,patient_x,cell_type,cell_subtype,complexity,cell_cycle_phase,source_x,treatment_x,technology,n_cells,cancer_type,sample_primary_met
0,RU1215_192110488599350,RU1215,RU1215,Malignant,SCLC-N,3719,Not cycling,pleural_effusion,Naive,10x,3843,Small Cell Lung Cancer,Metastasis
1,RU1057_T_231270811326245,RU1057_T,RU1057,Endothelial,Endothelial,2293,Not cycling,lung,Naive,10x,2413,Lung Adenocarcinoma,Primary
2,RU1152_130751366121844,RU1152,RU1152,Malignant,SCLC-A,5093,Not cycling,LN,Naive,10x,1926,Small Cell Lung Cancer,Metastasis
3,PleuralEffusion_235007433161052,PleuralEffusion,PleuralEffusion,T_cell,T_cell,1011,Not cycling,pleural_effusion,"Platinum Doublet,Immunotherapy,TMZ,Other chemo...",10x,2068,Small Cell Lung Cancer,Metastasis
4,RU1128_169014021282219,RU1128,RU1128,T_cell,T_cell,1386,Not cycling,lung,Naive,10x,1373,Lung Adenocarcinoma,Primary
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86657,RU1128_126765648178078,RU1128,RU1128,Macrophage,Macrophage,2808,Not cycling,lung,Naive,10x,1373,Lung Adenocarcinoma,Primary
86658,RU1057_T_235559336667493,RU1057_T,RU1057,T_cell,T_cell,1062,Not cycling,lung,Naive,10x,2413,Lung Adenocarcinoma,Primary
86659,RU1108a_Bambanker_Frozen_231897696155998,RU1108a_Bambanker_Frozen,RU1108a,Malignant,SCLC-A,3532,Not cycling,lung,"Platinum Doublet,PARP inhibitor,TMZ",10x,4008,Small Cell Lung Cancer,Primary
86660,RU1138_235198540340019,RU1138,RU1138,T_cell,T_cell,1131,Not cycling,liver,"Platinum Doublet,Immunotherapy,TKI",10x,2260,Small Cell Lung Cancer,Metastasis


In [51]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [52]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [53]:
adata.obs = adata.obs.rename(columns={"treatment_x": "treated_naive"})

In [54]:
adata.obs['study'] = 'Chan2021_Lung'

In [55]:
adata.obs['category'] = 'Lung'

In [56]:
output_path = "/home/ubuntu/Downloads/Data_Lung/Data_Chan2021_Lung.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Lung/Data_Chan2021_Lung.h5ad


#### 3.Data_Guo2018_Lung

In [57]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Lung/Data_Guo2018_Lung"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_TPM.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [58]:
adata

AnnData object with n_obs × n_vars = 12346 × 22669
    obs: 'cell_name', 'sample', 'patient_x', 'source_x', 'sorting', 'cell_type', 'cell_subtype', 'cluster', 'complexity', 'patient_y', 'source_y', 'n_cells', 'cancer_type', 'technology', 'age', 'sex', 'stage', 'tnm_stage'

In [59]:
for col in ['sorting',  'stage', 'tnm_stage','patient_y', 'source_y','cluster',
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [60]:
adata.obs

Unnamed: 0,cell_name,sample,patient_x,source_x,cell_type,cell_subtype,complexity,n_cells,cancer_type,technology,age,sex
0,NTH10-0616A,P0616A_N,P0616A,Adjacent normal,T_cell,CD4+,3540,15,Lung Adenocarcinoma,SmartSeq2,74,Female
1,NTH11-0616A,P0616A_N,P0616A,Adjacent normal,T_cell,other,3710,15,Lung Adenocarcinoma,SmartSeq2,74,Female
2,NTH15-0616A,P0616A_N,P0616A,Adjacent normal,T_cell,double-negative,3707,15,Lung Adenocarcinoma,SmartSeq2,74,Female
3,NTH17-0616A,P0616A_N,P0616A,Adjacent normal,T_cell,CD4+,3427,15,Lung Adenocarcinoma,SmartSeq2,74,Female
4,NTH2-0616A,P0616A_N,P0616A,Adjacent normal,T_cell,CD4+,3482,15,Lung Adenocarcinoma,SmartSeq2,74,Female
...,...,...,...,...,...,...,...,...,...,...,...,...
12341,TTY63-20171219,P1219_T,P1219,Tumor,T_cell,double-negative,2471,128,Lung Adenocarcinoma,SmartSeq2,63,Female
12342,TTY65-20171219,P1219_T,P1219,Tumor,T_cell,CD4+,1373,128,Lung Adenocarcinoma,SmartSeq2,63,Female
12343,TTY7-20171219,P1219_T,P1219,Tumor,T_cell,CD4+,1682,128,Lung Adenocarcinoma,SmartSeq2,63,Female
12344,TTY8-20171219,P1219_T,P1219,Tumor,T_cell,CD4+,1725,128,Lung Adenocarcinoma,SmartSeq2,63,Female


In [61]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [62]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [63]:
adata.obs['category'] = 'Lung'

In [64]:
adata.obs['study'] = 'Guo2018_Lung'

In [65]:
output_path = "/home/ubuntu/Downloads/Data_Lung/Data_Guo2018_Lung.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Lung/Data_Guo2018_Lung.h5ad


#### 4. Data_Ireland2020_Lung

In [78]:
# Base path
base_path = "/home/ubuntu/Downloads/Data_Lung/Data_Ireland2020_Lung"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read cell and sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Add cell-level metadata
adata.obs = cells

# Broadcast sample-level metadata to all cells
for col in samples.columns:
    if col != "Unnamed: 0":
        adata.obs[col] = samples.at[0, col]

# Optional: Clean memory
gc.collect()


0

In [83]:
adata

AnnData object with n_obs × n_vars = 2201 × 33538
    obs: 'cell_name', 'complexity', 'cell_cycle_phase', 'sample', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [81]:
adata.obs['technology'].value_counts()

technology
10X    2201
Name: count, dtype: int64

In [82]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score',
            'mp_top_score', 'mp_top', 'mp_assignment', 'source',
            'smoking_status', 'PY', 'diagnosis_recurrence', 
            'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage','size', 
           'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [84]:
adata.obs

Unnamed: 0,cell_name,complexity,cell_cycle_phase,sample,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive
0,AAACCTGAGTCGTTTG-1,2077,,Human_liver_SCLC_met,10X,2201,Human_liver_SCLC_met,SCLC,,,metastatic,met,liver,treated
1,AAACCTGGTCAACATC-1,1911,,Human_liver_SCLC_met,10X,2201,Human_liver_SCLC_met,SCLC,,,metastatic,met,liver,treated
2,AAACCTGTCAAACGGG-1,6966,,Human_liver_SCLC_met,10X,2201,Human_liver_SCLC_met,SCLC,,,metastatic,met,liver,treated
3,AAACGGGAGCGTAATA-1,1385,,Human_liver_SCLC_met,10X,2201,Human_liver_SCLC_met,SCLC,,,metastatic,met,liver,treated
4,AAACGGGGTGGGTATG-1,6834,,Human_liver_SCLC_met,10X,2201,Human_liver_SCLC_met,SCLC,,,metastatic,met,liver,treated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2196,TTTGGTTTCGAGCCCA-1,507,,Human_liver_SCLC_met,10X,2201,Human_liver_SCLC_met,SCLC,,,metastatic,met,liver,treated
2197,TTTGTCAAGGTTACCT-1,1942,,Human_liver_SCLC_met,10X,2201,Human_liver_SCLC_met,SCLC,,,metastatic,met,liver,treated
2198,TTTGTCAAGTACACCT-1,7872,,Human_liver_SCLC_met,10X,2201,Human_liver_SCLC_met,SCLC,,,metastatic,met,liver,treated
2199,TTTGTCAGTCTTGTCC-1,1337,,Human_liver_SCLC_met,10X,2201,Human_liver_SCLC_met,SCLC,,,metastatic,met,liver,treated


In [85]:
adata.obs['category'] = 'Lung'

In [86]:
adata.obs['study'] = 'Ireland2020_Lung'

In [87]:
output_path = "/home/ubuntu/Downloads/Data_Lung/Data_Ireland2020_Lung.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Lung/Data_Ireland2020_Lung.h5ad


#### 5.Kim2020_Lung

In [88]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Lung/Data_Kim2020_Lung"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [90]:
adata

AnnData object with n_obs × n_vars = 32493 × 20793
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'source', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS'

In [91]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'batch',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS',   
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [92]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,source,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive
0,AAACCTGAGACGACGT_LUNG_T34,P0034,B_cell,1287,Not cycling,tLung,10x,2704,P0034,Lung Adenocarcinoma,,,,primary,Lung,
1,AAACCTGAGACTGTAA_LUNG_T34,P0034,Malignant,5804,Not cycling,tLung,10x,2704,P0034,Lung Adenocarcinoma,,,,primary,Lung,
2,AAACCTGAGATGTCGG_LUNG_T31,P0031,Epithelial,1162,Not cycling,tLung,10x,1898,P0031,Lung Adenocarcinoma,,,,primary,Lung,
3,AAACCTGAGCCAGGAT_EBUS_28,P1028,Malignant,1728,Not cycling,tL/B,10x,4445,P1028,Lung Adenocarcinoma,,,metastatic,primary,Lung,
4,AAACCTGAGCGCTTAT_LUNG_T19,P0019,Macrophage,1509,Not cycling,tLung,10x,3127,P0019,Lung Adenocarcinoma,,,,primary,Lung,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32488,TTTGTCATCGGTGTTA_LUNG_T28,P0028,,1027,,tLung,10x,2642,P0028,Lung Adenocarcinoma,,,,primary,Lung,
32489,TTTGTCATCTGGGCCA_EBUS_06,P1006,,1128,,tL/B,10x,1264,P1006,Lung Adenocarcinoma,,,metastatic,primary,Lung,
32490,TTTGTCATCTTACCGC_EBUS_49,P1049,T_cell,1304,Not cycling,tL/B,10x,1458,P1049,Lung Adenocarcinoma,,,metastatic,primary,Lung,
32491,TTTGTCATCTTGCCGT_LUNG_T30,P0030,Malignant,2202,Not cycling,tLung,10x,2111,P0030,Lung Adenocarcinoma,,,,primary,Lung,


In [93]:
adata.obs['study'] = 'Kim2020_Lung'

In [94]:
adata.obs['category'] = 'Lung'

In [95]:
output_path = "/home/ubuntu/Downloads/Data_Lung/Data_Kim2020_Lung.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Lung/Data_Kim2020_Lung.h5ad


#### 6.Laughney2020_Lung

In [96]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Lung/Data_Laughney2020_Lung"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [99]:
adata

AnnData object with n_obs × n_vars = 40505 × 19222
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [98]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'batch',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS',   
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [104]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,source,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,category,study
0,0,RU653,T_cell,Tm,856,,RU653_TUMOR_1AS_UTR,10x,2381,RU653,Lung Adenocarcinoma,,,local,primary,Primary,NAIVE,Lung,Laughney2020_Lung
1,10,RU653,Macrophage,Macrophage,2732,Not cycling,RU653_TUMOR_1AS_UTR,10x,2381,RU653,Lung Adenocarcinoma,,,local,primary,Primary,NAIVE,Lung,Laughney2020_Lung
2,10000,RU653,Dendritic,Dendritic,2012,Not cycling,RU653_TUMOR_1AS_UTR,10x,2381,RU653,Lung Adenocarcinoma,,,local,primary,Primary,NAIVE,Lung,Laughney2020_Lung
3,10001,RU653,T_cell,Tm,646,,RU653_TUMOR_1AS_UTR,10x,2381,RU653,Lung Adenocarcinoma,,,local,primary,Primary,NAIVE,Lung,Laughney2020_Lung
4,10002,RU653,T_cell,Tm,1025,Not cycling,RU653_TUMOR_1AS_UTR,10x,2381,RU653,Lung Adenocarcinoma,,,local,primary,Primary,NAIVE,Lung,Laughney2020_Lung
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40500,9994,RU701,T_cell,Tm,950,,RU701_BRAINMET_4,10x,2546,RU701,Lung Adenocarcinoma,,,metastatic,met,M(Brain),TREATED,Lung,Laughney2020_Lung
40501,9995,RU701,T_cell,Tm,540,,RU701_BRAINMET_4,10x,2546,RU701,Lung Adenocarcinoma,,,metastatic,met,M(Brain),TREATED,Lung,Laughney2020_Lung
40502,9996,RU701,T_cell,Tm,1267,Not cycling,RU701_BRAINMET_4,10x,2546,RU701,Lung Adenocarcinoma,,,metastatic,met,M(Brain),TREATED,Lung,Laughney2020_Lung
40503,9997,RU701,T_cell,Tm,900,,RU701_BRAINMET_4,10x,2546,RU701,Lung Adenocarcinoma,,,metastatic,met,M(Brain),TREATED,Lung,Laughney2020_Lung


In [101]:
adata.obs['category'] = 'Lung'

In [102]:
adata.obs['study'] = 'Laughney2020_Lung'

In [103]:
output_path = "/home/ubuntu/Downloads/Data_Lung/Data_Laughney2020_Lung.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Lung/Data_Laughney2020_Lung.h5ad


#### 7.Maynard2020_Lung

In [105]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Lung/Data_Maynard2020_Lung"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_TPM.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [115]:
adata

AnnData object with n_obs × n_vars = 19777 × 26577
    obs: 'cell_name', 'sample', 'patient', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source', 'n_cells', 'technology', 'cancer_type', 'sex', 'age', 'sample_primary_met', 'site', 'disease_extent', 'treated_naive'

In [114]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67','source_y',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'batch','time_rx_to_sampling',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS',   
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [111]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [112]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [116]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,source,n_cells,technology,cancer_type,sex,age,sample_primary_met,site,disease_extent,treated_naive
0,A1_1001000292,LT_S03,TH156,Dendritic,Dendritic,2648,Not cycling,tumor,193,SmartSeq2,Lung Adenocarcinoma,Male,43.0,Metastatic,Pleura,metastatic,treated
1,A1_1001000295,LT_S03,TH156,Dendritic,Dendritic,1406,Not cycling,tumor,193,SmartSeq2,Lung Adenocarcinoma,Male,43.0,Metastatic,Pleura,metastatic,treated
2,A1_1001000315,LT_S07,TH103,Macrophage,Macrophage,2270,Not cycling,tumor,137,SmartSeq2,Lung Adenocarcinoma,Female,45.0,Metastatic,Pleura,metastatic,treated
3,A1_1001000376,LT_S14,TH153,NK_cell,NK_cell,1834,Not cycling,tumor,62,SmartSeq2,Lung Adenocarcinoma,Female,63.0,Metastatic,Liver,metastatic,treated
4,A1_B000278,LT_S41,TH210,Macrophage,Macrophage,2064,Not cycling,tumor,61,SmartSeq2,Lung Adenocarcinoma,Male,56.0,Metastatic,Adrenal,metastatic,treated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19772,P9_B003658,LT_S50,TH225,Plasma,Plasma,3779,Not cycling,tumor,348,SmartSeq2,Lung Adenocarcinoma,Female,77.0,Metastatic,LN,metastatic,naive
19773,P9_B003771,LT_S71,TH236,Plasma,Plasma,2364,Not cycling,tumor,1007,SmartSeq2,Lung Adenocarcinoma,Female,58.0,Primary,Lung,metastatic,treated
19774,P9_B003894,LT_S71,TH236,Mast,Mast,2286,Not cycling,tumor,1007,SmartSeq2,Lung Adenocarcinoma,Female,58.0,Primary,Lung,metastatic,treated
19775,P9_B003920,LT_S71,TH236,Mast,Mast,2807,Not cycling,tumor,1007,SmartSeq2,Lung Adenocarcinoma,Female,58.0,Primary,Lung,metastatic,treated


In [117]:
adata.obs['category'] = 'Lung'

In [118]:
adata.obs['study'] = 'Maynard2020_Lung'

In [119]:
output_path = "/home/ubuntu/Downloads/Data_Lung/Data_Maynard2020_Lung.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Lung/Data_Maynard2020_Lung.h5ad


#### 8.Qian2020_Lung

In [120]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Lung/Data_Qian2020_Lung"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [123]:
adata

AnnData object with n_obs × n_vars = 27262 × 22276
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source', 'disease', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [125]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'disease',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67','source_y',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'batch','time_rx_to_sampling',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS',   
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [126]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,source,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive
0,BT1238_AACTTGCTACTCTT,1,Malignant,Malignant,2266,Not cycling,core,10x,465,LC_1,Lung Squamous Cell Carcinoma,F,70-75,local,P,,naive
1,BT1238_AAGCCTGATTCCAT,1,T_cell,T_cell,2437,G1/S,core,10x,465,LC_1,Lung Squamous Cell Carcinoma,F,70-75,local,P,,naive
2,BT1238_ACAATAACCGATAC,1,Mast,Mast,1047,Not cycling,core,10x,465,LC_1,Lung Squamous Cell Carcinoma,F,70-75,local,P,,naive
3,BT1238_ACCCTCGATTGGTG,1,T_cell,T_cell,2067,G1/S,core,10x,465,LC_1,Lung Squamous Cell Carcinoma,F,70-75,local,P,,naive
4,BT1238_ACGAAGCTTGGGAG,1,Macrophage,Macrophage,1087,Not cycling,core,10x,465,LC_1,Lung Squamous Cell Carcinoma,F,70-75,local,P,,naive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27257,scrBT1432_TTTATGCCATTGTGCA,8,T_cell,T_cell,2365,Not cycling,core,10x,4868,LC_8,Lung Pleiomorphic Carcinoma,F,50-55,local,P,,naive
27258,scrBT1432_TTTCCTCCACTTGGAT,8,T_cell,T_cell,1185,Not cycling,core,10x,4868,LC_8,Lung Pleiomorphic Carcinoma,F,50-55,local,P,,naive
27259,scrBT1432_TTTCCTCGTTCGGGCT,8,T_cell,T_cell,1012,Not cycling,core,10x,4868,LC_8,Lung Pleiomorphic Carcinoma,F,50-55,local,P,,naive
27260,scrBT1432_TTTGCGCTCTTCAACT,8,T_cell,T_cell,1600,Not cycling,core,10x,4868,LC_8,Lung Pleiomorphic Carcinoma,F,50-55,local,P,,naive


In [127]:
adata.obs['category'] = 'Lung'

In [128]:
adata.obs['study'] = 'Qian2020_Lung'

In [129]:
output_path = "/home/ubuntu/Downloads/Data_Lung/Data_Qian2020_Lung.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Lung/Data_Qian2020_Lung.h5ad


#### 9.Song2019_Lung

In [130]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Lung/Data_Song2019_Lung"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [136]:
adata

AnnData object with n_obs × n_vars = 8772 × 10147
    obs: 'cell_name', 'sample', 'patient', 'source', 'cell_type', 'cell_subtype', 'complexity', 'n_cells', 'technology', 'cancer_type', 'sex'

In [146]:
for col in ['sample_id_x', 'sample_id_y', 'patient_y', 'source_y','tnm_stage','race', 'smoking_status'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [134]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [135]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [142]:
adata.obs

Unnamed: 0,cell_name,sample,patient,source,cell_type,cell_subtype,complexity,n_cells,technology,cancer_type,sex,site,category,study
0,Tumor_5258_AAACCTGGTACAGACG-1,P1_Tumor,P1,tumor,Monocyte,CD14+ Monocyte,1224,1832,10x,Lung Adenocarcinoma,male,Lung,Lung,Song2019_Lung
1,Tumor_5258_AAACGGGGTAGCGCTC-1,P1_Tumor,P1,tumor,NK_cell,NK,2456,1832,10x,Lung Adenocarcinoma,male,Lung,Lung,Song2019_Lung
2,Tumor_5258_AAACGGGGTCCTCTTG-1,P1_Tumor,P1,tumor,T_cell,T-cell,726,1832,10x,Lung Adenocarcinoma,male,Lung,Lung,Song2019_Lung
3,Tumor_5258_AAACGGGTCCAAACAC-1,P1_Tumor,P1,tumor,T_cell,Th,986,1832,10x,Lung Adenocarcinoma,male,Lung,Lung,Song2019_Lung
4,Tumor_5258_AAACGGGTCTTTAGTC-1,P1_Tumor,P1,tumor,Monocyte,CD14+ Monocyte,1590,1832,10x,Lung Adenocarcinoma,male,Lung,Lung,Song2019_Lung
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8767,Normal_5489_TTTGGTTTCTGCGTAA-1,P4_Normal,P4,adjacent normal,T_cell,T-cell,430,2027,10x,Lung Adenocarcinoma,female,Lung,Lung,Song2019_Lung
8768,Normal_5489_TTTGTCACAAGTCATC-1,P4_Normal,P4,adjacent normal,T_cell,CD8+ T-cell,779,2027,10x,Lung Adenocarcinoma,female,Lung,Lung,Song2019_Lung
8769,Normal_5489_TTTGTCAGTATGAATG-1,P4_Normal,P4,adjacent normal,T_cell,Th,1115,2027,10x,Lung Adenocarcinoma,female,Lung,Lung,Song2019_Lung
8770,Normal_5489_TTTGTCATCACTGGGC-1,P4_Normal,P4,adjacent normal,NK_cell,NK,706,2027,10x,Lung Adenocarcinoma,female,Lung,Lung,Song2019_Lung


In [139]:
adata.obs['site'] = 'Lung'

In [140]:
adata.obs['category'] = 'Lung'

In [141]:
adata.obs['study'] = 'Song2019_Lung'

In [143]:
output_path = "/home/ubuntu/Downloads/Data_Lung/Data_Song2019_Lung.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Lung/Data_Song2019_Lung.h5ad


#### 10.Xing2021_Lung

In [144]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Lung/Data_Xing2021_Lung"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [150]:
adata

AnnData object with n_obs × n_vars = 118293 × 17541
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [149]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'disease',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67','source_y',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed','cell_subtype_clusters',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'batch','time_rx_to_sampling',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS',   
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [156]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,source,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,category,study
0,NM3N_nLung_AAACCTGCACGTCTCT,NM3N,T_cell,CD4+_Tcells,884,,Normal_Lung,10x,3835,,,,,,,,,Lung,Xing2021_Lung
1,NM3N_nLung_AAACCTGCAGCTGCTG,NM3N,T_cell,CD4+_Tcells,948,,Normal_Lung,10x,3835,,,,,,,,,Lung,Xing2021_Lung
2,NM3N_nLung_AAACCTGCAGTACACT,NM3N,T_cell,CD4+_Tcells,1232,Not cycling,Normal_Lung,10x,3835,,,,,,,,,Lung,Xing2021_Lung
3,NM3N_nLung_AAACCTGGTAAGCACG,NM3N,T_cell,CD8+_Tcells,1097,Not cycling,Normal_Lung,10x,3835,,,,,,,,,Lung,Xing2021_Lung
4,NM3N_nLung_AAACCTGGTCCCTTGT,NM3N,T_cell,CD8+_Tcells,761,,Normal_Lung,10x,3835,,,,,,,,,Lung,Xing2021_Lung
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118288,NM6E_mLUAD_TACTTACCAAGTAGTA,NM6E,Malignant,Malignant,1589,G2/M,Primary_LUAD_with_Met,10x,1700,,,,,,,,,Lung,Xing2021_Lung
118289,NM6E_mLUAD_TAGTTGGGTCCTGCTT,NM6E,Malignant,Malignant,2131,G2/M,Primary_LUAD_with_Met,10x,1700,,,,,,,,,Lung,Xing2021_Lung
118290,NM6E_mLUAD_TCAGCTCCAGTGGGAT,NM6E,Epithelial,EPCAM+_cells,377,,Primary_LUAD_with_Met,10x,1700,,,,,,,,,Lung,Xing2021_Lung
118291,NM6E_mLUAD_TGGGCGTCAGGTCCAC,NM6E,Malignant,Malignant,1703,G2/M,Primary_LUAD_with_Met,10x,1700,,,,,,,,,Lung,Xing2021_Lung


In [152]:
adata.obs['site'].value_counts()

site
LU    46924
RU    13705
RM     5394
RL     4438
Name: count, dtype: int64

In [153]:
adata.obs['category'] = 'Lung'

In [154]:
adata.obs['study'] = 'Xing2021_Lung'

In [155]:
output_path = "/home/ubuntu/Downloads/Data_Lung/Data_Xing2021_Lung.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Lung/Data_Xing2021_Lung.h5ad


#### 11.Zilionis2019_Lung

In [157]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Lung/Data_Zilionis2019_Lung"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [161]:
adata

AnnData object with n_obs × n_vars = 31179 × 31797
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source', 'age', 'gender', 'cancer_type', 'technology', 'n_cells'

In [160]:
for col in [ 'umap1', 'umap2', 'g1s_score', 'g2m_score',
             'mp_top_score', 'mp_top', 'mp_assignment',
              'tnm_stage','Age', 'Gender', 'Stage', 'TNMstage'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [162]:
adata.obs = adata.obs.rename(columns={"gender": "sex"})

In [163]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,source,age,sex,cancer_type,technology,n_cells
0,p1t3_bcBNQJ,p1,B_cell,B,621,,Tumor,79,Male,Lung Squamous Cell Carcinoma,inDrop,9508
1,p1t3_bcDBBH,p1,B_cell,B,540,,Tumor,79,Male,Lung Squamous Cell Carcinoma,inDrop,9508
2,p1t3_bcAKOO,p1,B_cell,B,1790,Not cycling,Tumor,79,Male,Lung Squamous Cell Carcinoma,inDrop,9508
3,p1t3_bcFHAR,p1,B_cell,B,1737,Not cycling,Tumor,79,Male,Lung Squamous Cell Carcinoma,inDrop,9508
4,p1t3_bcAPCM,p1,B_cell,B,857,,Tumor,79,Male,Lung Squamous Cell Carcinoma,inDrop,9508
...,...,...,...,...,...,...,...,...,...,...,...,...
31174,p5t2_bcHHQY,p5,Plasma,Plasma,820,,Tumor,72,Female,Lung Adenocarcinoma,inDrop,2197
31175,p5t2_bcGRWT,p5,Plasma,Plasma,785,,Tumor,72,Female,Lung Adenocarcinoma,inDrop,2197
31176,p5t2_bcEZCR,p5,Plasma,Plasma,861,,Tumor,72,Female,Lung Adenocarcinoma,inDrop,2197
31177,p5t2_bcHLFS,p5,Plasma,Plasma,1153,Not cycling,Tumor,72,Female,Lung Adenocarcinoma,inDrop,2197


In [164]:
adata.obs['category'] = 'Lung'

In [165]:
adata.obs['study'] = 'Zilionis2019_Lung'

In [166]:
output_path = "/home/ubuntu/Downloads/Data_Lung/Data_Zilionis2019_Lung.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Lung/Data_Zilionis2019_Lung.h5ad


#### Data Merging

In [19]:
import scanpy as sc
import anndata
import os

In [2]:
# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Lung/Data_Bischoff2021_Lung.h5ad",
    "/home/ubuntu/Downloads/Data_Lung/Data_Chan2021_Lung.h5ad",
    "/home/ubuntu/Downloads/Data_Lung/Data_Guo2018_Lung.h5ad",
    "/home/ubuntu/Downloads/Data_Lung/Data_Ireland2020_Lung.h5ad",
    "/home/ubuntu/Downloads/Data_Lung/Data_Kim2020_Lung.h5ad",
    "/home/ubuntu/Downloads/Data_Lung/Data_Laughney2020_Lung.h5ad",
    "/home/ubuntu/Downloads/Data_Lung/Data_Maynard2020_Lung.h5ad",
    "/home/ubuntu/Downloads/Data_Lung/Data_Qian2020_Lung.h5ad",
    "/home/ubuntu/Downloads/Data_Lung/Data_Song2019_Lung.h5ad",
    "/home/ubuntu/Downloads/Data_Lung/Data_Xing2021_Lung.h5ad",
    "/home/ubuntu/Downloads/Data_Lung/Data_Zilionis2019_Lung.h5ad"
]

gc.collect()
# Load datasets
adatas = [sc.read(file) for file in files]

gc.collect()
# Merge all AnnData objects
adata_merged = anndata.concat(adatas, join="outer", fill_value=0)

gc.collect()
# Fix non-string columns (e.g. 'sample') to avoid h5py write errors
for col in adata_merged.obs.columns:
    if adata_merged.obs[col].dtype == 'object':
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

gc.collect()


  concat_annot = pd.concat(
  utils.warn_names_duplicates("obs")


0

In [3]:
adata_merged

AnnData object with n_obs × n_vars = 500451 × 50504
    obs: 'cell_name', 'sample', 'patient', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source', 'technology', 'n_cells', 'cancer_type', 'study', 'category', 'treated_naive', 'sample_primary_met', 'age', 'sex', 'disease_extent', 'site'

In [4]:
adata_merged.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,source,technology,n_cells,cancer_type,study,category,treated_naive,sample_primary_met,age,sex,disease_extent,site
0,AAACCCAAGACGACGT-1_1,p018_N,p018,,,519,,Normal,10x,11173,Lung Adenocarcinoma,Bischoff2021_Lung,Lung,,,,,,
1,AAACCCAAGCACTCTA-1_2,p018_N,p018,Macrophage,Alveolar_Macrophages1,618,,Normal,10x,11173,Lung Adenocarcinoma,Bischoff2021_Lung,Lung,,,,,,
2,AAACCCAAGCTGAAGC-1_3,p018_N,p018,Macrophage,Alveolar_Macrophages1,660,,Normal,10x,11173,Lung Adenocarcinoma,Bischoff2021_Lung,Lung,,,,,,
3,AAACCCACAACTTGGT-1_4,p018_N,p018,Macrophage,Alveolar_Macrophages1,843,,Normal,10x,11173,Lung Adenocarcinoma,Bischoff2021_Lung,Lung,,,,,,
4,AAACCCACAAGAATAC-1_5,p018_N,p018,Macrophage,Alveolar_Macrophages1,1060,Not cycling,Normal,10x,11173,Lung Adenocarcinoma,Bischoff2021_Lung,Lung,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31174,p5t2_bcHHQY,p5,,Plasma,Plasma,820,,Tumor,inDrop,2197,Lung Adenocarcinoma,Zilionis2019_Lung,Lung,,,72,Female,,
31175,p5t2_bcGRWT,p5,,Plasma,Plasma,785,,Tumor,inDrop,2197,Lung Adenocarcinoma,Zilionis2019_Lung,Lung,,,72,Female,,
31176,p5t2_bcEZCR,p5,,Plasma,Plasma,861,,Tumor,inDrop,2197,Lung Adenocarcinoma,Zilionis2019_Lung,Lung,,,72,Female,,
31177,p5t2_bcHLFS,p5,,Plasma,Plasma,1153,Not cycling,Tumor,inDrop,2197,Lung Adenocarcinoma,Zilionis2019_Lung,Lung,,,72,Female,,


In [5]:
# Optional: Save to disk
adata_merged.write("/home/ubuntu/Downloads/Data_Lung/Lung_Combined.h5ad")
print("Merged dataset saved.")

Merged dataset saved.


In [6]:
adata_merged.obs['study'].value_counts()

study
Bischoff2021_Lung    120961
Xing2021_Lung        118293
Chan2021_Lung         86662
Laughney2020_Lung     40505
Kim2020_Lung          32493
Zilionis2019_Lung     31179
Qian2020_Lung         27262
Maynard2020_Lung      19777
Guo2018_Lung          12346
Song2019_Lung          8772
Ireland2020_Lung       2201
Name: count, dtype: int64