In [1]:
import os
import pandas as pd
import scanpy as sc
import anndata
import gc
import scipy.io
from scipy.sparse import vstack

## Breast

#### 1. Azizi2018_Breast

In [33]:
import os
import pandas as pd
import scanpy as sc

# === Set base path ===
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Azizi2018_Breast"

# === Load 10X Data ===
path_10x = os.path.join(base_path, "10X")
adata_10x = sc.read_mtx(os.path.join(path_10x, "Exp_data_UMIcounts.mtx")).T  # Transpose to cells x genes
genes_10x = pd.read_csv(os.path.join(path_10x, "Genes.txt"), header=None)[0].tolist()
cells_10x = pd.read_csv(os.path.join(path_10x, "Cells.csv"), index_col=0)
cells_10x.index = cells_10x.index.astype(str)  # Ensure string index

adata_10x.var_names = genes_10x
adata_10x.obs = cells_10x
adata_10x.obs_names = cells_10x.index  # Set obs_names to ensure match
adata_10x.obs['technology'] = '10X'

# === Load InDrop Data ===
path_ss2 = os.path.join(base_path, "InDrop")
adata_ss2 = sc.read_mtx(os.path.join(path_ss2, "Exp_data_UMIcounts.mtx")).T
genes_ss2 = pd.read_csv(os.path.join(path_ss2, "Genes.txt"), header=None)[0].tolist()
cells_ss2 = pd.read_csv(os.path.join(path_ss2, "Cells.csv"), index_col=0)
cells_ss2.index = cells_ss2.index.astype(str)

adata_ss2.var_names = genes_ss2
adata_ss2.obs = cells_ss2
adata_ss2.obs_names = cells_ss2.index
adata_ss2.obs['technology'] = 'InDrop'

# === Align by common genes ===
common_genes = adata_10x.var_names.intersection(adata_ss2.var_names)
adata_10x = adata_10x[:, common_genes].copy()
adata_ss2 = adata_ss2[:, common_genes].copy()

# === Concatenate datasets ===
adata_combined = adata_10x.concatenate(
    adata_ss2,
    batch_key='batch',
    batch_categories=['10X', 'InDrop'],
    index_unique=None
)

# === Check for unique cell names ===
assert adata_combined.obs_names.is_unique, "Cell names are not unique after concatenation"

# === Merge sample metadata ===
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv")).drop_duplicates(subset=['sample'])

# Preserve original index
adata_combined.obs = adata_combined.obs.reset_index()
original_index = adata_combined.obs.columns[0]  # This is the original cell ID column

# Merge metadata by 'sample' column
adata_combined.obs = adata_combined.obs.merge(samples_df, how='left', on='sample')

# Check merge consistency
assert adata_combined.obs.shape[0] == adata_combined.shape[0], "Row count mismatch after metadata merge"

# Restore cell IDs as index
adata_combined.obs = adata_combined.obs.set_index(original_index)
adata_combined.obs.index = adata_combined.obs.index.astype(str)  # Ensure string index
adata_combined.obs.index.name = None

# === Final AnnData object ===
print(adata_combined)


  adata_combined = adata_10x.concatenate(


AnnData object with n_obs × n_vars = 75029 × 11253
    obs: 'sample', 'patient_x', 'source_x', 'cell_type', 'cell_subtype', 'cluster', 'complexity', 'technology_x', 'cluster_annotation', 'batch', 'patient_y', 'source_y', 'n_cells', 'technology_y', 'sorting', 'cancer_type', 'sample_type', 'size', 'metastases', 'grade', 'er', 'pr', 'her2', 'post_menopause', 'age', 'subtype', 'brca_deficiency'


In [34]:
adata = adata_combined

In [35]:
for col in ['cluster', 'batch', 'patient_y', 'source_y', 'cluster_annotation', 'subtype', 'sample_type',
            'technology_y', 'sorting','size', 'grade', 'er', 'pr', 'her2', 'post_menopause','brca_deficiency'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [42]:
adata.obs

Unnamed: 0,sample,patient,source,cell_type,cell_subtype,complexity,technology,n_cells,cancer_type,metastases,age,cell_name,disease_extent
s1_AAACCTGAGCAGACTG-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD8 T cell,1684,10X,6710,Breast Cancer,0,65,s1_AAACCTGAGCAGACTG-1,non metastatic
s1_AAACCTGAGGTCGGAT-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD8 T cell,1653,10X,6710,Breast Cancer,0,65,s1_AAACCTGAGGTCGGAT-1,non metastatic
s1_AAACCTGAGTGTACCT-1,BC09_TUMOR1,BC09,breast tumor,T_cell,T-reg,1326,10X,6710,Breast Cancer,0,65,s1_AAACCTGAGTGTACCT-1,non metastatic
s1_AAACCTGAGTGTACTC-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD8 T cell,1463,10X,6710,Breast Cancer,0,65,s1_AAACCTGAGTGTACTC-1,non metastatic
s1_AAACCTGAGTTAAGTG-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD4 T cell,1288,10X,6710,Breast Cancer,0,65,s1_AAACCTGAGTTAAGTG-1,non metastatic
...,...,...,...,...,...,...,...,...,...,...,...,...,...
56541,BC01_TUMOR4,BC01,breast tumor,T_cell,CD8 T cell,224,InDrop,1191,Breast Cancer,0,38,56541,non metastatic
56630,BC01_TUMOR4,BC01,breast tumor,T_cell,CD8 T cell,336,InDrop,1191,Breast Cancer,0,38,56630,non metastatic
57031,BC01_TUMOR4,BC01,breast tumor,T_cell,CD8 T cell,347,InDrop,1191,Breast Cancer,0,38,57031,non metastatic
56235,BC01_TUMOR4,BC01,breast tumor,T_cell,CD4 T cell,349,InDrop,1191,Breast Cancer,0,38,56235,non metastatic


In [37]:
adata.obs['cell_name'] = adata.obs_names

In [38]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [39]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [40]:
adata.obs = adata.obs.rename(columns={"technology_x": "technology"})

In [19]:
adata.obs['metastases'].value_counts()

metastases
0    47077
1    27952
Name: count, dtype: int64

In [41]:
adata.obs['disease_extent'] = adata.obs['metastases'].astype(str).map({
    '0': 'non metastatic',
    '1': 'metastatic',
})

In [43]:
del adata.obs['metastases']

In [44]:
adata.obs

Unnamed: 0,sample,patient,source,cell_type,cell_subtype,complexity,technology,n_cells,cancer_type,age,cell_name,disease_extent
s1_AAACCTGAGCAGACTG-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD8 T cell,1684,10X,6710,Breast Cancer,65,s1_AAACCTGAGCAGACTG-1,non metastatic
s1_AAACCTGAGGTCGGAT-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD8 T cell,1653,10X,6710,Breast Cancer,65,s1_AAACCTGAGGTCGGAT-1,non metastatic
s1_AAACCTGAGTGTACCT-1,BC09_TUMOR1,BC09,breast tumor,T_cell,T-reg,1326,10X,6710,Breast Cancer,65,s1_AAACCTGAGTGTACCT-1,non metastatic
s1_AAACCTGAGTGTACTC-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD8 T cell,1463,10X,6710,Breast Cancer,65,s1_AAACCTGAGTGTACTC-1,non metastatic
s1_AAACCTGAGTTAAGTG-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD4 T cell,1288,10X,6710,Breast Cancer,65,s1_AAACCTGAGTTAAGTG-1,non metastatic
...,...,...,...,...,...,...,...,...,...,...,...,...
56541,BC01_TUMOR4,BC01,breast tumor,T_cell,CD8 T cell,224,InDrop,1191,Breast Cancer,38,56541,non metastatic
56630,BC01_TUMOR4,BC01,breast tumor,T_cell,CD8 T cell,336,InDrop,1191,Breast Cancer,38,56630,non metastatic
57031,BC01_TUMOR4,BC01,breast tumor,T_cell,CD8 T cell,347,InDrop,1191,Breast Cancer,38,57031,non metastatic
56235,BC01_TUMOR4,BC01,breast tumor,T_cell,CD4 T cell,349,InDrop,1191,Breast Cancer,38,56235,non metastatic


In [45]:
adata.obs['study'] = 'Azizi2018_Breast'

In [46]:
adata.obs['category'] = 'Breast'

In [47]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Azizi2018_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Azizi2018_Breast.h5ad


#### 2.Bassez2021_Breast

In [48]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Bassez2021_Breast"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [53]:
adata

AnnData object with n_obs × n_vars = 226635 × 22567
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'expansion', 'BC_type', 'cohort', 'technology', 'n_cells', 'patient', 'cancer_type', 'sample_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'treated_naive', 'site'

In [60]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'timepoint', 'nCount_RNA', 'nFeature_RNA',
            'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'recent_treatment', 'recent_treatment_response', 'time_elapsed_from_recent_treatment',
            'prior_chemotherapy', 'chemotherapy_response', 'prior_targeted_rx', 'targeted_rx_response',
            'prior_chemoICB', 'chemoICB_response', 'prior_ET', 'ET_response', 'subsequent_treatment',
            'subsequent_treatment_response', 'PFS_DFS', 'OS', 'expansion', 'BC_type', 'cohort', 'sample_type',
            'mutation_hormonal_subtype', 'prior_ICB', 'ICB_response'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [61]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,treated_naive,site,study,category
0,BIOKEY_1_On_AAACCTGGTAGCAAAT-1,BIOKEY_1,T_cell,CD4_EM,430,,10X,9789,BIOKEY_1,Breast Cancer,False,41-50,early,primary,"sample """"""""pre"""""""" - prior to ICB treatment \n...",Breast,Bassez2021_Breast,Breast
1,BIOKEY_1_On_AAACCTGGTCATACTG-1,BIOKEY_1,Malignant,,700,,10X,9789,BIOKEY_1,Breast Cancer,False,41-50,early,primary,"sample """"""""pre"""""""" - prior to ICB treatment \n...",Breast,Bassez2021_Breast,Breast
2,BIOKEY_1_On_AAACCTGGTCTAGGTT-1,BIOKEY_1,Fibroblast,,330,,10X,9789,BIOKEY_1,Breast Cancer,False,41-50,early,primary,"sample """"""""pre"""""""" - prior to ICB treatment \n...",Breast,Bassez2021_Breast,Breast
3,BIOKEY_1_On_AAACCTGGTTGGACCC-1,BIOKEY_1,Fibroblast,,2637,Not cycling,10X,9789,BIOKEY_1,Breast Cancer,False,41-50,early,primary,"sample """"""""pre"""""""" - prior to ICB treatment \n...",Breast,Bassez2021_Breast,Breast
4,BIOKEY_1_On_AAACCTGTCACGATGT-1,BIOKEY_1,Malignant,,874,,10X,9789,BIOKEY_1,Breast Cancer,False,41-50,early,primary,"sample """"""""pre"""""""" - prior to ICB treatment \n...",Breast,Bassez2021_Breast,Breast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226630,BIOKEY_9_Pre_TTTGTCACAAACGTGG-1,BIOKEY_9,B_cell,,246,,10X,4076,BIOKEY_9,Breast Cancer,False,41-50,locally advanced,primary,"sample """"""""pre"""""""" - prior to ICB treatment \n...",Breast,Bassez2021_Breast,Breast
226631,BIOKEY_9_Pre_TTTGTCACAGTACACT-1,BIOKEY_9,Fibroblast,,1447,Not cycling,10X,4076,BIOKEY_9,Breast Cancer,False,41-50,locally advanced,primary,"sample """"""""pre"""""""" - prior to ICB treatment \n...",Breast,Bassez2021_Breast,Breast
226632,BIOKEY_9_Pre_TTTGTCAGTCGGATCC-1,BIOKEY_9,T_cell,,2131,Not cycling,10X,4076,BIOKEY_9,Breast Cancer,False,41-50,locally advanced,primary,"sample """"""""pre"""""""" - prior to ICB treatment \n...",Breast,Bassez2021_Breast,Breast
226633,BIOKEY_9_Pre_TTTGTCAGTTCCGTCT-1,BIOKEY_9,Malignant,,3148,Not cycling,10X,4076,BIOKEY_9,Breast Cancer,False,41-50,locally advanced,primary,"sample """"""""pre"""""""" - prior to ICB treatment \n...",Breast,Bassez2021_Breast,Breast


In [57]:
adata.obs['study'] = 'Bassez2021_Breast'

In [58]:
adata.obs['category'] = 'Breast'

In [62]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Bassez2021_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Bassez2021_Breast.h5ad


#### 3.Chung2017_Breast

In [64]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Chung2017_Breast"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_TPM.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect




<function gc.collect(generation=2)>

In [67]:
adata

AnnData object with n_obs × n_vars = 515 × 57915
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [66]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'batch',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS',   
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [68]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive
0,BC01_02,BC01,Malignant,9172,Not cycling,SmartSeq2,22,BC01,Breast Cancer,False,66,local,primary,Breast,naive
1,BC01_03,BC01,Malignant,9507,Intermediate,SmartSeq2,22,BC01,Breast Cancer,False,66,local,primary,Breast,naive
2,BC01_04,BC01,Malignant,7457,Not cycling,SmartSeq2,22,BC01,Breast Cancer,False,66,local,primary,Breast,naive
3,BC01_05,BC01,Malignant,8245,Not cycling,SmartSeq2,22,BC01,Breast Cancer,False,66,local,primary,Breast,naive
4,BC01_06,BC01,Malignant,7815,Not cycling,SmartSeq2,22,BC01,Breast Cancer,False,66,local,primary,Breast,naive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,BC06_43,BC06,Stromal,4473,,SmartSeq2,18,BC06,Breast Cancer,False,67,node positive,primary,Breast,naive
511,BC06_58,BC06,B_cell,2261,Not cycling,SmartSeq2,18,BC06,Breast Cancer,False,67,node positive,primary,Breast,naive
512,BC06_60,BC06,Malignant,7311,Not cycling,SmartSeq2,18,BC06,Breast Cancer,False,67,node positive,primary,Breast,naive
513,BC06_61,BC06,B_cell,2583,Not cycling,SmartSeq2,18,BC06,Breast Cancer,False,67,node positive,primary,Breast,naive


In [69]:
adata.obs['category'] = 'Breast'

In [70]:
adata.obs['study'] = 'Chung2017_Breast'

In [71]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Chung2017_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Chung2017_Breast.h5ad


#### 4. Gulati2020_Breast

In [72]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Gulati2020_Breast"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_TPM.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [79]:
adata

AnnData object with n_obs × n_vars = 1902 × 42132
    obs: 'cell_name', 'sample', 'patient', 'source', 'cell_type', 'cell_subtype', 'complexity', 'cancer_type', 'technology', 'n_cells', 'sex'

In [74]:
for col in ['source_y', 'clinical_type', 'clinical_subtype','stage', 'hormone_receptor_status', 'grade','patient_y'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [76]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [77]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [78]:
adata.obs = adata.obs.rename(columns={"gender": "sex"})

In [86]:
adata.obs

Unnamed: 0,cell_name,sample,patient,source,cell_type,cell_subtype,complexity,cancer_type,technology,n_cells,sex,category,study,site
0,A10_004_SU2_N,SU2_N,SU2,Normal,Epithelial,Basal,4986,Breast Cancer,SmartSeq2,75,Female,Breast,Gulati2020_Breast,Breast
1,A1_004_SU2_N,SU2_N,SU2,Normal,Epithelial,Mature luminal,2314,Breast Cancer,SmartSeq2,75,Female,Breast,Gulati2020_Breast,Breast
2,A1_005_SU2_T,SU2_T,SU2,Tumor,Malignant,Luminal progenitor,1859,Breast Cancer,SmartSeq2,80,Female,Breast,Gulati2020_Breast,Breast
3,A11_004_SU2_N,SU2_N,SU2,Normal,Epithelial,Mature luminal,2906,Breast Cancer,SmartSeq2,75,Female,Breast,Gulati2020_Breast,Breast
4,A12_004_SU2_N,SU2_N,SU2,Normal,Epithelial,Mature luminal,4906,Breast Cancer,SmartSeq2,75,Female,Breast,Gulati2020_Breast,Breast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1897,G5_061_COH69_T,COH69_T,COH69,Tumor,Malignant,Luminal progenitor,1854,Breast Cancer,SmartSeq2,83,Female,Breast,Gulati2020_Breast,Breast
1898,G6_061_COH69_T,COH69_T,COH69,Tumor,Malignant,Luminal progenitor,4756,Breast Cancer,SmartSeq2,83,Female,Breast,Gulati2020_Breast,Breast
1899,G7_062_COH69_T,COH69_T,COH69,Tumor,Malignant,Luminal progenitor,1755,Breast Cancer,SmartSeq2,83,Female,Breast,Gulati2020_Breast,Breast
1900,G8_062_COH69_T,COH69_T,COH69,Tumor,Malignant,Luminal progenitor,1541,Breast Cancer,SmartSeq2,83,Female,Breast,Gulati2020_Breast,Breast


In [85]:
adata.obs['site'] = 'Breast'

In [81]:
adata.obs['category'] = 'Breast'

In [82]:
adata.obs['study'] = 'Gulati2020_Breast'

In [87]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Gulati2020_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Gulati2020_Breast.h5ad


#### 5.Karaayvas2018_Breast

In [88]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Karaayvas2018_Breast"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_TPM.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [91]:
adata

AnnData object with n_obs × n_vars = 1534 × 21785
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [90]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'batch',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS',   
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [96]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,category,study
0,PT089_P1_A01,89,Epithelial,3556,Not cycling,SmartSeq2,333,89,Breast Cancer,False,44,,primary,breast,naive,Breast,Karaayvas2018_Breast
1,PT089_P1_A02,89,Epithelial,3372,Not cycling,SmartSeq2,333,89,Breast Cancer,False,44,,primary,breast,naive,Breast,Karaayvas2018_Breast
2,PT089_P1_A03,89,Epithelial,2577,Not cycling,SmartSeq2,333,89,Breast Cancer,False,44,,primary,breast,naive,Breast,Karaayvas2018_Breast
3,PT089_P1_A04,89,Macrophage,2260,Not cycling,SmartSeq2,333,89,Breast Cancer,False,44,,primary,breast,naive,Breast,Karaayvas2018_Breast
4,PT089_P1_A05,89,Macrophage,2658,Not cycling,SmartSeq2,333,89,Breast Cancer,False,44,,primary,breast,naive,Breast,Karaayvas2018_Breast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1529,PT039_P10_H08_S284,39,Macrophage,3564,Not cycling,SmartSeq2,341,39,Breast Cancer,False,64,,primary,breast,naive,Breast,Karaayvas2018_Breast
1530,PT039_P10_H09_S285,39,Malignant,6862,G1/S,SmartSeq2,341,39,Breast Cancer,False,64,,primary,breast,naive,Breast,Karaayvas2018_Breast
1531,PT039_P10_H10_S286,39,,518,,SmartSeq2,341,39,Breast Cancer,False,64,,primary,breast,naive,Breast,Karaayvas2018_Breast
1532,PT039_P10_H11_S287,39,Epithelial,992,,SmartSeq2,341,39,Breast Cancer,False,64,,primary,breast,naive,Breast,Karaayvas2018_Breast


In [94]:
adata.obs['study'] = 'Karaayvas2018_Breast'

In [93]:
adata.obs['category'] = 'Breast'

In [95]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Karaayvas2018_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Karaayvas2018_Breast.h5ad


#### 6.Kim2018_Breast

In [97]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Kim2018_Breast"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_TPM.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [105]:
adata

AnnData object with n_obs × n_vars = 2472 × 12472
    obs: 'cell_name', 'sample', 'patient', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'treated_naive', 'age', 'cancer_type', 'cancer_subtype', 'technology', 'n_cells'

In [99]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY',  'time_point','histology_x',
            'tnm_stage_x', 'grade_x', 'patient_y',
            'treatment_y', 'histology_y', 'age_y', 'tnm_stage_y', 'grade_y'      
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [102]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [103]:
adata.obs = adata.obs.rename(columns={"treatment_x": "treated_naive"})

In [104]:
adata.obs = adata.obs.rename(columns={"age_x": "age"})

In [110]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,treated_naive,age,cancer_type,cancer_subtype,technology,n_cells,category,study
0,KTN1290cell435,P2_pre,P2,T_cell,CD8Tex,625,,,46,Breast Cancer,TNBC,Nanogrid,98,Breast,Kim2018_Breast
1,KTN1290cell234,P2_pre,P2,T_cell,CD8Tex,514,,,46,Breast Cancer,TNBC,Nanogrid,98,Breast,Kim2018_Breast
2,KTN1290cell110,P2_pre,P2,T_cell,CD8Tex,706,,,46,Breast Cancer,TNBC,Nanogrid,98,Breast,Kim2018_Breast
3,KTN1290cell147,P2_pre,P2,T_cell,CD8Tex,632,,,46,Breast Cancer,TNBC,Nanogrid,98,Breast,Kim2018_Breast
4,KTN1290cell106,P2_pre,P2,T_cell,CD8Tex,664,,,46,Breast Cancer,TNBC,Nanogrid,98,Breast,Kim2018_Breast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2467,KTN1322cell101,P12_mid,P12,Malignant,Malignant,527,,2 cycles epirubicin + docetaxel,40,Breast Cancer,TNBC,Nanogrid,164,Breast,Kim2018_Breast
2468,KTN1520cell416,P14_pre,P14,Malignant,Malignant,1971,,,38,Breast Cancer,TNBC,Nanogrid,474,Breast,Kim2018_Breast
2469,KTN1260cell172,P1_pre,P1,Malignant,Malignant,555,,,44,Breast Cancer,TNBC,Nanogrid,141,Breast,Kim2018_Breast
2470,KTN152OPcell19,P14_post,P14,Malignant,Malignant,1184,,"2 cycles epirubicin + docetaxel, then 4 cycles...",38,Breast Cancer,TNBC,Nanogrid,373,Breast,Kim2018_Breast


In [107]:
adata.obs['category'] = 'Breast'

In [108]:
adata.obs['study'] = 'Kim2018_Breast'

In [109]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Kim2018_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Kim2018_Breast.h5ad


#### 7.Qian2020_Breast

In [111]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Qian2020_Breast"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [114]:
adata

AnnData object with n_obs × n_vars = 16537 × 22276
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [113]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'batch',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS',   
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [119]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,category,study
0,sc5rJUQ026_AAACCTGCATCTCGCT,42,T_cell,2138,Not cycling,10x,1805,BC_2,Breast Cancer,Female,,,,,naive,Breast,Qian2020_Breast
1,sc5rJUQ026_AAACCTGGTTCTCATT,42,T_cell,2085,Not cycling,10x,1805,BC_2,Breast Cancer,Female,,,,,naive,Breast,Qian2020_Breast
2,sc5rJUQ026_AAACCTGTCAAGAAGT,42,T_cell,1767,Not cycling,10x,1805,BC_2,Breast Cancer,Female,,,,,naive,Breast,Qian2020_Breast
3,sc5rJUQ026_AAACCTGTCTCGCATC,42,T_cell,2016,Not cycling,10x,1805,BC_2,Breast Cancer,Female,,,,,naive,Breast,Qian2020_Breast
4,sc5rJUQ026_AAACGGGAGTTATCGC,42,T_cell,1595,Not cycling,10x,1805,BC_2,Breast Cancer,Female,,,,,naive,Breast,Qian2020_Breast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16532,sc5rJUQ064_TTTGGTTTCTGCCCTA,54,Malignant,2324,Not cycling,10x,3717,BC_14,Breast Cancer,Female,,,,,naive,Breast,Qian2020_Breast
16533,sc5rJUQ064_TTTGTCAAGCCAGAAC,54,Malignant,2964,Not cycling,10x,3717,BC_14,Breast Cancer,Female,,,,,naive,Breast,Qian2020_Breast
16534,sc5rJUQ064_TTTGTCAAGGACGAAA,54,Malignant,1814,Not cycling,10x,3717,BC_14,Breast Cancer,Female,,,,,naive,Breast,Qian2020_Breast
16535,sc5rJUQ064_TTTGTCAGTCTTGTCC,54,Malignant,2464,Not cycling,10x,3717,BC_14,Breast Cancer,Female,,,,,naive,Breast,Qian2020_Breast


In [116]:
adata.obs['category'] = 'Breast'

In [117]:
adata.obs['study'] = 'Qian2020_Breast'

In [118]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Qian2020_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Qian2020_Breast.h5ad


#### 8.Savas2018_Breast

In [120]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Savas2018_Breast"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [121]:
adata

AnnData object with n_obs × n_vars = 6311 × 24410
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'patient_y', 'cancer_type', 'technology', 'n_cells', 'subtype', 'sorting', 'sample_primary_met'

In [124]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score','sorting', 
            
            'mp_top_score', 'mp_top', 'mp_assignment', 'patient_y','subtype',
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [129]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,complexity,cell_cycle_phase,cancer_type,technology,n_cells,sample_primary_met,site
0,AAACCTGCAAACAACA-1_1,1,1,T_cell,828,,Breast Cancer,10x,5174,primary,breast
1,AAACCTGCACTACAGT-1_1,1,1,T_cell,1126,Not cycling,Breast Cancer,10x,5174,primary,breast
2,AAACCTGCAGCTGGCT-1_1,1,1,T_cell,712,,Breast Cancer,10x,5174,primary,breast
3,AAACCTGCATGTCGAT-1_1,1,1,,491,,Breast Cancer,10x,5174,primary,breast
4,AAACCTGGTGAGCGAT-1_1,1,1,T_cell,663,,Breast Cancer,10x,5174,primary,breast
...,...,...,...,...,...,...,...,...,...,...,...
6306,TTTCCTCAGCTACCTA-1_2,2,2,T_cell,1434,Not cycling,Breast Cancer,10x,1137,primary,breast
6307,TTTCCTCTCCCAAGTA-1_2,2,2,T_cell,1545,Not cycling,Breast Cancer,10x,1137,primary,breast
6308,TTTGCGCAGAGGTAGA-1_2,2,2,T_cell,1247,Not cycling,Breast Cancer,10x,1137,primary,breast
6309,TTTGCGCAGGAATGGA-1_2,2,2,T_cell,1070,Not cycling,Breast Cancer,10x,1137,primary,breast


In [128]:
adata.obs['site'] = 'breast'

In [130]:
adata.obs['category'] = 'Breast'

In [131]:
adata.obs['study'] = 'Savas2018_Breast'

In [132]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Savas2018_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Savas2018_Breast.h5ad


#### 9.Wu2021_Breast

In [133]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Wu2021_Breast"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

  cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))


<function gc.collect(generation=2)>

In [136]:
adata

AnnData object with n_obs × n_vars = 100064 × 29733
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'celltype_subset', 'disease', 'gender_x', 'age_x', 'cancer_type_x', 'treatment_x', 'technology', 'n_cells'

In [143]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score',
            'mp_top_score', 'mp_top', 'mp_assignment','grade_x', 'celltype_subset', 'disease',
            'ER_x', 'PR_x', 'HER2_IHC_x', 'HER2_ISH_ratio_x', 'Ki67_x', 'subtype_IHC_x',
            'details_treatment_x', 'notable_pathological_features_x', 'stage_x', 'cancer_type_y',
            'patient_y', 'gender_y', 'age_y', 'grade_y', 'cancer_type.1', 'ER_y', 'PR_y', 'HER2_IHC_y', 'HER2_ISH_ratio_y',
            'Ki67_y', 'subtype_IHC_y', 'treatment_y', 'details_treatment_y', 'notable_pathological_features_y', 'stage_y', 'histology'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [148]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,sex,age,cancer_type,treated_naive,technology,n_cells,site,study
0,CID3586_AAGACCTCAGCATGAG,CID3586,CID3586,Endothelial,Endothelial ACKR1,1689,Not cycling,Female,43.0,IDC,Naïve,10x,6178,breast,Wu2021_Breast
1,CID3586_AAGGTTCGTAGTACCT,CID3586,CID3586,Endothelial,Endothelial ACKR1,779,,Female,43.0,IDC,Naïve,10x,6178,breast,Wu2021_Breast
2,CID3586_ACCAGTAGTTGTGGCC,CID3586,CID3586,Endothelial,Endothelial ACKR1,514,,Female,43.0,IDC,Naïve,10x,6178,breast,Wu2021_Breast
3,CID3586_ACCCACTAGATGTCGG,CID3586,CID3586,Endothelial,Endothelial ACKR1,609,,Female,43.0,IDC,Naïve,10x,6178,breast,Wu2021_Breast
4,CID3586_ACTGATGGTCAACTGT,CID3586,CID3586,Endothelial,Endothelial ACKR1,807,,Female,43.0,IDC,Naïve,10x,6178,breast,Wu2021_Breast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100059,CID4398_TCAGGTAGTACTCAAC,CID4398,CID4398,Dendritic,DCs,1251,Not cycling,Female,52.0,IDC,Treated,10x,4451,breast,Wu2021_Breast
100060,CID4398_TCTATTGTCGCCATAA,CID4398,CID4398,Dendritic,DCs,719,,Female,52.0,IDC,Treated,10x,4451,breast,Wu2021_Breast
100061,CID4398_TCTTTCCCAGTAAGCG,CID4398,CID4398,Dendritic,DCs,887,,Female,52.0,IDC,Treated,10x,4451,breast,Wu2021_Breast
100062,CID4398_TGCCCATGTTACGGAG,CID4398,CID4398,Dendritic,DCs,870,,Female,52.0,IDC,Treated,10x,4451,breast,Wu2021_Breast


In [138]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [139]:
adata.obs = adata.obs.rename(columns={"gender_x": "sex"})

In [140]:
adata.obs = adata.obs.rename(columns={"cancer_type_x": "cancer_type"})

In [141]:
adata.obs = adata.obs.rename(columns={"age_x": "age"})

In [142]:
adata.obs = adata.obs.rename(columns={"treatment_x": "treated_naive"})

In [145]:
adata.obs['site'] = 'breast'

In [159]:
adata.obs['category'] = 'Breast'

In [146]:
adata.obs['study'] = 'Wu2021_Breast'

In [147]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Wu2021_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Wu2021_Breast.h5ad


#### 10.Gao2021_Breast

In [150]:
import os
import pandas as pd
import scanpy as sc

# === Set base path ===
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Gao2021_Breast"

# === Define cancer type folders and human-readable labels ===
cancer_types = {
    "BC": "Breast",
    "BTC": "Breast_and_thyroid"
}

# === Load sample metadata ===
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv")).drop_duplicates(subset=['sample'])

# === Load expression datasets ===
adatas = []

for folder, label in cancer_types.items():
    path = os.path.join(base_path, folder)
    
    # Load expression matrix
    adata = sc.read_mtx(os.path.join(path, "Exp_data_UMIcounts.mtx")).T  # Transpose to cells x genes
    
    # Load gene and cell information
    genes = pd.read_csv(os.path.join(path, "Genes.txt"), header=None)[0].tolist()
    cells = pd.read_csv(os.path.join(path, "Cells.csv"), index_col=0)
    
    # Ensure unique cell names by prefixing with folder name
    cells.index = [f"{folder}_{idx}" for idx in cells.index]
    
    # Assign gene names and cell metadata
    adata.var_names = genes
    adata.obs = cells
    adata.obs_names = cells.index
    adata.obs['cancer_folder'] = folder
    adata.obs['cancer_type'] = label
    
    adatas.append(adata)

# === Find common genes ===
common_genes = adatas[0].var_names
for ad in adatas[1:]:
    common_genes = common_genes.intersection(ad.var_names)

# === Subset all datasets to common genes ===
adatas = [ad[:, common_genes].copy() for ad in adatas]

# === Concatenate all datasets with batch labels ===
adata_combined = adatas[0].concatenate(
    *adatas[1:],
    batch_key='batch',
    batch_categories=list(cancer_types.keys()),
    index_unique=None  # Cell names are already unique from above
)

# === Ensure unique cell names ===
assert adata_combined.obs_names.is_unique, "Cell names are not unique after concatenation"

# === Merge with sample metadata ===
adata_combined.obs = adata_combined.obs.reset_index()
original_index = adata_combined.obs.columns[0]

adata_combined.obs = adata_combined.obs.merge(samples_df, how='left', on='sample')

# Check merge integrity
assert adata_combined.shape[0] == adata_combined.obs.shape[0], "Mismatch after sample merge"

# Restore cell index
adata_combined.obs = adata_combined.obs.set_index(original_index)
adata_combined.obs.index = adata_combined.obs.index.astype(str)
adata_combined.obs.index.name = None

# === Final AnnData object ready ===
print(adata_combined)


  adata_combined = adatas[0].concatenate(


AnnData object with n_obs × n_vars = 29992 × 22164
    obs: 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'cancer_folder', 'cancer_type_x', 'batch', 'technology', 'n_cells', 'patient', 'cancer_type_y', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS'


In [154]:
adata = adata_combined

In [161]:
adata

AnnData object with n_obs × n_vars = 29992 × 22164
    obs: 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'cancer_folder', 'cancer_type', 'technology', 'n_cells', 'patient', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [158]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'batch',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS','cancer_type_y'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [160]:
adata.obs = adata.obs.rename(columns={"cancer_type_x": "cancer_type"})

In [6]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [167]:
adata.obs

Unnamed: 0,sample,cell_type,complexity,cell_cycle_phase,cancer_folder,cancer_type,technology,n_cells,patient,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_name,category,study
BC_AAACCTGCAGTGACAG,DCIS1,Malignant,2740,Not cycling,BC,Breast,10X,1480,DCIS1,,,,,,naive,BC_AAACCTGCAGTGACAG,Breast,Gao2021_Breast
BC_AAACCTGGTCGAGATG,DCIS1,Epithelial,3079,Not cycling,BC,Breast,10X,1480,DCIS1,,,,,,naive,BC_AAACCTGGTCGAGATG,Breast,Gao2021_Breast
BC_AAACCTGTCACCGGGT,DCIS1,Endothelial,3243,Not cycling,BC,Breast,10X,1480,DCIS1,,,,,,naive,BC_AAACCTGTCACCGGGT,Breast,Gao2021_Breast
BC_AAACGGGGTGCACTTA,DCIS1,Malignant,3373,Not cycling,BC,Breast,10X,1480,DCIS1,,,,,,naive,BC_AAACGGGGTGCACTTA,Breast,Gao2021_Breast
BC_AAACGGGTCACGGTTA,DCIS1,Malignant,4162,Not cycling,BC,Breast,10X,1480,DCIS1,,,,,,naive,BC_AAACGGGTCACGGTTA,Breast,Gao2021_Breast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BTC_TTTGGAGGTGTTCGTA,TNBC5,Malignant,6232,Not cycling,BTC,Breast_and_thyroid,10X,3225,TNBC5,,,,,,naive,BTC_TTTGGAGGTGTTCGTA,Breast,Gao2021_Breast
BTC_TTTGGTTGTAGCTTTG,TNBC5,Malignant,7478,G1/S,BTC,Breast_and_thyroid,10X,3225,TNBC5,,,,,,naive,BTC_TTTGGTTGTAGCTTTG,Breast,Gao2021_Breast
BTC_TTTGTTGAGCGTATGG,TNBC5,T_cell,1146,Not cycling,BTC,Breast_and_thyroid,10X,3225,TNBC5,,,,,,naive,BTC_TTTGTTGAGCGTATGG,Breast,Gao2021_Breast
BTC_TTTGTTGCAGGATTCT,TNBC5,Malignant,4643,Not cycling,BTC,Breast_and_thyroid,10X,3225,TNBC5,,,,,,naive,BTC_TTTGTTGCAGGATTCT,Breast,Gao2021_Breast


In [163]:
adata.obs['cell_name'] = adata.obs_names

In [164]:
adata.obs['category'] = 'Breast'

In [165]:
adata.obs['study'] = 'Gao2021_Breast'

In [166]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Gao2021_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Gao2021_Breast.h5ad


#### 11.Griffiths2021_Breast

In [197]:

# Set base path
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Griffiths2021_Breast"

# === Load 10X Data ===
path_10x = os.path.join(base_path, "10X")
adata_10x = sc.read_mtx(os.path.join(path_10x, "Exp_data_UMIcounts.mtx")).T  # Transpose to cells x genes
genes_10x = pd.read_csv(os.path.join(path_10x, "Genes.txt"), header=None)[0].tolist()
cells_10x = pd.read_csv(os.path.join(path_10x, "Cells.csv"), index_col=0)

adata_10x.var_names = genes_10x
adata_10x.obs = cells_10x
adata_10x.obs['technology'] = '10X'

# === Load iCell8 Data ===
path_ss2 = os.path.join(base_path, "iCell8")
adata_ss2 = sc.read_mtx(os.path.join(path_ss2, "Exp_data_UMIcounts.mtx")).T
genes_ss2 = pd.read_csv(os.path.join(path_ss2, "Genes.txt"), header=None)[0].tolist()
cells_ss2 = pd.read_csv(os.path.join(path_ss2, "Cells.csv"), index_col=0)

adata_ss2.var_names = genes_ss2
adata_ss2.obs = cells_ss2
adata_ss2.obs['technology'] = 'iCell8'

# === Align by common genes ===
common_genes = adata_10x.var_names.intersection(adata_ss2.var_names)
adata_10x = adata_10x[:, common_genes].copy()
adata_ss2 = adata_ss2[:, common_genes].copy()

# === Concatenate ===
adata_combined = adata_10x.concatenate(
    adata_ss2,
    batch_key='batch',
    batch_categories=['10X', 'iCell8'],
    index_unique=None
)

# === Check uniqueness ===
assert adata_combined.obs_names.is_unique, "Cell names are not unique after concatenation"

# === Merge sample metadata ===
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv")).drop_duplicates(subset=['sample'])

# Preserve and reset index
adata_combined.obs = adata_combined.obs.reset_index()
original_index = adata_combined.obs.columns[0]

# Merge with sample metadata
adata_combined.obs = adata_combined.obs.merge(samples_df, how='left', on='sample')
assert adata_combined.obs.shape[0] == adata_combined.shape[0], "Row count mismatch after metadata merge"

# Restore index
adata_combined.obs = adata_combined.obs.set_index(original_index)
adata_combined.obs.index.name = None


  adata_combined = adata_10x.concatenate(


In [198]:
adata = adata_combined

In [199]:
adata

AnnData object with n_obs × n_vars = 111372 × 15691
    obs: 'sample', 'patient_x', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'time_point_x', 'days_treated_x', 'treatment_arm_x', 'treatment_x', 'ribociclib_schedule_x', 'response_x', 'technology_x', 'batch', 'cancer_type', 'technology_y', 'n_cells', 'patient_y', 'time_point_y', 'days_treated_y', 'treatment_arm_y', 'treatment_y', 'ribociclib_schedule_y', 'response_y'

In [200]:
for col in [ 'umap1', 'umap2', 'g1s_score', 'g2m_score',
             'mp_top_score', 'mp_top', 'mp_assignment', 'time_point_x', 
             'days_treated_x', 'treatment_arm_x','ribociclib_schedule_x', 'response_x',
             'nCount_RNA', 'nFeature_RNA', 'percent.mt','batch',
             'technology_y', 'patient_y', 'time_point_y', 'days_treated_y',
             'treatment_arm_y', 'treatment_y', 'ribociclib_schedule_y', 'response_y'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [175]:
adata.obs['treatment_x'].value_counts()

treatment_x
letrozole + ribociclib    63966
letrozole                 47406
Name: count, dtype: int64

In [202]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [203]:
adata.obs = adata.obs.rename(columns={"treatment_x": "treated_naive"})

In [204]:
adata.obs = adata.obs.rename(columns={"technology_x": "technology"})

In [209]:
adata.obs

Unnamed: 0,sample,patient,cell_type,complexity,cell_cycle_phase,treated_naive,technology,cancer_type,n_cells,cell_name,site,category,study
P11_M_AAACGAACACAAGTGG,P11_M,P11,Malignant,2361,Not cycling,letrozole + ribociclib,10X,Breast Cancer,780,P11_M_AAACGAACACAAGTGG,breast,Breast,Griffiths2021_Breast
P11_M_AAACGCTGTTAAGAAC,P11_M,P11,Malignant,2123,Not cycling,letrozole + ribociclib,10X,Breast Cancer,780,P11_M_AAACGCTGTTAAGAAC,breast,Breast,Griffiths2021_Breast
P11_M_AAAGTGAAGAGGACTC,P11_M,P11,Malignant,1795,Not cycling,letrozole + ribociclib,10X,Breast Cancer,780,P11_M_AAAGTGAAGAGGACTC,breast,Breast,Griffiths2021_Breast
P11_M_AAATGGAGTGTTAACC,P11_M,P11,Malignant,2904,Not cycling,letrozole + ribociclib,10X,Breast Cancer,780,P11_M_AAATGGAGTGTTAACC,breast,Breast,Griffiths2021_Breast
P11_M_AACAACCGTCCTCCTA,P11_M,P11,Malignant,3415,Not cycling,letrozole + ribociclib,10X,Breast Cancer,780,P11_M_AACAACCGTCCTCCTA,breast,Breast,Griffiths2021_Breast
...,...,...,...,...,...,...,...,...,...,...,...,...,...
P09_M_C68_R24,P09_M,P09,Malignant,2167,Not cycling,letrozole + ribociclib,iCell8,Breast Cancer,79,P09_M_C68_R24,breast,Breast,Griffiths2021_Breast
P09_M_C68_R55,P09_M,P09,Malignant,3116,Not cycling,letrozole + ribociclib,iCell8,Breast Cancer,79,P09_M_C68_R55,breast,Breast,Griffiths2021_Breast
P09_M_C69_R26,P09_M,P09,Malignant,3104,Not cycling,letrozole + ribociclib,iCell8,Breast Cancer,79,P09_M_C69_R26,breast,Breast,Griffiths2021_Breast
P09_M_C69_R60,P09_M,P09,Malignant,3188,Not cycling,letrozole + ribociclib,iCell8,Breast Cancer,79,P09_M_C69_R60,breast,Breast,Griffiths2021_Breast


In [205]:
adata.obs['cell_name'] = adata.obs_names

In [206]:
adata.obs['site'] = 'breast'

In [207]:
adata.obs['category'] = 'Breast'

In [208]:
adata.obs['study'] = 'Griffiths2021_Breast'

In [210]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Griffiths2021_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Griffiths2021_Breast.h5ad


#### 11.Pal2021_Breast

In [211]:
import os
import pandas as pd
import scanpy as sc
import scipy.io
import gc
from scipy.sparse import vstack

In [213]:
# Base path
base_path = "/home/ubuntu/Downloads/Data_Breast/Data_Pal2021_Breast"

# Step 1: Read gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
gene_names = genes[0].values

# Step 2: Read and concatenate expression matrices and cell metadata
all_exprs = []
all_cells = []

for i in range(1, 6):  # Group1 to Group5
    group_path = os.path.join(base_path, f"Group{i}")
    
    # Load matrix
    mtx_path = os.path.join(group_path, f"Exp_data_UMIcounts{i}.mtx")
    expr = sc.read_mtx(mtx_path).T  # transpose: cells x genes
    all_exprs.append(expr.X)
    
    # Load cell metadata
    cells_path = os.path.join(group_path, f"Cells{i}.csv")
    cells_df = pd.read_csv(cells_path)
    all_cells.append(cells_df)

# Combine all groups
combined_expr = vstack(all_exprs)
combined_cells = pd.concat(all_cells, ignore_index=True)

# Create AnnData object
adata = sc.AnnData(X=combined_expr)
adata.var_names = gene_names
adata.var_names_make_unique()
adata.obs = combined_cells

# Step 3: Read and merge sample metadata
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))
adata.obs = adata.obs.merge(samples, on="sample", how="left")

gc.collect()


4498

In [218]:
adata

AnnData object with n_obs × n_vars = 305224 × 33538
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [217]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'timepoint', 'nCount_RNA', 'nFeature_RNA',
            'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'recent_treatment', 'recent_treatment_response', 'time_elapsed_from_recent_treatment',
            'prior_chemotherapy', 'chemotherapy_response', 'prior_targeted_rx', 'targeted_rx_response',
            'prior_chemoICB', 'chemoICB_response', 'prior_ET', 'ET_response', 'subsequent_treatment',
            'subsequent_treatment_response', 'PFS_DFS', 'OS',
            'chemotherapy_exposed', 'targeted_rx_exposed', 'ICB_exposed', 'ICB_response', 'ET_exposed',
            'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [223]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,category
0,BRCA1_pre_neoplastic_0023_AAACCTGAGAGACTAT-1,BRCA1_pre_neoplastic_0023,,753,,10X,5932,BRCA1_pre_neoplastic_0023,Normal,F,42.0,,P,,,Breast
1,BRCA1_pre_neoplastic_0023_AAACCTGAGGGCTCTC-1,BRCA1_pre_neoplastic_0023,,464,,10X,5932,BRCA1_pre_neoplastic_0023,Normal,F,42.0,,P,,,Breast
2,BRCA1_pre_neoplastic_0023_AAACCTGCAATAACGA-1,BRCA1_pre_neoplastic_0023,,782,,10X,5932,BRCA1_pre_neoplastic_0023,Normal,F,42.0,,P,,,Breast
3,BRCA1_pre_neoplastic_0023_AAACCTGCAGACGTAG-1,BRCA1_pre_neoplastic_0023,,595,,10X,5932,BRCA1_pre_neoplastic_0023,Normal,F,42.0,,P,,,Breast
4,BRCA1_pre_neoplastic_0023_AAACCTGCAGCTTCGG-1,BRCA1_pre_neoplastic_0023,,922,,10X,5932,BRCA1_pre_neoplastic_0023,Normal,F,42.0,,P,,,Breast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305219,Triple_negative_BRCA1_4031_TTTGTCAAGTGAACAT-1,Triple_negative_BRCA1_4031,Malignant,2303,G2/M,10X,5581,Triple_negative_BRCA1_4031,Breast Cancer,F,25.0,,P,breast,NAIVE,Breast
305220,Triple_negative_BRCA1_4031_TTTGTCACATCCTTGC-1,Triple_negative_BRCA1_4031,Malignant,2551,Not cycling,10X,5581,Triple_negative_BRCA1_4031,Breast Cancer,F,25.0,,P,breast,NAIVE,Breast
305221,Triple_negative_BRCA1_4031_TTTGTCAGTCCATCCT-1,Triple_negative_BRCA1_4031,,432,,10X,5581,Triple_negative_BRCA1_4031,Breast Cancer,F,25.0,,P,breast,NAIVE,Breast
305222,Triple_negative_BRCA1_4031_TTTGTCATCAACGGGA-1,Triple_negative_BRCA1_4031,Malignant,3758,G2/M,10X,5581,Triple_negative_BRCA1_4031,Breast Cancer,F,25.0,,P,breast,NAIVE,Breast


In [220]:
adata.obs['category'] = 'Breast'

In [224]:
adata.obs['study'] = 'Pal2021_Breast'

In [225]:
output_path = "/home/ubuntu/Downloads/Data_Breast/Data_Pal2021_Breast.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Breast/Data_Pal2021_Breast.h5ad


#### Data Merging

In [19]:
import scanpy as sc
import anndata
import os

In [None]:
# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Breast/Data_Azizi2018_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Bassez2021_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Chung2017_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Gao2021_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Griffiths2021_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Gulati2020_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Karaayvas2018_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Kim2018_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Pal2021_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Qian2020_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Savas2018_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Wu2021_Breast.h5ad"
]

gc.collect()
# Load datasets
adatas = [sc.read(file) for file in files]

gc.collect()
# Merge all AnnData objects
adata_merged = anndata.concat(adatas, join="outer", fill_value=0)

gc.collect()
# Fix non-string columns (e.g. 'sample') to avoid h5py write errors
for col in adata_merged.obs.columns:
    if adata_merged.obs[col].dtype == 'object':
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

gc.collect()


  concat_annot = pd.concat(


In [2]:
import scanpy as sc
import anndata
import gc
from functools import reduce

# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Breast/Data_Azizi2018_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Bassez2021_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Chung2017_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Gao2021_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Griffiths2021_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Gulati2020_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Karaayvas2018_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Kim2018_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Pal2021_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Qian2020_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Savas2018_Breast.h5ad",
    "/home/ubuntu/Downloads/Data_Breast/Data_Wu2021_Breast.h5ad"
]

# Load and clean datasets
adatas = []

for file in files:
    print(f"Reading {file}...")
    adata = sc.read(file)
    
    # Convert object columns to strings to avoid h5py write issues
    for col in adata.obs.columns:
        if adata.obs[col].dtype == 'object':
            adata.obs[col] = adata.obs[col].astype(str)
    
    adatas.append(adata)
    del adata
    gc.collect()

# Merge in pairs to avoid memory issues
def safe_merge_pairwise(adatas):
    while len(adatas) > 1:
        new_adatas = []
        for i in range(0, len(adatas), 2):
            if i + 1 < len(adatas):
                print(f"Merging dataset {i} and {i+1}...")
                merged = anndata.concat([adatas[i], adatas[i + 1]], join="outer", fill_value=0)
            else:
                merged = adatas[i]
            gc.collect()
            new_adatas.append(merged)
        adatas = new_adatas
    return adatas[0]

# Merge all datasets
print("Starting final merge...")
adata_merged = safe_merge_pairwise(adatas)
gc.collect()



Reading /home/ubuntu/Downloads/Data_Breast/Data_Azizi2018_Breast.h5ad...
Reading /home/ubuntu/Downloads/Data_Breast/Data_Bassez2021_Breast.h5ad...




Reading /home/ubuntu/Downloads/Data_Breast/Data_Chung2017_Breast.h5ad...
Reading /home/ubuntu/Downloads/Data_Breast/Data_Gao2021_Breast.h5ad...




Reading /home/ubuntu/Downloads/Data_Breast/Data_Griffiths2021_Breast.h5ad...
Reading /home/ubuntu/Downloads/Data_Breast/Data_Gulati2020_Breast.h5ad...




Reading /home/ubuntu/Downloads/Data_Breast/Data_Karaayvas2018_Breast.h5ad...
Reading /home/ubuntu/Downloads/Data_Breast/Data_Kim2018_Breast.h5ad...




Reading /home/ubuntu/Downloads/Data_Breast/Data_Pal2021_Breast.h5ad...




Reading /home/ubuntu/Downloads/Data_Breast/Data_Qian2020_Breast.h5ad...




Reading /home/ubuntu/Downloads/Data_Breast/Data_Savas2018_Breast.h5ad...
Reading /home/ubuntu/Downloads/Data_Breast/Data_Wu2021_Breast.h5ad...




Starting final merge...
Merging dataset 0 and 1...


  utils.warn_names_duplicates("obs")
  concat_annot = pd.concat(
  concat_annot = pd.concat(
  concat_annot = pd.concat(


Merging dataset 2 and 3...
Merging dataset 4 and 5...
Merging dataset 6 and 7...


  concat_annot = pd.concat(
  utils.warn_names_duplicates("obs")
  concat_annot = pd.concat(
  concat_annot = pd.concat(


Merging dataset 8 and 9...


  utils.warn_names_duplicates("obs")


Merging dataset 10 and 11...


  utils.warn_names_duplicates("obs")


Merging dataset 0 and 1...


  utils.warn_names_duplicates("obs")


Merging dataset 2 and 3...


  utils.warn_names_duplicates("obs")


Merging dataset 4 and 5...


  utils.warn_names_duplicates("obs")
  concat_annot = pd.concat(


Merging dataset 0 and 1...


  utils.warn_names_duplicates("obs")
  concat_annot = pd.concat(


Merging dataset 0 and 1...


  utils.warn_names_duplicates("obs")


OSError: Unable to synchronously create file (unable to truncate a file which is already open)

In [3]:
adata_merged

AnnData object with n_obs × n_vars = 877587 × 75798
    obs: 'sample', 'patient', 'source', 'cell_type', 'cell_subtype', 'complexity', 'technology', 'n_cells', 'cancer_type', 'age', 'cell_name', 'disease_extent', 'study', 'category', 'cell_cycle_phase', 'sex', 'sample_primary_met', 'treated_naive', 'site', 'cancer_folder', 'cancer_subtype'

In [16]:
adata_merged.obs

Unnamed: 0,sample,patient,source,cell_type,cell_subtype,complexity,technology,n_cells,cancer_type,age,cell_name,disease_extent,study,category,cell_cycle_phase,sex,sample_primary_met,treated_naive,site
s1_AAACCTGAGCAGACTG-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD8 T cell,1684,10X,6710,Breast Cancer,65,s1_AAACCTGAGCAGACTG-1,non metastatic,Azizi2018_Breast,Breast,,,,,
s1_AAACCTGAGGTCGGAT-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD8 T cell,1653,10X,6710,Breast Cancer,65,s1_AAACCTGAGGTCGGAT-1,non metastatic,Azizi2018_Breast,Breast,,,,,
s1_AAACCTGAGTGTACCT-1,BC09_TUMOR1,BC09,breast tumor,T_cell,T-reg,1326,10X,6710,Breast Cancer,65,s1_AAACCTGAGTGTACCT-1,non metastatic,Azizi2018_Breast,Breast,,,,,
s1_AAACCTGAGTGTACTC-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD8 T cell,1463,10X,6710,Breast Cancer,65,s1_AAACCTGAGTGTACTC-1,non metastatic,Azizi2018_Breast,Breast,,,,,
s1_AAACCTGAGTTAAGTG-1,BC09_TUMOR1,BC09,breast tumor,T_cell,CD4 T cell,1288,10X,6710,Breast Cancer,65,s1_AAACCTGAGTTAAGTG-1,non metastatic,Azizi2018_Breast,Breast,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100059,CID4398,CID4398,,Dendritic,DCs,1251,10x,4451,IDC,52.0,CID4398_TCAGGTAGTACTCAAC,,Wu2021_Breast,,Not cycling,Female,,Treated,breast
100060,CID4398,CID4398,,Dendritic,DCs,719,10x,4451,IDC,52.0,CID4398_TCTATTGTCGCCATAA,,Wu2021_Breast,,,Female,,Treated,breast
100061,CID4398,CID4398,,Dendritic,DCs,887,10x,4451,IDC,52.0,CID4398_TCTTTCCCAGTAAGCG,,Wu2021_Breast,,,Female,,Treated,breast
100062,CID4398,CID4398,,Dendritic,DCs,870,10x,4451,IDC,52.0,CID4398_TGCCCATGTTACGGAG,,Wu2021_Breast,,,Female,,Treated,breast


In [6]:
del adata_merged.obs['cancer_folder']

In [7]:
del adata_merged.obs['cancer_subtype']

In [13]:
# Ensure all obs columns are string-safe
for col in adata_merged.obs.columns:
    try:
        # Convert to string, including categorical or object types
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)
    except Exception as e:
        print(f"Warning: Failed to convert column {col} to string. Error: {e}")
        adata_merged.obs.drop(columns=[col], inplace=True)

In [15]:
# Optional: Save to disk
adata_merged.write("/home/ubuntu/Downloads/Data_Breast/Breast_Combined.h5ad")
print("Merged dataset saved.")

Merged dataset saved.


In [18]:
adata_merged.obs['study'].value_counts()

study
Pal2021_Breast          305224
Bassez2021_Breast       226635
Griffiths2021_Breast    111372
Wu2021_Breast           100064
Azizi2018_Breast         75029
Gao2021_Breast           29992
Qian2020_Breast          16537
Savas2018_Breast          6311
Kim2018_Breast            2472
Gulati2020_Breast         1902
Karaayvas2018_Breast      1534
Chung2017_Breast           515
Name: count, dtype: int64