In [1]:
import os
import pandas as pd
import scanpy as sc
import anndata
import gc
import scipy.io
from scipy.sparse import vstack

## Ovarian

#### 1. Geistlinger2020_Ovarian

In [2]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Geistlinger2020_Ovarian"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect()

40

In [8]:
adata

AnnData object with n_obs × n_vars = 41031 × 15328
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'cancer_type', 'technology', 'n_cells'

In [7]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'hpca.celltype', 'encode.celltype', 'tumor_stage_x', 'subtype',
            'tumor_grade_x', 'ct_response_x', 'patient_y', 'tumor_stage_y', 'tumor_grade_y', 
            'ct_response_y', 'histology'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [6]:
adata.obs['subtype'].value_counts()

subtype
DIF    30619
MES     4590
IMR     4078
PRO     1744
Name: count, dtype: int64

In [36]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,cancer_type,technology,n_cells,sex,age,disease_extent,sample_primary_met,site,treated_naive,source,study,category
0,AAACCTGAGCTGCCCA-1_1,T59,T59,Macrophage,Macrophage,799,,Ovarian Cancer,10x,12659,,,,,Ovarian,,,Geistlinger2020_Ovarian,Ovarian
1,AAACCTGAGTCATCCA-1_2,T59,T59,Macrophage,Macrophage,1022,Not cycling,Ovarian Cancer,10x,12659,,,,,Ovarian,,,Geistlinger2020_Ovarian,Ovarian
2,AAACCTGCAAGCCCAC-1_3,T59,T59,Macrophage,Macrophage,1036,Not cycling,Ovarian Cancer,10x,12659,,,,,Ovarian,,,Geistlinger2020_Ovarian,Ovarian
3,AAACCTGCAAGCGCTC-1_4,T59,T59,Malignant,Malignant,2571,G1/S,Ovarian Cancer,10x,12659,,,,,Ovarian,,,Geistlinger2020_Ovarian,Ovarian
4,AAACCTGCACGTAAGG-1_5,T59,T59,Malignant,Malignant,3402,Intermediate,Ovarian Cancer,10x,12659,,,,,Ovarian,,,Geistlinger2020_Ovarian,Ovarian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41026,TTTGTCACACAGCCCA-1_41027,T90,T90,T_cell,T_cell,908,,Ovarian Cancer,10x,3630,,,,,Ovarian,,,Geistlinger2020_Ovarian,Ovarian
41027,TTTGTCACATTCTTAC-1_41028,T90,T90,Malignant,Malignant,3321,G1/S,Ovarian Cancer,10x,3630,,,,,Ovarian,,,Geistlinger2020_Ovarian,Ovarian
41028,TTTGTCAGTAGCTGCC-1_41029,T90,T90,Fibroblast,Fibroblast,2583,Not cycling,Ovarian Cancer,10x,3630,,,,,Ovarian,,,Geistlinger2020_Ovarian,Ovarian
41029,TTTGTCAGTTGAGGTG-1_41030,T90,T90,Fibroblast,Fibroblast,3401,Not cycling,Ovarian Cancer,10x,3630,,,,,Ovarian,,,Geistlinger2020_Ovarian,Ovarian


In [34]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [10]:
adata.obs['sex'] = 'NaN'

In [11]:
adata.obs['age'] = 'NaN'

In [12]:
adata.obs['disease_extent'] = 'NaN'

In [13]:
adata.obs['sample_primary_met'] = 'NaN'

In [14]:
adata.obs['site'] = 'Ovarian'

In [15]:
adata.obs['treated_naive'] = 'NaN'

In [16]:
adata.obs['source'] = 'NaN'

In [17]:
adata.obs['study'] = 'Geistlinger2020_Ovarian'

In [18]:
adata.obs['category'] = 'Ovarian'

In [35]:
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Geistlinger2020_Ovarian.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Ovarian/Data_Geistlinger2020_Ovarian.h5ad


#### 2.Izar2020_Ovarian

In [6]:

# Set base path
base_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Izar2020_Ovarian"

# === Load 10X Data ===
path_10x = os.path.join(base_path, "10X")
adata_10x = sc.read_mtx(os.path.join(path_10x, "Exp_data_TPM.mtx")).T  # Transpose to cells x genes
genes_10x = pd.read_csv(os.path.join(path_10x, "Genes.txt"), header=None)[0].tolist()
cells_10x = pd.read_csv(os.path.join(path_10x, "Cells.csv"), index_col=0)

adata_10x.var_names = genes_10x
adata_10x.obs = cells_10x
adata_10x.obs['technology'] = '10X'

# === Load SmartSeq2 Data ===
path_ss2 = os.path.join(base_path, "SmartSeq2")
adata_ss2 = sc.read_mtx(os.path.join(path_ss2, "Exp_data_TPM.mtx")).T
genes_ss2 = pd.read_csv(os.path.join(path_ss2, "Genes.txt"), header=None)[0].tolist()
cells_ss2 = pd.read_csv(os.path.join(path_ss2, "Cells.csv"), index_col=0)

adata_ss2.var_names = genes_ss2
adata_ss2.obs = cells_ss2
adata_ss2.obs['technology'] = 'SmartSeq2'

# === Align by common genes ===
common_genes = adata_10x.var_names.intersection(adata_ss2.var_names)
adata_10x = adata_10x[:, common_genes].copy()
adata_ss2 = adata_ss2[:, common_genes].copy()

# === Concatenate ===
adata_combined = adata_10x.concatenate(
    adata_ss2,
    batch_key='batch',
    batch_categories=['10X', 'SmartSeq2'],
    index_unique=None
)

# === Check uniqueness ===
assert adata_combined.obs_names.is_unique, "Cell names are not unique after concatenation"

# === Merge sample metadata ===
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv")).drop_duplicates(subset=['sample'])

# Preserve and reset index
adata_combined.obs = adata_combined.obs.reset_index()
original_index = adata_combined.obs.columns[0]

# Merge with sample metadata
adata_combined.obs = adata_combined.obs.merge(samples_df, how='left', on='sample')
assert adata_combined.obs.shape[0] == adata_combined.shape[0], "Row count mismatch after metadata merge"

# Restore index
adata_combined.obs = adata_combined.obs.set_index(original_index)
adata_combined.obs.index.name = None


  adata_combined = adata_10x.concatenate(


In [8]:
adata = adata_combined

In [24]:
adata

AnnData object with n_obs × n_vars = 10788 × 10869
    obs: 'sample', 'patient', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive', 'source', 'cell_subtype', 'study', 'category'

In [13]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response', 
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 'cluster_old', 'cluster_new', 'batch', 'technology_y',
            'patient_y', 
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [26]:
adata.obs

Unnamed: 0,sample,patient,cell_type,complexity,cell_cycle_phase,technology,n_cells,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,source,cell_subtype,study,category,cell_name
10x_3288_t1_AAACATACCTTCCG-1,3288.1,5.0,Macrophage,839,,10X,4929,Ovarian Cancer,False,,metastatic,met,ascites,treated,,,Izar2020_Ovarian,Ovarian,10x_3288_t1_AAACATACCTTCCG-1
10x_3288_t1_AAACATACTCCTAT-1,3288.1,5.0,Macrophage,1196,Not cycling,10X,4929,Ovarian Cancer,False,,metastatic,met,ascites,treated,,,Izar2020_Ovarian,Ovarian,10x_3288_t1_AAACATACTCCTAT-1
10x_3288_t1_AAACATTGAACTGC-1,3288.1,5.0,Macrophage,1343,Not cycling,10X,4929,Ovarian Cancer,False,,metastatic,met,ascites,treated,,,Izar2020_Ovarian,Ovarian,10x_3288_t1_AAACATTGAACTGC-1
10x_3288_t1_AAACATTGCTGACA-1,3288.1,5.0,Fibroblast,2358,Not cycling,10X,4929,Ovarian Cancer,False,,metastatic,met,ascites,treated,,,Izar2020_Ovarian,Ovarian,10x_3288_t1_AAACATTGCTGACA-1
10x_3288_t1_AAACCGTGACAGTC-1,3288.1,5.0,Fibroblast,2022,Not cycling,10X,4929,Ovarian Cancer,False,,metastatic,met,ascites,treated,,,Izar2020_Ovarian,Ovarian,10x_3288_t1_AAACCGTGACAGTC-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SS2_1293,5.0,,Malignant,5854,Not cycling,SmartSeq2,155,Ovarian Cancer,False,,metastatic,met,ascites,naive,,,Izar2020_Ovarian,Ovarian,SS2_1293
SS2_1294,5.0,,Malignant,4373,G1/S,SmartSeq2,155,Ovarian Cancer,False,,metastatic,met,ascites,naive,,,Izar2020_Ovarian,Ovarian,SS2_1294
SS2_1295,5.0,,Malignant,6606,G1/S,SmartSeq2,155,Ovarian Cancer,False,,metastatic,met,ascites,naive,,,Izar2020_Ovarian,Ovarian,SS2_1295
SS2_1296,5.0,,Malignant,7352,Not cycling,SmartSeq2,155,Ovarian Cancer,False,,metastatic,met,ascites,naive,,,Izar2020_Ovarian,Ovarian,SS2_1296


In [25]:
adata.obs['cell_name'] = adata.obs_names

In [16]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [17]:
adata.obs = adata.obs.rename(columns={"technology_x": "technology"})

In [20]:
adata.obs['cell_subtype'] = 'NaN'

In [19]:
adata.obs['source'] = 'NaN'

In [21]:
adata.obs['study'] = 'Izar2020_Ovarian'

In [22]:
adata.obs['category'] = 'Ovarian'

In [27]:
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Izar2020_Ovarian.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Ovarian/Data_Izar2020_Ovarian.h5ad


#### 3.Nath2021_Ovarian

In [28]:

# Set base path
base_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Nath2021_Ovarian"

# === Load 10X Data ===
path_10x = os.path.join(base_path, "10X")
adata_10x = sc.read_mtx(os.path.join(path_10x, "Exp_data_UMIcounts.mtx")).T  # Transpose to cells x genes
genes_10x = pd.read_csv(os.path.join(path_10x, "Genes.txt"), header=None)[0].tolist()
cells_10x = pd.read_csv(os.path.join(path_10x, "Cells.csv"), index_col=0)

adata_10x.var_names = genes_10x
adata_10x.obs = cells_10x
adata_10x.obs['technology'] = '10X'

# === Load iCell8 Data ===
path_ss2 = os.path.join(base_path, "iCell8")
adata_ss2 = sc.read_mtx(os.path.join(path_ss2, "Exp_data_UMIcounts.mtx")).T
genes_ss2 = pd.read_csv(os.path.join(path_ss2, "Genes.txt"), header=None)[0].tolist()
cells_ss2 = pd.read_csv(os.path.join(path_ss2, "Cells.csv"), index_col=0)

adata_ss2.var_names = genes_ss2
adata_ss2.obs = cells_ss2
adata_ss2.obs['technology'] = 'iCell8'

# === Align by common genes ===
common_genes = adata_10x.var_names.intersection(adata_ss2.var_names)
adata_10x = adata_10x[:, common_genes].copy()
adata_ss2 = adata_ss2[:, common_genes].copy()

# === Concatenate ===
adata_combined = adata_10x.concatenate(
    adata_ss2,
    batch_key='batch',
    batch_categories=['10X', 'iCell8'],
    index_unique=None
)

# === Check uniqueness ===
assert adata_combined.obs_names.is_unique, "Cell names are not unique after concatenation"

# === Merge sample metadata ===
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv")).drop_duplicates(subset=['sample'])

# Preserve and reset index
adata_combined.obs = adata_combined.obs.reset_index()
original_index = adata_combined.obs.columns[0]

# Merge with sample metadata
adata_combined.obs = adata_combined.obs.merge(samples_df, how='left', on='sample')
assert adata_combined.obs.shape[0] == adata_combined.shape[0], "Row count mismatch after metadata merge"

# Restore index
adata_combined.obs = adata_combined.obs.set_index(original_index)
adata_combined.obs.index.name = None


  adata_combined = adata_10x.concatenate(


In [30]:
adata = adata_combined

In [33]:
adata

AnnData object with n_obs × n_vars = 41729 × 18392
    obs: 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'technology_x', 'batch', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [40]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'batch',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS',   
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [45]:
adata.obs

Unnamed: 0,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_name,source,category,study
P04_Time3_AAACCCAAGGCCTGAA,P04,T_cell,CD8+ T cell,1099,Not cycling,10X,5989,P04,Ovarian Cancer,Female,,metastatic,met,ascites,treated,P04_Time3_AAACCCAAGGCCTGAA,,Ovarian,Nath2021_Ovarian
P04_Time3_AAACGAAGTTCGTAAC,P04,T_cell,CD8+ T cell,1519,Not cycling,10X,5989,P04,Ovarian Cancer,Female,,metastatic,met,ascites,treated,P04_Time3_AAACGAAGTTCGTAAC,,Ovarian,Nath2021_Ovarian
P04_Time3_AAACGAATCTAGAGCT,P04,B_cell,B_cell,1931,Not cycling,10X,5989,P04,Ovarian Cancer,Female,,metastatic,met,ascites,treated,P04_Time3_AAACGAATCTAGAGCT,,Ovarian,Nath2021_Ovarian
P04_Time3_AAACGCTCAGGCTCTG,P04,T_cell,CD8+ T cell,1148,Not cycling,10X,5989,P04,Ovarian Cancer,Female,,metastatic,met,ascites,treated,P04_Time3_AAACGCTCAGGCTCTG,,Ovarian,Nath2021_Ovarian
P04_Time3_AAACGCTTCCGATAGT,P04,T_cell,CD8+ T cell,1732,Not cycling,10X,5989,P04,Ovarian Cancer,Female,,metastatic,met,ascites,treated,P04_Time3_AAACGCTTCCGATAGT,,Ovarian,Nath2021_Ovarian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P03_Time1_C55_R42,P03,Malignant,,1455,Not cycling,iCell8,51,P03,Ovarian Cancer,Female,,metastatic,met,ascites,treated,P03_Time1_C55_R42,,Ovarian,Nath2021_Ovarian
P03_Time1_C56_R37,P03,Malignant,,1657,Not cycling,iCell8,51,P03,Ovarian Cancer,Female,,metastatic,met,ascites,treated,P03_Time1_C56_R37,,Ovarian,Nath2021_Ovarian
P03_Time1_C57_R00,P03,Malignant,,1780,Not cycling,iCell8,51,P03,Ovarian Cancer,Female,,metastatic,met,ascites,treated,P03_Time1_C57_R00,,Ovarian,Nath2021_Ovarian
P03_Time1_C58_R06,P03,Malignant,,1416,Not cycling,iCell8,51,P03,Ovarian Cancer,Female,,metastatic,met,ascites,treated,P03_Time1_C58_R06,,Ovarian,Nath2021_Ovarian


In [35]:
adata.obs['cell_name'] = adata.obs_names

In [37]:
adata.obs = adata.obs.rename(columns={"technology_x": "technology"})

In [42]:
adata.obs['source'] = 'NaN'

In [43]:
adata.obs['category'] = 'Ovarian'

In [44]:
adata.obs['study'] = 'Nath2021_Ovarian'

In [46]:
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Nath2021_Ovarian.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Ovarian/Data_Nath2021_Ovarian.h5ad


#### 4. Olalekan2021_Ovarian

In [47]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Olalekan2021_Ovarian"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [51]:
adata

AnnData object with n_obs × n_vars = 9885 × 16041
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [49]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS',   
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [59]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,complexity,cell_cycle_phase,technology,n_cells,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,source,cell_subtype,category,study
0,omentum2834__AAAAATATTACC,omentum2834,PT-6,Malignant,1050,Not cycling,Drop-seq,1909,Ovarian Cancer,False,62,metastatic,met,omentum,treated,,,Ovarian,Olalekan2021_Ovarian
1,omentum2834__AAAAATTCGAAC,omentum2834,PT-6,Fibroblast,1027,Not cycling,Drop-seq,1909,Ovarian Cancer,False,62,metastatic,met,omentum,treated,,,Ovarian,Olalekan2021_Ovarian
2,omentum2834__AAAACGGAATGT,omentum2834,PT-6,Fibroblast,1547,Not cycling,Drop-seq,1909,Ovarian Cancer,False,62,metastatic,met,omentum,treated,,,Ovarian,Olalekan2021_Ovarian
3,omentum2834__AAAACTCCGCAG,omentum2834,PT-6,ESC,1272,Not cycling,Drop-seq,1909,Ovarian Cancer,False,62,metastatic,met,omentum,treated,,,Ovarian,Olalekan2021_Ovarian
4,omentum2834__AAAATGACTGGA,omentum2834,PT-6,Fibroblast,1169,Not cycling,Drop-seq,1909,Ovarian Cancer,False,62,metastatic,met,omentum,treated,,,Ovarian,Olalekan2021_Ovarian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9880,omentum6885__TTTTATAAGTGA,omentum6885,PT-3,Fibroblast,799,,Drop-seq,1071,Carcinosarcoma,False,66,metastatic,met,omentum,treated,,,Ovarian,Olalekan2021_Ovarian
9881,omentum6885__TTTTCAGCGCTC,omentum6885,PT-3,Fibroblast,1167,Not cycling,Drop-seq,1071,Carcinosarcoma,False,66,metastatic,met,omentum,treated,,,Ovarian,Olalekan2021_Ovarian
9882,omentum6885__TTTTGAACGGAC,omentum6885,PT-3,Fibroblast,1855,Not cycling,Drop-seq,1071,Carcinosarcoma,False,66,metastatic,met,omentum,treated,,,Ovarian,Olalekan2021_Ovarian
9883,omentum6885__TTTTGTCGACCG,omentum6885,PT-3,Malignant,2347,Not cycling,Drop-seq,1071,Carcinosarcoma,False,66,metastatic,met,omentum,treated,,,Ovarian,Olalekan2021_Ovarian


In [53]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [55]:
adata.obs['source'] = 'NaN'

In [56]:
adata.obs['cell_subtype'] = 'NaN'

In [57]:
adata.obs['category'] = 'Ovarian'

In [58]:
adata.obs['study'] = 'Olalekan2021_Ovarian'

In [60]:
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Olalekan2021_Ovarian.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Ovarian/Data_Olalekan2021_Ovarian.h5ad


#### 5.Olbrecht2021_Ovarian

In [61]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Olbrecht2021_Ovarian"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [67]:
adata

AnnData object with n_obs × n_vars = 15528 × 33694
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source', 'sample_site', 'patient_y', 'cancer_type', 'cancer_subtype', 'n_cells', 'technology', 'sex.', 'age__at_sampling', 'primary_met', 'site', 'Necrosis', 'treated_naive'

In [65]:
adata.obs['T_sampling'].value_counts()

Series([], Name: count, dtype: int64)

In [75]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'extent_sampling.',
            'diagnosis_recurrence', 'AJCC_sampling', 'T_sampling', 'N_sampling', 'M_sampling',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'size', 
            'histology', 'hist_subtype', 'subtype_details', 'mutation_status', 'Grade',
            'recent_rx_modality', 'recent_rx_regimen', 'recent_rx_response', 'time_on_recent_rx',
            'on_off_rx', 'prior_chemotherapy', 'chemotherapy_regimen', 'chemotherapy_response',
            'prior_targeted_rx', 'targeted_rx_regimen', 'targeted_rx_response', 'prior_ICI',
            'ICI_regimen', 'ICI_response', 'prior_chemo_ICI', 'chemo_ICI_regimen', 'chemo_ICI_response', 'prior_ET',
            'ET_Regimen', 'ET_response', 'subsqnt_rx', 'subsqnt_rx_modality', 'subsqnt_rx_regimen',
            'subsqnt_rx_response', 'PFS_DFS', 'OS', 'patient_y', 'Necrosis', 'site'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [45]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,source,site,cancer_type,cancer_subtype,n_cells,technology,sex,age,sample_primary_met,treated_naive,study,category
0,AAACCTGTCTCTAAGG_SOL1303,P1_omentum_tumor,P1,Fibroblast,Fibroblast,3892,Not cycling,tumor,omentum,Ovarian Cancer,HGSTOC,1403,10X,F,70-79,,Naive,Olbrecht2021_Ovarian,Ovarian
1,AAACGGGCAGCTCCGA_SOL1303,P1_omentum_tumor,P1,Endothelial,Endothelial,5666,Not cycling,tumor,omentum,Ovarian Cancer,HGSTOC,1403,10X,F,70-79,,Naive,Olbrecht2021_Ovarian,Ovarian
2,AAACGGGGTATAAACG_SOL1303,P1_omentum_tumor,P1,Fibroblast,Fibroblast,2726,Not cycling,tumor,omentum,Ovarian Cancer,HGSTOC,1403,10X,F,70-79,,Naive,Olbrecht2021_Ovarian,Ovarian
3,AAACGGGTCTTGCCGT_SOL1303,P1_omentum_tumor,P1,Fibroblast,Fibroblast,2604,Not cycling,tumor,omentum,Ovarian Cancer,HGSTOC,1403,10X,F,70-79,,Naive,Olbrecht2021_Ovarian,Ovarian
4,AAAGCAAGTCGCGGTT_SOL1303,P1_omentum_tumor,P1,Fibroblast,Fibroblast,3432,Not cycling,tumor,omentum,Ovarian Cancer,HGSTOC,1403,10X,F,70-79,,Naive,Olbrecht2021_Ovarian,Ovarian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15523,TTTGGTTAGAACAATC_SOL016,P7_peritoneum_tumor,P7,Malignant,Malignant,1314,Not cycling,tumor,peritoneum,Ovarian Cancer,HGSTOC,1472,10X,F,60-69,,Naive,Olbrecht2021_Ovarian,Ovarian
15524,TTTGGTTAGGAACTGC_SOL016,P7_peritoneum_tumor,P7,Malignant,Malignant,1124,Not cycling,tumor,peritoneum,Ovarian Cancer,HGSTOC,1472,10X,F,60-69,,Naive,Olbrecht2021_Ovarian,Ovarian
15525,TTTGGTTAGGCTAGCA_SOL016,P7_peritoneum_tumor,P7,Malignant,Malignant,2473,Not cycling,tumor,peritoneum,Ovarian Cancer,HGSTOC,1472,10X,F,60-69,,Naive,Olbrecht2021_Ovarian,Ovarian
15526,TTTGGTTCATCACGAT_SOL016,P7_peritoneum_tumor,P7,Malignant,Malignant,1266,Not cycling,tumor,peritoneum,Ovarian Cancer,HGSTOC,1472,10X,F,60-69,,Naive,Olbrecht2021_Ovarian,Ovarian


In [71]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [44]:
adata.obs = adata.obs.rename(columns={"sex.": "sex"})

In [73]:
adata.obs = adata.obs.rename(columns={"age__at_sampling": "age"})

In [79]:
adata.obs = adata.obs.rename(columns={"primary_met": "sample_primary_met"})

In [77]:
adata.obs = adata.obs.rename(columns={"sample_site": "site"})

In [80]:
adata.obs['study'] = 'Olbrecht2021_Ovarian'

In [81]:
adata.obs['category'] = 'Ovarian'

In [46]:
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Olbrecht2021_Ovarian.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Ovarian/Data_Olbrecht2021_Ovarian.h5ad


#### 6.Qian2020_Ovarian

In [85]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Qian2020_Ovarian"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [88]:
adata

AnnData object with n_obs × n_vars = 16951 × 22276
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'source', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [87]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [89]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,source,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive
0,BT1303_AAACCTGAGTGCAAGC,11,Fibroblast,1405,Not cycling,Omentum,10x,7063,OvC_1,Ovarian Cancer,Female,,,primary,,naive
1,BT1303_AAACCTGAGTGGTAGC,11,Fibroblast,1035,Not cycling,Omentum,10x,7063,OvC_1,Ovarian Cancer,Female,,,primary,,naive
2,BT1303_AAACCTGTCTCTAAGG,11,Fibroblast,3832,Not cycling,Omentum,10x,7063,OvC_1,Ovarian Cancer,Female,,,primary,,naive
3,BT1303_AAACGGGCAGCTCCGA,11,Endothelial,5557,Not cycling,Omentum,10x,7063,OvC_1,Ovarian Cancer,Female,,,primary,,naive
4,BT1303_AAACGGGCAGGCTGAA,11,Macrophage,1504,Not cycling,Omentum,10x,7063,OvC_1,Ovarian Cancer,Female,,,primary,,naive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16946,scrSOL004_TTTGTCAGTTCGCGAC,14,Malignant,3041,G1/S,Peritoneum,10x,4232,OvC_4,Ovarian Cancer,Female,,,primary,,naive
16947,scrSOL004_TTTGTCATCACGAAGG,14,Malignant,2519,G1/S,Peritoneum,10x,4232,OvC_4,Ovarian Cancer,Female,,,primary,,naive
16948,scrSOL004_TTTGTCATCCAACCAA,14,Malignant,2441,Not cycling,Peritoneum,10x,4232,OvC_4,Ovarian Cancer,Female,,,primary,,naive
16949,scrSOL004_TTTGTCATCCGAACGC,14,Endothelial,3087,Not cycling,Peritoneum,10x,4232,OvC_4,Ovarian Cancer,Female,,,primary,,naive


In [90]:
adata.obs['cell_subtype'] = 'NaN'

In [91]:
adata.obs['category'] = 'Ovarian'

In [92]:
adata.obs['study'] = 'Qian2020_Ovarian'

In [93]:
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Qian2020_Ovarian.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Ovarian/Data_Qian2020_Ovarian.h5ad


#### 7.Regner2021_Ovarian

In [94]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Regner2021_Ovarian"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [95]:
adata

AnnData object with n_obs × n_vars = 65144 × 24516
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'disease', 'histology_x', 'source_x', 'stage_x', 'cancer_type', 'technology', 'n_cells', 'histology_y', 'source_y', 'stage_y', 'grade', 'age_at_diagnosis', 'race', 'bmi'

In [99]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'histology_x','stage_x','histology_y', 'source_y', 'stage_y', 'grade',
            'race', 'bmi','disease'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [108]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,source_x,cancer_type,technology,n_cells,age,patient,sex,disease_extent,sample_primary_met,treated_naive,site
0,AAACCCACATTGACAC-1_1,1,Malignant,Unciliated epithelia 1,4226,Not cycling,Endometrium,Endometrial Cancer,10x,4202,70,1,,,,,
1,AAACCCAGTGACTAAA-1_1,1,B_cell,B cell,1519,Not cycling,Endometrium,Endometrial Cancer,10x,4202,70,1,,,,,
2,AAACCCATCTCCGCAT-1_1,1,Endothelial,Endothelia,1731,Not cycling,Endometrium,Endometrial Cancer,10x,4202,70,1,,,,,
3,AAACGAAAGGTTCAGG-1_1,1,Endothelial,Endothelia,2311,Not cycling,Endometrium,Endometrial Cancer,10x,4202,70,1,,,,,
4,AAACGAAGTTAAGTCC-1_1,1,Myocyte,Smooth muscle cells,1036,Not cycling,Endometrium,Endometrial Cancer,10x,4202,70,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65139,TTTGTTGGTCCTTGTC-1_11,9,Malignant,Epithelial cell,4028,Not cycling,Ovary,Ovarian Cancer,10x,5928,59,9,,,,,
65140,TTTGTTGGTCTTTCAT-1_11,9,Fibroblast,Fibroblast,3066,Not cycling,Ovary,Ovarian Cancer,10x,5928,59,9,,,,,
65141,TTTGTTGGTTCCGTTC-1_11,9,Malignant,Epithelial cell,1110,Not cycling,Ovary,Ovarian Cancer,10x,5928,59,9,,,,,
65142,TTTGTTGTCATGAGGG-1_11,9,Macrophage,Macrophage,1240,Not cycling,Ovary,Ovarian Cancer,10x,5928,59,9,,,,,


In [101]:
adata.obs['patient'] = adata.obs['sample'].copy()

In [102]:
adata.obs['sex'] = 'NaN'

In [103]:
adata.obs = adata.obs.rename(columns={"age_at_diagnosis": "age"})

In [109]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [104]:
adata.obs['disease_extent'] = 'NaN'

In [105]:
adata.obs['sample_primary_met'] = 'NaN'

In [106]:
adata.obs['treated_naive'] = 'NaN'

In [107]:
adata.obs['site'] = 'NaN'

In [110]:
adata.obs['category'] = 'Ovarian'

In [111]:
adata.obs['study'] = 'Regner2021_Ovarian'

In [112]:
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Regner2021_Ovarian.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Ovarian/Data_Regner2021_Ovarian.h5ad


#### 8.Shih2018_Ovarian

In [113]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Shih2018_Ovarian"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [118]:
adata

AnnData object with n_obs × n_vars = 3066 × 26364
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'source_x', 'cancer_type_x', 'complexity', 'site', 'technology', 'n_cells', 'sex', 'age', 'neoadjuvant_treatment'

In [117]:
for col in ['patient_y', 'source_y','cancer_type_y', 'histology',
             'race', 'stage', 'previous_hx_breast_cancer', 
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [119]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [120]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [121]:
adata.obs = adata.obs.rename(columns={"cancer_type_x": "cancer_type"})

In [124]:
adata.obs['treated_naive'] = adata.obs['neoadjuvant_treatment'].map({
    'no': 'naive',
    'yes': 'treated'
})


In [125]:
del adata.obs['neoadjuvant_treatment']

In [123]:
adata.obs['neoadjuvant_treatment'].value_counts()

neoadjuvant_treatment
no     3001
yes      65
Name: count, dtype: int64

In [129]:
adata.obs['disease_extent'] = adata.obs['source'].copy()

In [131]:
adata.obs['sample_primary_met'] = 'NaN'

In [138]:
adata.obs['cell_cycle_phase'] = 'NaN'

In [136]:
adata.obs['cell_subtype'] = 'NaN'

In [139]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,source,cancer_type,complexity,site,technology,n_cells,sex,age,treated_naive,disease_extent,sample_primary_met,study,category,cell_subtype,cell_cycle_phase
0,PN1-P_aacagctgaattagcacg,PN1-P,PN1,,primary,Peritoneal Cancer,286,ovary,,100,female,55,naive,primary,,Data_Shih2018_Ovarian,Ovarian,,
1,PN1-P_aagccactaggtaacagc,PN1-P,PN1,,primary,Peritoneal Cancer,61,ovary,,100,female,55,naive,primary,,Data_Shih2018_Ovarian,Ovarian,,
2,PN1-P_aagccatcgcctgagctt,PN1-P,PN1,,primary,Peritoneal Cancer,133,ovary,,100,female,55,naive,primary,,Data_Shih2018_Ovarian,Ovarian,,
3,PN1-P_aagtatcagacttccaag,PN1-P,PN1,,primary,Peritoneal Cancer,138,ovary,,100,female,55,naive,primary,,Data_Shih2018_Ovarian,Ovarian,,
4,PN1-P_acaaggtgagacagatgt,PN1-P,PN1,,primary,Peritoneal Cancer,393,ovary,,100,female,55,naive,primary,,Data_Shih2018_Ovarian,Ovarian,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3061,HG1-M_gtttcaccacgccaccac,HG1-M,HG1,,metastasis,Ovarian Cancer,197,omentum,,330,female,70,naive,metastasis,,Data_Shih2018_Ovarian,Ovarian,,
3062,HG1-M_ttaagagcgaattgagac,HG1-M,HG1,,metastasis,Ovarian Cancer,235,omentum,,330,female,70,naive,metastasis,,Data_Shih2018_Ovarian,Ovarian,,
3063,HG1-M_atccggatccggcggtcc,HG1-M,HG1,,metastasis,Ovarian Cancer,197,omentum,,330,female,70,naive,metastasis,,Data_Shih2018_Ovarian,Ovarian,,
3064,HG1-M_gtcctatccaagtgtgta,HG1-M,HG1,,metastasis,Ovarian Cancer,230,omentum,,330,female,70,naive,metastasis,,Data_Shih2018_Ovarian,Ovarian,,


In [134]:
adata.obs['category'] = 'Ovarian'

In [133]:
adata.obs['study'] = 'Data_Shih2018_Ovarian'

In [140]:
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Shih2018_Ovarian.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Ovarian/Data_Shih2018_Ovarian.h5ad


#### 9.Tang-Huau2018_Ovarian

In [154]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Tang-Huau2018_Ovarian"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [142]:
adata

AnnData object with n_obs × n_vars = 8404 × 20939
    obs: 'cell_name', 'sample', 'patient_x', 'source_x', 'cancer_type_x', 'sorting', 'cell_type', 'complexity', 'patient_y', 'source_y', 'cancer_type_y', 'n_cells', 'technology'

In [155]:
for col in ['sorting', 'patient_y', 'source_y', 'cancer_type_y', 
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [161]:
adata.obs

Unnamed: 0,cell_name,sample,patient,source,cancer_type,cell_type,complexity,n_cells,technology,category,study
0,AAACCTGCATCTCCCA-1,d1,d1,healthy blood,Normal,Monocyte,1664,425,10x,Ovarian,Tang-Huau2018_Ovarian
1,AAACGGGCAACTGCGC-1,d1,d1,healthy blood,Normal,Monocyte,478,425,10x,Ovarian,Tang-Huau2018_Ovarian
2,AAAGTAGCAATCACAC-1,d1,d1,healthy blood,Normal,Monocyte,1118,425,10x,Ovarian,Tang-Huau2018_Ovarian
3,AAATGCCCATGTCGAT-1,d1,d1,healthy blood,Normal,Monocyte,752,425,10x,Ovarian,Tang-Huau2018_Ovarian
4,AACACGTTCACATACG-1,d1,d1,healthy blood,Normal,Monocyte,1126,425,10x,Ovarian,Tang-Huau2018_Ovarian
...,...,...,...,...,...,...,...,...,...,...,...
8399,TTTGTCACACTGTTAG-5,donD,donD,healthy tonsil,Normal,Dendritic,6476,2739,10x,Ovarian,Tang-Huau2018_Ovarian
8400,TTTGTCAGTGTGTGCC-5,donD,donD,healthy tonsil,Normal,Dendritic,2559,2739,10x,Ovarian,Tang-Huau2018_Ovarian
8401,TTTGTCATCATTTGGG-5,donD,donD,healthy tonsil,Normal,Dendritic,2286,2739,10x,Ovarian,Tang-Huau2018_Ovarian
8402,TTTGTCATCCGCAGTG-5,donD,donD,healthy tonsil,Normal,Dendritic,2147,2739,10x,Ovarian,Tang-Huau2018_Ovarian


In [156]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [157]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [158]:
adata.obs = adata.obs.rename(columns={"cancer_type_x": "cancer_type"})

In [159]:
adata.obs['category'] = 'Ovarian'

In [160]:
adata.obs['study'] = 'Tang-Huau2018_Ovarian'

In [162]:
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Tang-Huau2018_Ovarian.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Ovarian/Data_Tang-Huau2018_Ovarian.h5ad


#### 10.Zhang2019_Ovarian

In [2]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Zhang2019_Ovarian"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [7]:
adata

AnnData object with n_obs × n_vars = 4848 × 24410
    obs: 'cell_name', 'sample', 'patient', 'cell_type', 'complexity', 'cell_cycle_phase', 'source', 'cancer_type', 'technology', 'n_cells'

In [4]:
for col in [ 'umap1', 'umap2', 'g1s_score', 'g2m_score',
             'mp_top_score', 'mp_top', 'mp_assignment', 'patient_y', 'source_y'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [5]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [6]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [8]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,complexity,cell_cycle_phase,source,cancer_type,technology,n_cells
0,AAACCTGAGAGACGAA_VOA11543L,VOA11543L,VOA11543,Fibroblast,4714,Not cycling,Left ovary,Ovarian Cancer,10x,2707
1,AAACCTGAGAGTCTGG_VOA11543L,VOA11543L,VOA11543,Fibroblast,2103,Not cycling,Left ovary,Ovarian Cancer,10x,2707
2,AAACCTGAGATCCCGC_VOA11543L,VOA11543L,VOA11543,Malignant,5107,Not cycling,Left ovary,Ovarian Cancer,10x,2707
3,AAACCTGCACAGGAGT_VOA11543L,VOA11543L,VOA11543,Malignant,4820,Not cycling,Left ovary,Ovarian Cancer,10x,2707
4,AAACCTGGTACTTGAC_VOA11543L,VOA11543L,VOA11543,T_cell,3452,,Left ovary,Ovarian Cancer,10x,2707
...,...,...,...,...,...,...,...,...,...,...
4843,TTTGGTTGTGTAACGG_VOA11543R,VOA11543R,VOA11543,Malignant,2946,Not cycling,Right ovary,Ovarian Cancer,10x,2141
4844,TTTGGTTGTTCATGGT_VOA11543R,VOA11543R,VOA11543,Malignant,4321,Not cycling,Right ovary,Ovarian Cancer,10x,2141
4845,TTTGGTTTCATAGCAC_VOA11543R,VOA11543R,VOA11543,Malignant,5292,Not cycling,Right ovary,Ovarian Cancer,10x,2141
4846,TTTGGTTTCTTGTCAT_VOA11543R,VOA11543R,VOA11543,Malignant,4157,Not cycling,Right ovary,Ovarian Cancer,10x,2141


In [9]:
adata.obs['site'] = adata.obs['source']

In [10]:
adata.obs['category'] = 'Ovarian'

In [11]:
adata.obs['study'] = 'Data_Zhang2019_Ovarian'

In [12]:
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Zhang2019_Ovarian.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Ovarian/Data_Zhang2019_Ovarian.h5ad


#### 11.Zhang2022_Ovarian

In [13]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Zhang2022_Ovarian"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [20]:
adata

AnnData object with n_obs × n_vars = 51786 × 32847
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'treatment_phase_x', 'anatomical_location_x', 'cancer_type', 'technology', 'n_cells'

In [19]:
for col in [ 'umap1', 'umap2', 'g1s_score', 'g2m_score',
             'mp_top_score', 'mp_top', 'mp_assignment', 'patient_y',
             'nCount_RNA', 'nFeature_RNA', 'percent.mt',
             'treatment_phase_y', 'anatomical_location_y', 'histology'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [18]:
adata.obs['treatment_phase_x'].value_counts()

treatment_phase_x
post-NACT          30025
treatment-naive    21761
Name: count, dtype: int64

In [22]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [23]:
adata.obs = adata.obs.rename(columns={"treatment_phase_x": "treated_naive"})

In [24]:
adata.obs = adata.obs.rename(columns={"anatomical_location_x": "site"})

In [25]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,treated_naive,site,cancer_type,technology,n_cells
0,AAACCTGCAGGTTTCA-EOC372_pPer,EOC372_primary_Peritoneum,EOC372,Malignant,EOC_C11,5700,Not cycling,treatment-naive,Peritoneum,Ovarian Cancer,10x,711
1,AAACCTGGTCCGAATT-EOC372_pPer,EOC372_primary_Peritoneum,EOC372,Malignant,EOC_C12,3196,Not cycling,treatment-naive,Peritoneum,Ovarian Cancer,10x,711
2,AAAGATGCATCTGGTA-EOC372_pPer,EOC372_primary_Peritoneum,EOC372,Malignant,EOC_C5,2148,Not cycling,treatment-naive,Peritoneum,Ovarian Cancer,10x,711
3,AAAGTAGTCGCTTAGA-EOC372_pPer,EOC372_primary_Peritoneum,EOC372,Malignant,EOC_C4,3438,Not cycling,treatment-naive,Peritoneum,Ovarian Cancer,10x,711
4,AAATGCCAGGTGCACA-EOC372_pPer,EOC372_primary_Peritoneum,EOC372,Malignant,EOC_C5,2839,G1/S,treatment-naive,Peritoneum,Ovarian Cancer,10x,711
...,...,...,...,...,...,...,...,...,...,...,...,...
51781,TTTGCGCCACATCCAA-EOC443_pOme,EOC443_primary_Omentum,EOC443,Plasma,Plasma-cells,1398,Not cycling,treatment-naive,Omentum,Ovarian Cancer,10x,2122
51782,TTTGCGCCACGTCAGC-EOC443_pOme,EOC443_primary_Omentum,EOC443,Macrophage,Macrophages,1283,Not cycling,treatment-naive,Omentum,Ovarian Cancer,10x,2122
51783,TTTGCGCCATTCACTT-EOC443_pOme,EOC443_primary_Omentum,EOC443,Plasma,Plasma-cells,1395,Not cycling,treatment-naive,Omentum,Ovarian Cancer,10x,2122
51784,TTTGTCACATTGGGCC-EOC443_pOme,EOC443_primary_Omentum,EOC443,T_cell,T-cells,699,,treatment-naive,Omentum,Ovarian Cancer,10x,2122


In [26]:
adata.obs['category'] = 'Ovarian'

In [27]:
adata.obs['study'] = 'Zhang2022_Ovarian'

In [28]:
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Data_Zhang2022_Ovarian.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Ovarian/Data_Zhang2022_Ovarian.h5ad


#### Data Merging

In [19]:
import scanpy as sc
import anndata
import os

In [47]:

# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Ovarian/Data_Geistlinger2020_Ovarian.h5ad",
    "/home/ubuntu/Downloads/Data_Ovarian/Data_Izar2020_Ovarian.h5ad",
    "/home/ubuntu/Downloads/Data_Ovarian/Data_Nath2021_Ovarian.h5ad",
    "/home/ubuntu/Downloads/Data_Ovarian/Data_Olalekan2021_Ovarian.h5ad",
    "/home/ubuntu/Downloads/Data_Ovarian/Data_Olbrecht2021_Ovarian.h5ad",
    "/home/ubuntu/Downloads/Data_Ovarian/Data_Qian2020_Ovarian.h5ad",
    "/home/ubuntu/Downloads/Data_Ovarian/Data_Regner2021_Ovarian.h5ad",
    "/home/ubuntu/Downloads/Data_Ovarian/Data_Shih2018_Ovarian.h5ad",
    "/home/ubuntu/Downloads/Data_Ovarian/Data_Tang-Huau2018_Ovarian.h5ad",
    "/home/ubuntu/Downloads/Data_Ovarian/Data_Zhang2019_Ovarian.h5ad",
    "/home/ubuntu/Downloads/Data_Ovarian/Data_Zhang2022_Ovarian.h5ad"
]

# Load datasets
adatas = [sc.read(file) for file in files]

gc.collect()
# Merge all AnnData objects
adata_merged = anndata.concat(adatas, join="outer", fill_value=0)

gc.collect()
# Fix non-string columns (e.g. 'sample') to avoid h5py write errors
for col in adata_merged.obs.columns:
    if adata_merged.obs[col].dtype == 'object':
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

gc.collect()


  utils.warn_names_duplicates("obs")


✅ Merged and saved to: /home/ubuntu/Downloads/Data_Ovarian/Ovarian_Combined.h5ad


In [53]:
adata_merged

AnnData object with n_obs × n_vars = 269160 × 51957
    obs: 'cell_name', 'sample', 'patient', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'cancer_type', 'technology', 'n_cells', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive', 'source', 'study', 'category'

In [49]:
del adata_merged.obs['cancer_subtype']

In [54]:
# Save merged dataset
output_path = "/home/ubuntu/Downloads/Data_Ovarian/Ovarian_Combined.h5ad"
adata_merged.write(output_path)

print(f"✅ Merged and saved to: {output_path}")

✅ Merged and saved to: /home/ubuntu/Downloads/Data_Ovarian/Ovarian_Combined.h5ad
