In [11]:
import pandas as pd
import scanpy as sc
from scipy.io import mmread
import os

### Data extraction

#### Sarcoma

In [12]:
# Define base path
base_path = "/home/ubuntu/Downloads/Data_Sarcoma/Data_Zhou2020_Sarcoma"

# Load files
X = mmread(os.path.join(base_path, "Exp_data_UMIcounts.mtx")).T.tocsr()

genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)[0].tolist()
cells_df = pd.read_csv(os.path.join(base_path, "Cells.csv"), index_col=0)
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Create AnnData
adata = sc.AnnData(X)
adata.var_names = genes
adata.obs = cells_df

# Merge sample-level metadata into obs using 'sample'
adata.obs = adata.obs.merge(samples_df, how="left", on="sample")


In [15]:
adata.write("Zhou2020_Sarcoma.h5ad")

... storing 'sample' as categorical
... storing 'cell_type' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'mp_top' as categorical
... storing 'mp_assignment' as categorical
... storing 'source' as categorical
... storing 'malignant' as categorical
... storing 'technology' as categorical
... storing 'patient' as categorical
... storing 'cancer_type' as categorical
... storing 'diagnosis_recurrence' as categorical
... storing 'disease_extent' as categorical
... storing 'sample_primary_met' as categorical
... storing 'site' as categorical
... storing 'histology' as categorical
... storing 'genetic_hormonal_features' as categorical
... storing 'KI67' as categorical
... storing 'treated_naive' as categorical
... storing 'chemotherapy_exposed' as categorical
... storing 'targeted_rx_exposed' as categorical
... storing 'targeted_rx_response' as categorical
... storing 'ICB_exposed' as categorical
... storing 'ICB_response' as categorical
... storing 'ET_exposed' as 

In [31]:
import pandas as pd
import scanpy as sc
from scipy.io import mmread
import os

base_path = "/home/ubuntu/Downloads/Data_Sarcoma/Data_Jerby-Arnon2021_Sarcoma"

# Load 10X data
path_10x = os.path.join(base_path, "10X")
X_10x = mmread(os.path.join(path_10x, "Exp_data_UMIcounts.mtx")).T.tocsr()
genes_10x = pd.read_csv(os.path.join(path_10x, "Genes.txt"), header=None)[0].tolist()
cells_10x = pd.read_csv(os.path.join(path_10x, "Cells.csv"), index_col=0)

adata_10x = sc.AnnData(X_10x)
adata_10x.var_names = genes_10x
adata_10x.obs = cells_10x
adata_10x.obs['technology'] = '10X'

# Load SmartSeq2 data
path_ss2 = os.path.join(base_path, "SmartSeq2")
X_ss2 = mmread(os.path.join(path_ss2, "Exp_data_TPM.mtx")).T.tocsr()
genes_ss2 = pd.read_csv(os.path.join(path_ss2, "Genes.txt"), header=None)[0].tolist()
cells_ss2 = pd.read_csv(os.path.join(path_ss2, "Cells.csv"), index_col=0)

adata_ss2 = sc.AnnData(X_ss2)
adata_ss2.var_names = genes_ss2
adata_ss2.obs = cells_ss2
adata_ss2.obs['technology'] = 'SmartSeq2'

# Align genes by intersection
common_genes = adata_10x.var_names.intersection(adata_ss2.var_names)

adata_10x = adata_10x[:, common_genes].copy()
adata_ss2 = adata_ss2[:, common_genes].copy()

# Concatenate without changing obs_names
adata_combined = adata_10x.concatenate(
    adata_ss2, 
    batch_key='batch', 
    batch_categories=['10X', 'SmartSeq2'], 
    index_unique=None
)

# Check obs_names are unique
assert adata_combined.obs_names.is_unique

# Merge sample-level metadata
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv"))
samples_df = samples_df.drop_duplicates(subset=['sample'])

# Reset index, merge, and restore index
adata_combined.obs = adata_combined.obs.reset_index()
index_col = adata_combined.obs.columns[0]
adata_combined.obs = adata_combined.obs.merge(samples_df, how='left', on='sample')

# Confirm no row number changes
assert adata_combined.obs.shape[0] == adata_combined.shape[0], "Row count mismatch!"

adata_combined.obs = adata_combined.obs.set_index(index_col)
adata_combined.obs.index.name = None


In [39]:
adata_combined.obs["technology"]

cell_name
SS12pt.10x.P1_AAACCTGTCACCTTAT_1          10x
SS12pt.10x.P1_AAACCTGTCAGTCAGT_1          10x
SS12pt.10x.P1_AAACCTGTCCAAAGTC_1          10x
SS12pt.10x.P1_AAACCTGTCCGTCATC_1          10x
SS12pt.10x.P1_AAACCTGTCTATGTGG_1          10x
                                      ...    
SS7posP2_H02                        SmartSeq2
SS7posP2_H03                        SmartSeq2
SS7posP2_H04                        SmartSeq2
SS7posP2_H10                        SmartSeq2
SS7posP2_H11                        SmartSeq2
Name: technology, Length: 16125, dtype: object

In [41]:
# Save the combined AnnData object to an h5ad file
adata_combined.write("Jerby-Arnon2021_Sarcoma.h5ad")

#### Hematologic

In [44]:
import os
import pandas as pd
import scanpy as sc
from scipy.io import mmread

# List of dataset folder names
dataset_folders = [
    "Data_Caron2020_Hematologic",
    "Data_Cohen2021_Hematologic",
    "Data_Galen2019_Hematologic",
    "Data_Gaydosik2019_Hematologic",
    "Data_Ledergor2018_Hematologic",
    "Data_Liu2021_Hematologic",
    "Data_Rendeiro2020_Hematologic",
    "Data_Riether2020_Hematologic",
    "Data_Roider2020_Hematologic",
    "Data_Steen2021_Hematologic",
    "Data_Zhang2019_Hematologic"
]

# Base directory containing all datasets
base_dir = "/home/ubuntu/Downloads/Data_Hematologic"

for folder in dataset_folders:
    try:
        print(f"\nProcessing {folder}...")
        base_path = os.path.join(base_dir, folder)

        # Load expression matrix and transpose it
        X = mmread(os.path.join(base_path, "Exp_data_UMIcounts.mtx")).T.tocsr()

        # Load gene names
        genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)[0].tolist()

        # Load cell metadata
        cells_df = pd.read_csv(os.path.join(base_path, "Cells.csv"), index_col=0)

        # Load sample metadata
        samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv"))

        # Create AnnData object
        adata = sc.AnnData(X)
        adata.var_names = genes
        adata.obs = cells_df

        # Merge sample-level metadata using the 'sample' column
        adata.obs = adata.obs.merge(samples_df, how="left", on="sample")

        # Save to .h5ad file
        output_path = os.path.join(base_path, f"{folder}.h5ad")
        adata.write(output_path)
        print(f"Saved: {output_path}")

    except Exception as e:
        print(f"❌ Error processing {folder}: {e}")



Processing Data_Caron2020_Hematologic...


... storing 'sample' as categorical
... storing 'cell_type' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'mp_top' as categorical
... storing 'mp_assignment' as categorical
... storing 'source' as categorical
... storing 'technology' as categorical
... storing 'patient' as categorical
... storing 'cancer_type' as categorical
... storing 'sample_type' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Caron2020_Hematologic/Data_Caron2020_Hematologic.h5ad

Processing Data_Cohen2021_Hematologic...


... storing 'sample' as categorical
... storing 'patient_x' as categorical
... storing 'cell_type' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'mp_top' as categorical
... storing 'mp_assignment' as categorical
... storing 'technology' as categorical
... storing 'patient_y' as categorical
... storing 'cancer_type' as categorical
... storing 'sample_primary_met' as categorical
... storing 'site' as categorical
... storing 'histology' as categorical
... storing 'treated_naive' as categorical
... storing 'chemotherapy_exposed' as categorical
... storing 'chemotherapy_response' as categorical
... storing 'targeted_rx_exposed' as categorical
... storing 'targeted_rx_response' as categorical
... storing 'ICB_exposed' as categorical
... storing 'ICB_response' as categorical
... storing 'ET_exposed' as categorical
... storing 'ET_response' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Cohen2021_Hematologic/Data_Cohen2021_Hematologic.h5ad

Processing Data_Galen2019_Hematologic...


... storing 'sample' as categorical
... storing 'cell_type' as categorical
... storing 'cell_subtype' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'mp_top' as categorical
... storing 'mp_assignment' as categorical
... storing 'source' as categorical
... storing 'technology' as categorical
... storing 'patient' as categorical
... storing 'cancer_type' as categorical
... storing 'sex' as categorical
... storing 'diagnosis_recurrence' as categorical
... storing 'site' as categorical
... storing 'genetic_hormonal_features' as categorical
... storing 'treated_naive' as categorical
... storing 'chemotherapy_exposed' as categorical
... storing 'chemotherapy_response' as categorical
... storing 'targeted_rx_exposed' as categorical
... storing 'targeted_rx_response' as categorical
... storing 'ICB_exposed' as categorical
... storing 'ICB_response' as categorical
... storing 'ET_exposed' as categorical
... storing 'ET_response' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Galen2019_Hematologic/Data_Galen2019_Hematologic.h5ad

Processing Data_Gaydosik2019_Hematologic...


... storing 'sample' as categorical
... storing 'patient_x' as categorical
... storing 'source_x' as categorical
... storing 'cell_type' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'mp_top' as categorical
... storing 'mp_assignment' as categorical
... storing 'patient_y' as categorical
... storing 'technology' as categorical
... storing 'source_y' as categorical
... storing 'cancer_type' as categorical
... storing 'sex' as categorical
... storing 'stage' as categorical
... storing 'tnm_stage' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Gaydosik2019_Hematologic/Data_Gaydosik2019_Hematologic.h5ad

Processing Data_Ledergor2018_Hematologic...


... storing 'sample' as categorical
... storing 'cell_type' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'mp_top' as categorical
... storing 'mp_assignment' as categorical
... storing 'batch' as categorical
... storing 'cluster' as categorical
... storing 'cancer_type' as categorical
... storing 'technology' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Ledergor2018_Hematologic/Data_Ledergor2018_Hematologic.h5ad

Processing Data_Liu2021_Hematologic...


... storing 'sample' as categorical
... storing 'cell_type' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'mp_top' as categorical
... storing 'mp_assignment' as categorical
... storing 'technology' as categorical
... storing 'patient' as categorical
... storing 'cancer_type' as categorical
... storing 'diagnosis_recurrence' as categorical
... storing 'AJCC_stage' as categorical
... storing 'sample_primary_met' as categorical
... storing 'size' as categorical
... storing 'site' as categorical
... storing 'histology' as categorical
... storing 'treated_naive' as categorical
... storing 'chemotherapy_exposed' as categorical
... storing 'chemotherapy_response' as categorical
... storing 'targeted_rx_exposed' as categorical
... storing 'targeted_rx_response' as categorical
... storing 'ICB_exposed' as categorical
... storing 'ICB_response' as categorical
... storing 'ET_exposed' as categorical
... storing 'ET_response' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Liu2021_Hematologic/Data_Liu2021_Hematologic.h5ad

Processing Data_Rendeiro2020_Hematologic...


... storing 'sample' as categorical
... storing 'patient_x' as categorical
... storing 'timepoint_x' as categorical
... storing 'cell_type' as categorical
... storing 'patient_y' as categorical
... storing 'timepoint_y' as categorical
... storing 'technology' as categorical
... storing 'cancer_type' as categorical
... storing 'sample_type' as categorical
... storing 'source' as categorical
... storing 'sex' as categorical
... storing 'prior_treatment' as categorical
... storing 'on_treatment' as categorical
... storing 'response' as categorical
... storing 'cytogenetics_pre_ibrutinib' as categorical
... storing 'p53_mutation' as categorical
... storing 'time_diagnosis_to_ibrutinib' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Rendeiro2020_Hematologic/Data_Rendeiro2020_Hematologic.h5ad

Processing Data_Riether2020_Hematologic...


... storing 'sample' as categorical
... storing 'patient_x' as categorical
... storing 'timepoint_x' as categorical
... storing 'cell_type' as categorical
... storing 'patient_y' as categorical
... storing 'timepoint_y' as categorical
... storing 'treatment' as categorical
... storing 'technology' as categorical
... storing 'cancer_type' as categorical
... storing 'sample_type' as categorical
... storing 'sex' as categorical
... storing 'cytogenetics' as categorical
... storing 'mutations' as categorical
... storing 'immunophenotype' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Riether2020_Hematologic/Data_Riether2020_Hematologic.h5ad

Processing Data_Roider2020_Hematologic...


  cells_df = pd.read_csv(os.path.join(base_path, "Cells.csv"), index_col=0)
... storing 'sample' as categorical
... storing 'cell_type' as categorical
... storing 'cell_subtype' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'mp_top' as categorical
... storing 'mp_assignment' as categorical
... storing 'disease' as categorical
... storing 'disease_subtype_x' as categorical
... storing 'source_x' as categorical
... storing 'cancer_type' as categorical
... storing 'technology' as categorical
... storing 'disease_subtype_y' as categorical
... storing 'source_y' as categorical
... storing 'histology' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Roider2020_Hematologic/Data_Roider2020_Hematologic.h5ad

Processing Data_Steen2021_Hematologic...


... storing 'sample' as categorical
... storing 'patient_x' as categorical
... storing 'cell_type' as categorical
... storing 'cell_subtype' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'mp_top' as categorical
... storing 'mp_assignment' as categorical
... storing 'disease' as categorical
... storing 'disease_subtype_x' as categorical
... storing 'cancer_type' as categorical
... storing 'technology' as categorical
... storing 'patient_y' as categorical
... storing 'disease_subtype_y' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Steen2021_Hematologic/Data_Steen2021_Hematologic.h5ad

Processing Data_Zhang2019_Hematologic...


... storing 'sample' as categorical
... storing 'patient_x' as categorical
... storing 'cell_type' as categorical
... storing 'cell_subtype' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'mp_top' as categorical
... storing 'mp_assignment' as categorical
... storing 'progression_status_x' as categorical
... storing 'cancer_type' as categorical
... storing 'technology' as categorical
... storing 'patient_y' as categorical
... storing 'progression_status_y' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Zhang2019_Hematologic/Data_Zhang2019_Hematologic.h5ad


In [45]:
import os
import pandas as pd
import scanpy as sc
from scipy.io import mmread

# List of datasets that use TPM expression data
tpm_datasets = [
    "Data_Gaiti2019_Hematologic",
    "Data_Giustacchini2017_Hematologic"
]

# Base directory containing all datasets
base_dir = "/home/ubuntu/Downloads/Data_Hematologic"

for folder in tpm_datasets:
    try:
        print(f"\nProcessing {folder}...")
        base_path = os.path.join(base_dir, folder)

        # Load TPM matrix and transpose it
        X = mmread(os.path.join(base_path, "Exp_data_TPM.mtx")).T.tocsr()

        # Load gene names
        genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)[0].tolist()

        # Load cell metadata
        cells_df = pd.read_csv(os.path.join(base_path, "Cells.csv"), index_col=0)

        # Load sample metadata
        samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv"))

        # Create AnnData object
        adata = sc.AnnData(X)
        adata.var_names = genes
        adata.obs = cells_df

        # Merge sample-level metadata using the 'sample' column
        adata.obs = adata.obs.merge(samples_df, how="left", on="sample")

        # Save to .h5ad file
        output_path = os.path.join(base_path, f"{folder}.h5ad")
        adata.write(output_path)
        print(f"Saved: {output_path}")

    except Exception as e:
        print(f"❌ Error processing {folder}: {e}")



Processing Data_Gaiti2019_Hematologic...


... storing 'sample' as categorical
... storing 'patient_x' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'sample_status' as categorical
... storing 'patient_y' as categorical
... storing 'technology' as categorical
... storing 'cancer_type' as categorical
... storing 'site' as categorical
... storing 'histology' as categorical
... storing 'genetic_hormonal_features' as categorical
... storing 'treated_naive' as categorical
... storing 'chemotherapy_exposed' as categorical
... storing 'chemotherapy_response' as categorical
... storing 'targeted_rx_exposed' as categorical
... storing 'ICB_exposed' as categorical
... storing 'ICB_response' as categorical
... storing 'ET_exposed' as categorical
... storing 'ET_response' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Gaiti2019_Hematologic/Data_Gaiti2019_Hematologic.h5ad

Processing Data_Giustacchini2017_Hematologic...


... storing 'sample' as categorical
... storing 'cell_type' as categorical
... storing 'cell_cycle_phase' as categorical
... storing 'mp_top' as categorical
... storing 'mp_assignment' as categorical
... storing 'batch' as categorical
... storing 'bcr_abl_status' as categorical
... storing 'stage_1' as categorical
... storing 'stage_2' as categorical
... storing 'responder_status' as categorical
... storing 'technology' as categorical
... storing 'patient' as categorical
... storing 'cancer_type' as categorical
... storing 'diagnosis_recurrence' as categorical
... storing 'disease_extent' as categorical
... storing 'sample_primary_met' as categorical
... storing 'site' as categorical
... storing 'genetic_hormonal_features' as categorical
... storing 'treated_naive' as categorical
... storing 'targeted_rx_exposed' as categorical
... storing 'targeted_rx_response' as categorical


Saved: /home/ubuntu/Downloads/Data_Hematologic/Data_Giustacchini2017_Hematologic/Data_Giustacchini2017_Hematologic.h5ad


In [78]:
import scanpy as sc
import numpy as np

# Step 1: Ensure unique gene names
adata_group1.var_names_make_unique()
adata_group2.var_names_make_unique()

# Step 2: Find common genes between both datasets
common_genes = sorted(set(adata_group1.var_names).intersection(adata_group2.var_names))
print(f"Number of common genes: {len(common_genes)}")

# Step 3: Subset both datasets to common genes
adata_group1 = adata_group1[:, common_genes].copy()
adata_group2 = adata_group2[:, common_genes].copy()

# Optional: check shapes
print(f"Group 1 shape after filtering: {adata_group1.shape}")
print(f"Group 2 shape after filtering: {adata_group2.shape}")

# Step 4: Concatenate (if needed)
# adata_combined = adata_group1.concatenate(adata_group2, batch_key="group", batch_categories=["group1", "group2"])


Number of common genes: 30316
Group 1 shape after filtering: (14894, 30316)
Group 2 shape after filtering: (16809, 30316)


In [79]:
# Concatenate with group labels
adata_combined = adata_group1.concatenate(
    adata_group2,
    batch_key="group",  # Column name in .obs indicating origin
    batch_categories=["group1", "group2"]
)

# Optional: inspect combined data
print(adata_combined)
print(adata_combined.obs["group"].value_counts())


AnnData object with n_obs × n_vars = 31703 × 30316
    obs: 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'genotype', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS', 'group'
group
group2    16809
group1    14894
Name: count, dtype: int64


In [85]:
# Save the combined dataset to an HDF5 file
adata_combined.write("Data_Nam2019_Hematologic.h5ad")
