In [50]:
import os
import pandas as pd
import scanpy as sc
import anndata
import gc

## Sarcoma

#### 1.Jerby-Arnon2021

In [23]:

# Set base path
base_path = "/home/ubuntu/Downloads/Data_Sarcoma/Data_Jerby-Arnon2021_Sarcoma"

# === Load 10X Data ===
path_10x = os.path.join(base_path, "10X")
adata_10x = sc.read_mtx(os.path.join(path_10x, "Exp_data_UMIcounts.mtx")).T  # Transpose to cells x genes
genes_10x = pd.read_csv(os.path.join(path_10x, "Genes.txt"), header=None)[0].tolist()
cells_10x = pd.read_csv(os.path.join(path_10x, "Cells.csv"), index_col=0)

adata_10x.var_names = genes_10x
adata_10x.obs = cells_10x
adata_10x.obs['technology'] = '10X'

# === Load SmartSeq2 Data ===
path_ss2 = os.path.join(base_path, "SmartSeq2")
adata_ss2 = sc.read_mtx(os.path.join(path_ss2, "Exp_data_UMIcounts.mtx")).T
genes_ss2 = pd.read_csv(os.path.join(path_ss2, "Genes.txt"), header=None)[0].tolist()
cells_ss2 = pd.read_csv(os.path.join(path_ss2, "Cells.csv"), index_col=0)

adata_ss2.var_names = genes_ss2
adata_ss2.obs = cells_ss2
adata_ss2.obs['technology'] = 'SmartSeq2'

# === Align by common genes ===
common_genes = adata_10x.var_names.intersection(adata_ss2.var_names)
adata_10x = adata_10x[:, common_genes].copy()
adata_ss2 = adata_ss2[:, common_genes].copy()

# === Concatenate ===
adata_combined = adata_10x.concatenate(
    adata_ss2,
    batch_key='batch',
    batch_categories=['10X', 'SmartSeq2'],
    index_unique=None
)

# === Check uniqueness ===
assert adata_combined.obs_names.is_unique, "Cell names are not unique after concatenation"

# === Merge sample metadata ===
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv")).drop_duplicates(subset=['sample'])

# Preserve and reset index
adata_combined.obs = adata_combined.obs.reset_index()
original_index = adata_combined.obs.columns[0]

# Merge with sample metadata
adata_combined.obs = adata_combined.obs.merge(samples_df, how='left', on='sample')
assert adata_combined.obs.shape[0] == adata_combined.shape[0], "Row count mismatch after metadata merge"

# Restore index
adata_combined.obs = adata_combined.obs.set_index(original_index)
adata_combined.obs.index.name = None

# === Save final object ===
output_path = os.path.join(base_path, "Data_Jerby-Arnon2021_Sarcoma.h5ad")
adata_combined.write(output_path)
print(f"✅ AnnData object saved to: {output_path}")


  adata_combined = adata_10x.concatenate(


✅ AnnData object saved to: /home/ubuntu/Downloads/Data_Sarcoma/Data_Jerby-Arnon2021_Sarcoma/Data_Jerby-Arnon2021_Sarcoma.h5ad


In [28]:
adata

AnnData object with n_obs × n_vars = 16125 × 9631
    obs: 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'technology_x', 'cell_subtype', 'batch', 'technology_y', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS'

In [30]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [31]:
adata

AnnData object with n_obs × n_vars = 16125 × 9631
    obs: 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology_x', 'cell_subtype', 'batch', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [33]:
del adata.obs['batch']

In [37]:
adata.obs = adata.obs.rename(columns={"technology_x": "technology"})

In [40]:
adata.obs['study'] = 'Jerby-Arnon2021_Sarcoma'

In [41]:
adata.obs['category'] = 'Sarcoma'

In [45]:
adata.obs['source'] = 'NaN'

In [43]:
adata.obs['cell_name'] = adata.obs_names

In [46]:
adata.obs

Unnamed: 0,sample,cell_type,complexity,cell_cycle_phase,technology,cell_subtype,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,study,category,cell_name,source
SS12pt.10x.P1_AAACCTGTCACCTTAT_1,SyS12pt,Malignant,3836,Not cycling,10X,,2399,P12,Synovial Sarcoma,M,24,,Primary,Chest wall,TREATED,Jerby-Arnon2021_Sarcoma,Sarcoma,SS12pt.10x.P1_AAACCTGTCACCTTAT_1,
SS12pt.10x.P1_AAACCTGTCAGTCAGT_1,SyS12pt,Malignant,1829,Not cycling,10X,,2399,P12,Synovial Sarcoma,M,24,,Primary,Chest wall,TREATED,Jerby-Arnon2021_Sarcoma,Sarcoma,SS12pt.10x.P1_AAACCTGTCAGTCAGT_1,
SS12pt.10x.P1_AAACCTGTCCAAAGTC_1,SyS12pt,Malignant,2640,Not cycling,10X,,2399,P12,Synovial Sarcoma,M,24,,Primary,Chest wall,TREATED,Jerby-Arnon2021_Sarcoma,Sarcoma,SS12pt.10x.P1_AAACCTGTCCAAAGTC_1,
SS12pt.10x.P1_AAACCTGTCCGTCATC_1,SyS12pt,Malignant,1641,Not cycling,10X,,2399,P12,Synovial Sarcoma,M,24,,Primary,Chest wall,TREATED,Jerby-Arnon2021_Sarcoma,Sarcoma,SS12pt.10x.P1_AAACCTGTCCGTCATC_1,
SS12pt.10x.P1_AAACCTGTCTATGTGG_1,SyS12pt,Malignant,1710,Not cycling,10X,,2399,P12,Synovial Sarcoma,M,24,,Primary,Chest wall,TREATED,Jerby-Arnon2021_Sarcoma,Sarcoma,SS12pt.10x.P1_AAACCTGTCTATGTGG_1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SS7posP2_H02,SyS7,B_cell,2288,Not cycling,SmartSeq2,CD45+,686,P7,Synovial Sarcoma,M,45,,Primary,Para-aortic,TREATED,Jerby-Arnon2021_Sarcoma,Sarcoma,SS7posP2_H02,
SS7posP2_H03,SyS7,Macrophage,2781,Not cycling,SmartSeq2,CD45+,686,P7,Synovial Sarcoma,M,45,,Primary,Para-aortic,TREATED,Jerby-Arnon2021_Sarcoma,Sarcoma,SS7posP2_H03,
SS7posP2_H04,SyS7,NK_cell,3205,Not cycling,SmartSeq2,CD45+,686,P7,Synovial Sarcoma,M,45,,Primary,Para-aortic,TREATED,Jerby-Arnon2021_Sarcoma,Sarcoma,SS7posP2_H04,
SS7posP2_H10,SyS7,Macrophage,3462,Not cycling,SmartSeq2,CD45+,686,P7,Synovial Sarcoma,M,45,,Primary,Para-aortic,TREATED,Jerby-Arnon2021_Sarcoma,Sarcoma,SS7posP2_H10,


In [47]:
output_path = "/home/ubuntu/Downloads/Data_Sarcoma/Data_Jerby-Arnon2021_Sarcoma.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Sarcoma/Data_Jerby-Arnon2021_Sarcoma.h5ad


#### 2.Zhou2020

In [51]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Sarcoma/Data_Zhou2020_Sarcoma/"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [52]:
adata

AnnData object with n_obs × n_vars = 64557 × 32864
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'source', 'malignant', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS'

In [53]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [57]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,source,malignant,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,study,category
0,BC2_AAACGGGCAAAGGTGC_1,BC2,Osteoblast,1125,Not cycling,Insitu,yes,10x,866,BC2,Osteosarcoma,,,,primary,Femur,treated,Data_Zhou2020_Sarcoma,Sarcoma
1,BC2_AAACGGGGTGTTCGAT_2,BC2,Endothelial,2297,Not cycling,Insitu,no,10x,866,BC2,Osteosarcoma,,,,primary,Femur,treated,Data_Zhou2020_Sarcoma,Sarcoma
2,BC2_AAACGGGTCCGATATG_3,BC2,Osteoclast,1615,Not cycling,Insitu,,10x,866,BC2,Osteosarcoma,,,,primary,Femur,treated,Data_Zhou2020_Sarcoma,Sarcoma
3,BC2_AAACGGGTCGTGACAT_4,BC2,Osteoblast,1245,Not cycling,Insitu,yes,10x,866,BC2,Osteosarcoma,,,,primary,Femur,treated,Data_Zhou2020_Sarcoma,Sarcoma
4,BC2_AAAGATGCAAACGCGA_5,BC2,Fibroblast,1328,Not cycling,Insitu,no,10x,866,BC2,Osteosarcoma,,,,primary,Femur,treated,Data_Zhou2020_Sarcoma,Sarcoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64552,BC22_TTTGTTGGTATTTCCT_64553,BC22,Osteoblast,4669,G1/S,Insitu,yes,10x,5020,BC22,Osteosarcoma,,,,primary,Femur,treated,Data_Zhou2020_Sarcoma,Sarcoma
64553,BC22_TTTGTTGGTTGATCGT_64554,BC22,Osteoblast_proli,4674,G1/S,Insitu,yes,10x,5020,BC22,Osteosarcoma,,,,primary,Femur,treated,Data_Zhou2020_Sarcoma,Sarcoma
64554,BC22_TTTGTTGTCCGAAATC_64555,BC22,MSC,4128,Not cycling,Insitu,no,10x,5020,BC22,Osteosarcoma,,,,primary,Femur,treated,Data_Zhou2020_Sarcoma,Sarcoma
64555,BC22_TTTGTTGTCGTGGTAT_64556,BC22,MSC,2594,Not cycling,Insitu,no,10x,5020,BC22,Osteosarcoma,,,,primary,Femur,treated,Data_Zhou2020_Sarcoma,Sarcoma


In [55]:
adata.obs['study'] = 'Data_Zhou2020_Sarcoma'

In [56]:
adata.obs['category'] = 'Sarcoma'

In [58]:
adata.obs['cell_subtype'] = 'NaN'

In [59]:
output_path = "/home/ubuntu/Downloads/Data_Sarcoma/Data_Zhou2020_Sarcoma.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Sarcoma/Data_Zhou2020_Sarcoma.h5ad


#### Data Merging

In [61]:
import scanpy as sc
import anndata
import os

# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Sarcoma/Data_Jerby-Arnon2021_Sarcoma.h5ad",
    "/home/ubuntu/Downloads/Data_Sarcoma/Data_Zhou2020_Sarcoma.h5ad",
]

# Load datasets
adatas = [sc.read(file) for file in files]

# Merge all AnnData objects
adata_merged = anndata.concat(adatas, join="outer", fill_value=0)

# Fix non-string columns (e.g. 'sample') to avoid h5py write errors
for col in adata_merged.obs.columns:
    if adata_merged.obs[col].dtype == 'object':
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

# Save merged dataset
output_path = "/home/ubuntu/Downloads/Data_Sarcoma/Sarcoma_Combined.h5ad"
adata_merged.write(output_path)

print(f"✅ Merged and saved to: {output_path}")


  concat_annot = pd.concat(


✅ Merged and saved to: /home/ubuntu/Downloads/Data_Sarcoma/Sarcoma_Combined.h5ad
