In [1]:
import os
import pandas as pd
import scanpy as sc
import anndata
import gc

## Brain

#### 1. Choudhury2022_Brain

In [2]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Brain/Data_Choudhury2022_Brain"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect



<function gc.collect(generation=2)>

In [8]:
adata

AnnData object with n_obs × n_vars = 58843 × 33538
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source_x', 'cancer_type', 'technology', 'n_cells', 'age'

In [7]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'patient_y', 'source_y',         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [6]:
adata.obs['source_y'].value_counts()

source_y
Meningioma                          35405
Meningioma brain-tumor interface    21739
Dura                                 1699
Name: count, dtype: int64

In [9]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [10]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [20]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,source,cancer_type,technology,n_cells,age,disease_extent,sex,sample_primary_met,treated_naive,site,study,category
0,MSC1_AAACCCACAATATCCG,MSC1,1,Malignant,Meningioma cells,952,,Meningioma,Meningioma,10x,2538,44.3,,,,,brain,Choudhury2022_Brain,Brain
1,MSC1_AAACCCACATCTTAGG,MSC1,1,Malignant,Meningioma cells,940,,Meningioma,Meningioma,10x,2538,44.3,,,,,brain,Choudhury2022_Brain,Brain
2,MSC1_AAACGAACAACCTATG,MSC1,1,Malignant,G1 phase meningioma cells,7134,Not cycling,Meningioma,Meningioma,10x,2538,44.3,,,,,brain,Choudhury2022_Brain,Brain
3,MSC1_AAACGAACACCGCTAG,MSC1,1,Malignant,ECM remodeling meningioma cells,3521,Not cycling,Meningioma,Meningioma,10x,2538,44.3,,,,,brain,Choudhury2022_Brain,Brain
4,MSC1_AAACGCTCATCTCATT,MSC1,1,Monocyte,CD163 monocytes,1855,Not cycling,Meningioma,Meningioma,10x,2538,44.3,,,,,brain,Choudhury2022_Brain,Brain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58838,MSC6-BTI_TTTGTTGTCAAGGTGG,MSC6-BTI,6,Malignant,Cycling G2M phase meningioma cells 1,899,,Meningioma brain-tumor interface,Meningioma,10x,13171,71.0,,,,,brain,Choudhury2022_Brain,Brain
58839,MSC6-BTI_TTTGTTGTCGACATAC,MSC6-BTI,6,Malignant,Cycling G2M phase meningioma cells 1,4827,G2/M,Meningioma brain-tumor interface,Meningioma,10x,13171,71.0,,,,,brain,Choudhury2022_Brain,Brain
58840,MSC6-BTI_TTTGTTGTCTCAATCT,MSC6-BTI,6,Monocyte,Active monocytes,2009,Not cycling,Meningioma brain-tumor interface,Meningioma,10x,13171,71.0,,,,,brain,Choudhury2022_Brain,Brain
58841,MSC6-BTI_TTTGTTGTCTGCTTAT,MSC6-BTI,6,Malignant,Cycling S phase meningioma cells,1881,Not cycling,Meningioma brain-tumor interface,Meningioma,10x,13171,71.0,,,,,brain,Choudhury2022_Brain,Brain


In [12]:
adata.obs['disease_extent'] = 'NaN'

In [13]:
adata.obs['sex'] = 'NaN'

In [14]:
adata.obs['sample_primary_met'] = 'NaN'

In [15]:
adata.obs['treated_naive'] = 'NaN'

In [16]:
adata.obs['site'] = 'brain'

In [17]:
adata.obs['study'] = 'Choudhury2022_Brain'

In [18]:
adata.obs['category'] = 'Brain'

In [19]:
output_path = "/home/ubuntu/Downloads/Data_Brain/Data_Choudhury2022_Brain.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Brain/Data_Choudhury2022_Brain.h5ad


#### 2.Couturier2020_Brain

In [21]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Brain/Data_Couturier2020_Brain"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [24]:
adata

AnnData object with n_obs × n_vars = 100335 × 33694
    obs: 'cell_name', 'sample', 'cell_type', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [23]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [25]:
adata.obs['treated_naive'].value_counts()

treated_naive
naive    64012
Name: count, dtype: int64

In [38]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_subtype,complexity,category,source,study
0,AAACCTGAGAAGGTTT-1,BT322,,,10x,3066,BT322,Glioblastoma,,,local,primary,Right temporal,naive,,,Brain,,Couturier2020_Brain
1,AAACCTGAGAGACTTA-1,HFA571_cd133,,,10x,7716,,Normal,,,,,,,,,Brain,,Couturier2020_Brain
2,AAACCTGAGCGTTGCC-1,BT322,,,10x,3066,BT322,Glioblastoma,,,local,primary,Right temporal,naive,,,Brain,,Couturier2020_Brain
3,AAACCTGCAACACCTA-1,BT363_1of2,Malignant,Not cycling,10x,3882,BT363,Glioblastoma,,,local,primary,Left frontal,naive,,,Brain,,Couturier2020_Brain
4,AAACCTGGTAAGTGGC-1,BT322,,,10x,3066,BT322,Glioblastoma,,,local,primary,Right temporal,naive,,,Brain,,Couturier2020_Brain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100330,TTTGGTTCACAAGACG-1,NSC1_cd133,,,10x,1717,,Normal,,,,,,,,,Brain,,Couturier2020_Brain
100331,TTTGGTTCAGCTCGCA-1,NSC1_cd133,,,10x,1717,,Normal,,,,,,,,,Brain,,Couturier2020_Brain
100332,TTTGTCAAGCTAGCCC-1,NSC1_cd133,,,10x,1717,,Normal,,,,,,,,,Brain,,Couturier2020_Brain
100333,TTTGTCACATGAAGTA-1,NSC1_cd133,,,10x,1717,,Normal,,,,,,,,,Brain,,Couturier2020_Brain


In [31]:
adata.obs['cell_subtype'] = 'NaN'

In [32]:
adata.obs['complexity'] = 'NaN'

In [34]:
adata.obs['source'] = 'NaN'

In [33]:
adata.obs['category'] = 'Brain'

In [35]:
adata.obs['study'] = 'Couturier2020_Brain'

In [39]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_subtype,complexity,category,source,study
0,AAACCTGAGAAGGTTT-1,BT322,,,10x,3066,BT322,Glioblastoma,,,local,primary,Right temporal,naive,,,Brain,,Couturier2020_Brain
1,AAACCTGAGAGACTTA-1,HFA571_cd133,,,10x,7716,,Normal,,,,,,,,,Brain,,Couturier2020_Brain
2,AAACCTGAGCGTTGCC-1,BT322,,,10x,3066,BT322,Glioblastoma,,,local,primary,Right temporal,naive,,,Brain,,Couturier2020_Brain
3,AAACCTGCAACACCTA-1,BT363_1of2,Malignant,Not cycling,10x,3882,BT363,Glioblastoma,,,local,primary,Left frontal,naive,,,Brain,,Couturier2020_Brain
4,AAACCTGGTAAGTGGC-1,BT322,,,10x,3066,BT322,Glioblastoma,,,local,primary,Right temporal,naive,,,Brain,,Couturier2020_Brain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100330,TTTGGTTCACAAGACG-1,NSC1_cd133,,,10x,1717,,Normal,,,,,,,,,Brain,,Couturier2020_Brain
100331,TTTGGTTCAGCTCGCA-1,NSC1_cd133,,,10x,1717,,Normal,,,,,,,,,Brain,,Couturier2020_Brain
100332,TTTGTCAAGCTAGCCC-1,NSC1_cd133,,,10x,1717,,Normal,,,,,,,,,Brain,,Couturier2020_Brain
100333,TTTGTCACATGAAGTA-1,NSC1_cd133,,,10x,1717,,Normal,,,,,,,,,Brain,,Couturier2020_Brain


In [37]:
output_path = "/home/ubuntu/Downloads/Data_Brain/Data_Couturier2020_Brain.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Brain/Data_Couturier2020_Brain.h5ad


#### 3. Darmanis2017_Brain

In [40]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Brain/Data_Darmanis2017_Brain"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "normalized_Exp_data_TPM.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [44]:
adata

AnnData object with n_obs × n_vars = 3589 × 23368
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'source', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [42]:
adata.obs['source'].value_counts()

source
Tumor        2343
Periphery    1189
Distant        57
Name: count, dtype: int64

In [43]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'cell_QCpass', 'technology_y', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [51]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,source,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_subtype,category,study
0,1001000173.G8,BT_S2,Malignant,2145,Not cycling,Tumor,SmartSeq2,1169,BT_S2,Glioblastoma,,54,local,primary,brain,,,Brain,Data_Darmanis2017_Brain
1,1001000173.D4,BT_S2,Oligodendrocyte,1037,Not cycling,Tumor,SmartSeq2,1169,BT_S2,Glioblastoma,,54,local,primary,brain,,,Brain,Data_Darmanis2017_Brain
2,1001000173.B4,BT_S2,Malignant,5829,Not cycling,Tumor,SmartSeq2,1169,BT_S2,Glioblastoma,,54,local,primary,brain,,,Brain,Data_Darmanis2017_Brain
3,1001000173.A2,BT_S2,Malignant,4066,Not cycling,Tumor,SmartSeq2,1169,BT_S2,Glioblastoma,,54,local,primary,brain,,,Brain,Data_Darmanis2017_Brain
4,1001000173.E2,BT_S2,Malignant,2310,Not cycling,Tumor,SmartSeq2,1169,BT_S2,Glioblastoma,,54,local,primary,brain,,,Brain,Data_Darmanis2017_Brain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3584,1001000271.F1,BT_S6,Vascular,681,,Tumor,SmartSeq2,389,BT_S6,Glioblastoma,,48,local,primary,brain,,,Brain,Data_Darmanis2017_Brain
3585,1001000271.D4,BT_S6,Myeloid,2247,Not cycling,Tumor,SmartSeq2,389,BT_S6,Glioblastoma,,48,local,primary,brain,,,Brain,Data_Darmanis2017_Brain
3586,1001000271.C1,BT_S6,Vascular,4099,Not cycling,Tumor,SmartSeq2,389,BT_S6,Glioblastoma,,48,local,primary,brain,,,Brain,Data_Darmanis2017_Brain
3587,1001000271.H7,BT_S6,Vascular,700,,Tumor,SmartSeq2,389,BT_S6,Glioblastoma,,48,local,primary,brain,,,Brain,Data_Darmanis2017_Brain


In [47]:
adata.obs['cell_subtype'] = 'NaN'

In [49]:
adata.obs['category'] = 'Brain'

In [50]:
adata.obs['study'] = 'Data_Darmanis2017_Brain'

In [52]:
output_path = "/home/ubuntu/Downloads/Data_Brain/Data_Darmanis2017_Brain.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Brain/Data_Darmanis2017_Brain.h5ad


#### 4.Filbin2018_Brain

In [53]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Brain/Data_Filbin2018_Brain"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [57]:
adata

AnnData object with n_obs × n_vars = 2587 × 23686
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [55]:
adata.obs['opc_variable'].value_counts()

opc_variable
-0.573399    1
-1.018730    1
-0.938586    1
-0.870725    1
-0.444017    1
            ..
-0.815809    1
-0.759807    1
-1.217047    1
-1.463382    1
-1.050888    1
Name: count, Length: 2259, dtype: int64

In [56]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'housekeeping_gene_expression', 'opc_variable', 'oc_like', 'ac_like', 'opc_like',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [58]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive
0,BCH1126-P01-A01,BCH1126,Malignant,4064,Not cycling,SmartSeq2,298,BCH1126,H3K27M Glioma,F,10.0,local,primary,Pons,naive
1,BCH1126-P01-A02,BCH1126,Malignant,5162,Not cycling,SmartSeq2,298,BCH1126,H3K27M Glioma,F,10.0,local,primary,Pons,naive
2,BCH1126-P01-A04,BCH1126,Malignant,3583,Not cycling,SmartSeq2,298,BCH1126,H3K27M Glioma,F,10.0,local,primary,Pons,naive
3,BCH1126-P01-A07,BCH1126,Malignant,4743,Not cycling,SmartSeq2,298,BCH1126,H3K27M Glioma,F,10.0,local,primary,Pons,naive
4,BCH1126-P01-A08,BCH1126,Malignant,4015,Not cycling,SmartSeq2,298,BCH1126,H3K27M Glioma,F,10.0,local,primary,Pons,naive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2582,Oligo-P22-H03,Oligo,Oligodendrocyte,5547,Not cycling,SmartSeq2,138,,Normal,,,,,,
2583,Oligo-P22-H05,Oligo,Oligodendrocyte,2991,Not cycling,SmartSeq2,138,,Normal,,,,,,
2584,Oligo-P22-H06,Oligo,Oligodendrocyte,2977,Not cycling,SmartSeq2,138,,Normal,,,,,,
2585,Oligo-P22-H08,Oligo,Oligodendrocyte,2885,Not cycling,SmartSeq2,138,,Normal,,,,,,


In [60]:
adata.obs['cell_subtype'] = 'NaN'

In [61]:
adata.obs['source'] = 'NaN'

In [62]:
adata.obs['study'] = 'Data_Filbin2018_Brain'

In [63]:
adata.obs['category'] = 'Brain'

In [64]:
output_path = "/home/ubuntu/Downloads/Data_Brain/Data_Filbin2018_Brain.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Brain/Data_Filbin2018_Brain.h5ad


#### 5.Gojo2020_Brain

In [65]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Brain/Data_Gojo2020_Brain"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [68]:
adata

AnnData object with n_obs × n_vars = 6739 × 20447
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [67]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'housekeeping_gene_expression', 'opc_variable', 'oc_like', 'ac_like', 'opc_like',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [73]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,source,study,category
0,BT1030.P1.A01,BT1030,Malignant,ST-Midline,4734,Not cycling,SmartSeq2,233,BT1030,Ependymoma,M,16.1,,primary,temporal/midline,naive,,Gojo2020_Brain,Brain
1,BT1030.P1.A03,BT1030,Malignant,ST-Midline,2988,Not cycling,SmartSeq2,233,BT1030,Ependymoma,M,16.1,,primary,temporal/midline,naive,,Gojo2020_Brain,Brain
2,BT1030.P1.A04,BT1030,Malignant,ST-Ependymal-like,4686,Not cycling,SmartSeq2,233,BT1030,Ependymoma,M,16.1,,primary,temporal/midline,naive,,Gojo2020_Brain,Brain
3,BT1030.P1.A06,BT1030,Malignant,ST-Ependymal-like,5845,Not cycling,SmartSeq2,233,BT1030,Ependymoma,M,16.1,,primary,temporal/midline,naive,,Gojo2020_Brain,Brain
4,BT1030.P1.A07,BT1030,,,3057,,SmartSeq2,233,BT1030,Ependymoma,M,16.1,,primary,temporal/midline,naive,,Gojo2020_Brain,Brain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6734,MUV043Nuc3.P1.H08,MUV043Nuc3,,,2879,,SmartSeq2,294,MUV043,Ependymoma,M,21.6,,primary,postcentral-central,naive,,Gojo2020_Brain,Brain
6735,MUV043Nuc3.P1.H09,MUV043Nuc3,,,3867,,SmartSeq2,294,MUV043,Ependymoma,M,21.6,,primary,postcentral-central,naive,,Gojo2020_Brain,Brain
6736,MUV043Nuc3.P1.H10,MUV043Nuc3,,,3519,,SmartSeq2,294,MUV043,Ependymoma,M,21.6,,primary,postcentral-central,naive,,Gojo2020_Brain,Brain
6737,MUV043Nuc3.P1.H11,MUV043Nuc3,,,3616,,SmartSeq2,294,MUV043,Ependymoma,M,21.6,,primary,postcentral-central,naive,,Gojo2020_Brain,Brain


In [70]:
adata.obs['source'] = 'NaN'

In [71]:
adata.obs['study'] = 'Gojo2020_Brain'

In [72]:
adata.obs['category'] = 'Brain'

In [74]:
output_path = "/home/ubuntu/Downloads/Data_Brain/Data_Gojo2020_Brain.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Brain/Data_Gojo2020_Brain.h5ad


#### 6.Hovestadt2019_Brain

In [75]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Brain/Data_Hovestadt2019_Brain"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [76]:
adata

AnnData object with n_obs × n_vars = 8691 × 23686
    obs: 'cell_name', 'sample', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'source', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS'

In [77]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'housekeeping_gene_expression', 'opc_variable', 'oc_like', 'ac_like', 'opc_like',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [83]:
adata.obs

Unnamed: 0,cell_name,sample,complexity,cell_cycle_phase,source,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_type,cell_subtype,study,category
0,BCH807.P02.A01,BCH807,7777,Not cycling,Patient,SmartSeq2,307,BCH807,Medulloblastoma,M,12.0,metastatic,met,,naive,,,Hovestadt2019_Brain,Brain
1,BCH807.P02.A02,BCH807,6117,Not cycling,Patient,SmartSeq2,307,BCH807,Medulloblastoma,M,12.0,metastatic,met,,naive,,,Hovestadt2019_Brain,Brain
2,BCH807.P02.A03,BCH807,5717,Not cycling,Patient,SmartSeq2,307,BCH807,Medulloblastoma,M,12.0,metastatic,met,,naive,,,Hovestadt2019_Brain,Brain
3,BCH807.P02.A04,BCH807,6813,G2/M,Patient,SmartSeq2,307,BCH807,Medulloblastoma,M,12.0,metastatic,met,,naive,,,Hovestadt2019_Brain,Brain
4,BCH807.P02.A05,BCH807,6338,Not cycling,Patient,SmartSeq2,307,BCH807,Medulloblastoma,M,12.0,metastatic,met,,naive,,,Hovestadt2019_Brain,Brain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8686,RCMB24.P10.G12,RCMB24,4276,Not cycling,PDX,SmartSeq2,137,PDX,Medulloblastoma,M,12.0,,,,,,,Hovestadt2019_Brain,Brain
8687,RCMB24.P10.H02,RCMB24,2876,Not cycling,PDX,SmartSeq2,137,PDX,Medulloblastoma,M,12.0,,,,,,,Hovestadt2019_Brain,Brain
8688,RCMB24.P10.H09,RCMB24,2934,Not cycling,PDX,SmartSeq2,137,PDX,Medulloblastoma,M,12.0,,,,,,,Hovestadt2019_Brain,Brain
8689,RCMB24.P10.H10,RCMB24,2698,Not cycling,PDX,SmartSeq2,137,PDX,Medulloblastoma,M,12.0,,,,,,,Hovestadt2019_Brain,Brain


In [79]:
adata.obs['cell_type'] = 'NaN'

In [80]:
adata.obs['cell_subtype'] = 'NaN'

In [81]:
adata.obs['study'] = 'Hovestadt2019_Brain'

In [82]:
adata.obs['category'] = 'Brain'

In [84]:
output_path = "/home/ubuntu/Downloads/Data_Brain/Data_Hovestadt2019_Brain.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Brain/Data_Hovestadt2019_Brain.h5ad


#### 7.Neftel2019_Brain

In [87]:
import os
import pandas as pd
import scanpy as sc
import gc

# === Set base path ===
base_path = "/home/ubuntu/Downloads/Data_Brain/Data_Neftel2019_Brain"

# === Load 10X Data ===
path_10x = os.path.join(base_path, "10X")
adata_10x = sc.read_mtx(os.path.join(path_10x, "Exp_data_UMIcounts.mtx")).T  # Transpose to cells x genes
genes_10x = pd.read_csv(os.path.join(path_10x, "Genes.txt"), header=None)[0].tolist()
cells_10x = pd.read_csv(os.path.join(path_10x, "Cells.csv"), index_col=0)

adata_10x.var_names = genes_10x
adata_10x.var_names_make_unique()
adata_10x.obs = cells_10x
adata_10x.obs['technology'] = '10X'

# === Load SmartSeq2 Data ===
path_ss2 = os.path.join(base_path, "SmartSeq2")
adata_ss2 = sc.read_mtx(os.path.join(path_ss2, "Exp_data_UMIcounts.mtx")).T
genes_ss2 = pd.read_csv(os.path.join(path_ss2, "Genes.txt"), header=None)[0].tolist()
cells_ss2 = pd.read_csv(os.path.join(path_ss2, "Cells.csv"), index_col=0)

adata_ss2.var_names = genes_ss2
adata_ss2.var_names_make_unique()
adata_ss2.obs = cells_ss2
adata_ss2.obs['technology'] = 'SmartSeq2'

# === Align by common genes ===
common_genes = adata_10x.var_names.intersection(adata_ss2.var_names)
adata_10x = adata_10x[:, common_genes].copy()
adata_ss2 = adata_ss2[:, common_genes].copy()

# === Concatenate ===
adata_combined = adata_10x.concatenate(
    adata_ss2,
    batch_key='batch',
    batch_categories=['10X', 'SmartSeq2'],
    index_unique=None
)

# === Check uniqueness ===
assert adata_combined.obs_names.is_unique, "Cell names are not unique after concatenation"

# === Merge sample metadata ===
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv")).drop_duplicates(subset=['sample'])

# Preserve and reset index
adata_combined.obs = adata_combined.obs.reset_index()
original_index = adata_combined.obs.columns[0]

# Merge with sample metadata
adata_combined.obs = adata_combined.obs.merge(samples_df, how='left', on='sample')
assert adata_combined.obs.shape[0] == adata_combined.shape[0], "Row count mismatch after metadata merge"

# Restore index
adata_combined.obs = adata_combined.obs.set_index(original_index)
adata_combined.obs.index.name = None

# === Save final object ===
#output_path = os.path.join(base_path, "Data_Neftel2019_Brain.h5ad")
#adata_combined.write(output_path)
#print(f"✅ AnnData object saved to: {output_path}")

# Optional cleanup
gc.collect()


  adata_combined = adata_10x.concatenate(


3568

In [89]:
adata = adata_combined

In [97]:
adata

AnnData object with n_obs × n_vars = 24131 × 19135
    obs: 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology_x', 'malignant', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [96]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'age_group', 'cross_section', 'genes_expressed', 'MESlike2',
            'MESlike1', 'AClike', 'OPClike', 'NPClike1', 'NPClike2', 'batch', 'technology_y', 
            'smoking_status', 'PY', 
            'housekeeping_gene_expression', 'opc_variable', 'oc_like', 'ac_like', 'opc_like',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [110]:
adata.obs['malignant'].value_counts()

malignant
yes    6855
no      776
Name: count, dtype: int64

In [149]:
adata.obs

Unnamed: 0,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_name,cell_subtype,study,category,source
102_1,102,Malignant,3101,Not cycling,10X,1822,102,Glioblastoma,M,65,local,primary,Left temporal,naive,102_1,,Neftel2019_Brain,Brain,
102_2,102,Malignant,2366,G1/S,10X,1822,102,Glioblastoma,M,65,local,primary,Left temporal,naive,102_2,,Neftel2019_Brain,Brain,
102_4,102,Malignant,1217,Not cycling,10X,1822,102,Glioblastoma,M,65,local,primary,Left temporal,naive,102_4,,Neftel2019_Brain,Brain,
102_5,102,Malignant,2285,Not cycling,10X,1822,102,Glioblastoma,M,65,local,primary,Left temporal,naive,102_5,,Neftel2019_Brain,Brain,
102_7,102,Malignant,1725,Not cycling,10X,1822,102,Glioblastoma,M,65,local,primary,Left temporal,naive,102_7,,Neftel2019_Brain,Brain,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MGH66-P08-H06,MGH66,Malignant,7432,G1/S,SmartSeq2,436,MGH66,Glioblastoma,M,67,local,primary,Left frontal,naive,MGH66-P08-H06,,Neftel2019_Brain,Brain,malignant
MGH66-P08-H07,MGH66,Malignant,5164,Not cycling,SmartSeq2,436,MGH66,Glioblastoma,M,67,local,primary,Left frontal,naive,MGH66-P08-H07,,Neftel2019_Brain,Brain,malignant
MGH66-P08-H08,MGH66,Malignant,6954,G1/S,SmartSeq2,436,MGH66,Glioblastoma,M,67,local,primary,Left frontal,naive,MGH66-P08-H08,,Neftel2019_Brain,Brain,malignant
MGH66-P08-H10,MGH66,Malignant,5014,Not cycling,SmartSeq2,436,MGH66,Glioblastoma,M,67,local,primary,Left frontal,naive,MGH66-P08-H10,,Neftel2019_Brain,Brain,malignant


In [99]:
adata.obs['cell_name'] = adata.obs_names

In [148]:
adata.obs = adata.obs.rename(columns={"technology_x": "technology"})

In [146]:
adata.obs['source'] = adata.obs['malignant'].map({
    'no': 'not malignant',
    'yes': 'malignant',
    'Indistinct': 'Indistinct'
})

In [147]:
del adata.obs['malignant']

In [101]:
adata.obs['cell_subtype'] = 'NaN'

In [102]:
adata.obs['study'] = 'Neftel2019_Brain'

In [103]:
adata.obs['category'] = 'Brain'

In [150]:
output_path = "/home/ubuntu/Downloads/Data_Brain/Data_Neftel2019_Brain.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Brain/Data_Neftel2019_Brain.h5ad


#### 8.Tirosh2016_Brain

In [124]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Brain/Data_Tirosh2016_Brain"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [137]:
adata

AnnData object with n_obs × n_vars = 4347 × 23686
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive', 'source'

In [136]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'housekeeping_gene_expression', 'opc_variable', 'oc_like', 'ac_like', 'opc_like',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'Differentiation', 'Stemness', 
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [128]:
adata.obs['source'] = adata.obs['malignant'].map({
    'no': 'not malignant',
    'yes': 'malignant',
    'Indistinct': 'Indistinct'
})

In [134]:
del adata.obs['malignant']

In [129]:
adata.obs['source'].value_counts()

source
malignant        4044
not malignant     280
Name: count, dtype: int64

In [142]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,source,cell_subtype,study,category
0,MGH36_P6_A12,MGH36,Macrophage,4990,Not cycling,SmartSeq2,788,MGH36,Oligodendroglioma,Male,67,local,primary,Right frontotemporoinsular,naive,not malignant,,Tirosh2016_Brain,Brain
1,MGH36_P6_H09,MGH36,Macrophage,3031,Not cycling,SmartSeq2,788,MGH36,Oligodendroglioma,Male,67,local,primary,Right frontotemporoinsular,naive,not malignant,,Tirosh2016_Brain,Brain
2,MGH53_P4_G04,MGH53,Macrophage,3842,Not cycling,SmartSeq2,861,MGH53,Oligodendroglioma,Male,31,local,primary,Left frontal,naive,not malignant,,Tirosh2016_Brain,Brain
3,MGH36_P10_G12,MGH36,Macrophage,4312,Not cycling,SmartSeq2,788,MGH36,Oligodendroglioma,Male,67,local,primary,Right frontotemporoinsular,naive,not malignant,,Tirosh2016_Brain,Brain
4,MGH53_P2_H12,MGH53,Macrophage,5549,Not cycling,SmartSeq2,861,MGH53,Oligodendroglioma,Male,31,local,primary,Left frontal,naive,not malignant,,Tirosh2016_Brain,Brain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4342,93_P8_H06,MGH93,Malignant,5364,Not cycling,SmartSeq2,445,MGH93,Oligodendroglioma,Female,65,local,primary,Right temporal,naive,malignant,,Tirosh2016_Brain,Brain
4343,93_P9_C07,MGH93,Malignant,5550,Not cycling,SmartSeq2,445,MGH93,Oligodendroglioma,Female,65,local,primary,Right temporal,naive,malignant,,Tirosh2016_Brain,Brain
4344,93_P8_A12,MGH93,Malignant,5634,Not cycling,SmartSeq2,445,MGH93,Oligodendroglioma,Female,65,local,primary,Right temporal,naive,malignant,,Tirosh2016_Brain,Brain
4345,93_P8_C01,MGH93,Malignant,4641,Not cycling,SmartSeq2,445,MGH93,Oligodendroglioma,Female,65,local,primary,Right temporal,naive,malignant,,Tirosh2016_Brain,Brain


In [139]:
adata.obs['cell_subtype'] = 'NaN'

In [140]:
adata.obs['study'] = 'Tirosh2016_Brain'

In [141]:
adata.obs['category'] = 'Brain'

In [143]:
output_path = "/home/ubuntu/Downloads/Data_Brain/Data_Tirosh2016_Brain.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Brain/Data_Tirosh2016_Brain.h5ad


#### 10. Venteicher2017_Brain

In [151]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Brain/Data_Venteicher2017_Brain"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [154]:
adata

AnnData object with n_obs × n_vars = 6341 × 23686
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [153]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'housekeeping_gene_expression', 'opc_variable', 'oc_like', 'ac_like', 'opc_like',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'Differentiation', 'Stemness', 
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [155]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive
0,57-P14-A02,MGH57,Macrophage,3888,Not cycling,SmartSeq2,415,MGH57,Astrocytoma,female,47,local,primary,brain,naive
1,57-P14-A04,MGH57,Macrophage,2854,Not cycling,SmartSeq2,415,MGH57,Astrocytoma,female,47,local,primary,brain,naive
2,57-P14-A05,MGH57,Macrophage,3542,Not cycling,SmartSeq2,415,MGH57,Astrocytoma,female,47,local,primary,brain,naive
3,57-P14-A06,MGH57,Macrophage,3483,Not cycling,SmartSeq2,415,MGH57,Astrocytoma,female,47,local,primary,brain,naive
4,57-P14-A07,MGH57,,3845,,SmartSeq2,415,MGH57,Astrocytoma,female,47,local,primary,brain,naive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6336,MGH64-P6-H05,MGH64,Malignant,5242,Not cycling,SmartSeq2,795,MGH64,Astrocytoma,female,47,local,primary,brain,naive
6337,MGH64-P6-H08,MGH64,Malignant,7069,Not cycling,SmartSeq2,795,MGH64,Astrocytoma,female,47,local,primary,brain,naive
6338,MGH64-P6-H09,MGH64,Malignant,3974,Not cycling,SmartSeq2,795,MGH64,Astrocytoma,female,47,local,primary,brain,naive
6339,MGH64-P6-H10,MGH64,,5925,,SmartSeq2,795,MGH64,Astrocytoma,female,47,local,primary,brain,naive


In [156]:
adata.obs['cell_subtype'] = 'NaN'

In [157]:
adata.obs['source'] = 'NaN'

In [158]:
adata.obs['study'] = 'Venteicher2017_Brain'

In [159]:
adata.obs['category'] = 'Brain'

In [160]:
output_path = "/home/ubuntu/Downloads/Data_Brain/Data_Venteicher2017_Brain.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Brain/Data_Venteicher2017_Brain.h5ad


#### 10. Wang2019_Brain

In [161]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Brain/Data_Wang2019_Brain"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [162]:
adata

AnnData object with n_obs × n_vars = 16317 × 33545
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS'

In [163]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'housekeeping_gene_expression', 'opc_variable', 'oc_like', 'ac_like', 'opc_like',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'Differentiation', 'Stemness', 
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [175]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,age,disease_extent,sample_primary_met,site,treated_naive,cell_subtype,source,study,category,sex
0,SF10022_AAACCCAAGAAGGTAG-1,SF10022,Malignant,2804,,10x,2888,SF10022,Glioblastoma,,local,primary,Brain,naive,,,Wang2019_Brain,Brain,
1,SF10022_AAACCCAAGACCTCCG-1,SF10022,Malignant,2365,,10x,2888,SF10022,Glioblastoma,,local,primary,Brain,naive,,,Wang2019_Brain,Brain,
2,SF10022_AAACCCACAACTTGGT-1,SF10022,Malignant,4567,,10x,2888,SF10022,Glioblastoma,,local,primary,Brain,naive,,,Wang2019_Brain,Brain,
3,SF10022_AAACCCAGTCTACGAT-1,SF10022,Oligodendrocyte,2680,,10x,2888,SF10022,Glioblastoma,,local,primary,Brain,naive,,,Wang2019_Brain,Brain,
4,SF10022_AAACCCAGTTATCTTC-1,SF10022,Malignant,4157,,10x,2888,SF10022,Glioblastoma,,local,primary,Brain,naive,,,Wang2019_Brain,Brain,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16312,SF9259S_TTTCCTCTCCCTTGGT-1,SF9259S,Oligodendrocyte,3399,,10x,849,SF9259,Glioblastoma,,local,primary,Brain,naive,,,Wang2019_Brain,Brain,
16313,SF9259S_TTTGGAGTCAACACCA-1,SF9259S,Oligodendrocyte,1188,,10x,849,SF9259,Glioblastoma,,local,primary,Brain,naive,,,Wang2019_Brain,Brain,
16314,SF9259S_TTTGGTTGTCGTTATG-1,SF9259S,Oligodendrocyte,2190,,10x,849,SF9259,Glioblastoma,,local,primary,Brain,naive,,,Wang2019_Brain,Brain,
16315,SF9259S_TTTGTTGAGCTCTTCC-1,SF9259S,Oligodendrocyte,3070,,10x,849,SF9259,Glioblastoma,,local,primary,Brain,naive,,,Wang2019_Brain,Brain,


In [166]:
adata.obs['sex'].value_counts()

sex
False    693
Name: count, dtype: int64

In [167]:
adata.obs['cell_subtype'] = 'NaN'

In [168]:
adata.obs['source'] = 'NaN'

In [169]:
adata.obs['study'] = 'Wang2019_Brain'

In [170]:
adata.obs['category'] = 'Brain'

In [173]:
del adata.obs['sex']

In [174]:
adata.obs['sex'] = 'NaN'

In [176]:
output_path = "/home/ubuntu/Downloads/Data_Brain/Data_Wang2019_Brain.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Brain/Data_Wang2019_Brain.h5ad


#### 11.Yuan2018_Brain

In [177]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Brain/Data_Yuan2018_Brain"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

<function gc.collect(generation=2)>

In [183]:
adata

AnnData object with n_obs × n_vars = 23793 × 58828
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [182]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 'clusters', 'clusters_malignant_cells',
            'housekeeping_gene_expression', 'opc_variable', 'oc_like', 'ac_like', 'opc_like',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'Differentiation', 'Stemness', 
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [184]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive
0,1,PJ016,Malignant,6837,Intermediate,Microwell array-based platform,3085,PJ016,,F,,local,primary,right frontal,naive
1,2,PJ016,Malignant,7378,Intermediate,Microwell array-based platform,3085,PJ016,,F,,local,primary,right frontal,naive
2,3,PJ016,Malignant,6846,Not cycling,Microwell array-based platform,3085,PJ016,,F,,local,primary,right frontal,naive
3,4,PJ016,Malignant,6931,Not cycling,Microwell array-based platform,3085,PJ016,,F,,local,primary,right frontal,naive
4,5,PJ016,Malignant,6807,Not cycling,Microwell array-based platform,3085,PJ016,,F,,local,primary,right frontal,naive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23788,23789,PJ048,Malignant,361,,Microwell array-based platform,3084,PJ048,Glioblastoma,M,,local,primary,right parietal,naive
23789,23790,PJ048,Malignant,348,,Microwell array-based platform,3084,PJ048,Glioblastoma,M,,local,primary,right parietal,naive
23790,23791,PJ048,Malignant,413,,Microwell array-based platform,3084,PJ048,Glioblastoma,M,,local,primary,right parietal,naive
23791,23792,PJ048,Malignant,370,,Microwell array-based platform,3084,PJ048,Glioblastoma,M,,local,primary,right parietal,naive


In [185]:
adata.obs['cell_subtype'] = 'NaN'

In [186]:
adata.obs['source'] = 'NaN'

In [187]:
adata.obs['study'] = 'Yuan2018_Brain'

In [188]:
adata.obs['category'] = 'Brain'

In [189]:
output_path = "/home/ubuntu/Downloads/Data_Brain/Data_Yuan2018_Brain.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Brain/Data_Yuan2018_Brain.h5ad


#### Data Merging

In [19]:
import scanpy as sc
import anndata
import os

In [191]:

# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Brain/Data_Choudhury2022_Brain.h5ad",
    "/home/ubuntu/Downloads/Data_Brain/Data_Couturier2020_Brain.h5ad",
    "/home/ubuntu/Downloads/Data_Brain/Data_Darmanis2017_Brain.h5ad",
    "/home/ubuntu/Downloads/Data_Brain/Data_Filbin2018_Brain.h5ad",
    "/home/ubuntu/Downloads/Data_Brain/Data_Gojo2020_Brain.h5ad",
    "/home/ubuntu/Downloads/Data_Brain/Data_Hovestadt2019_Brain.h5ad",
    "/home/ubuntu/Downloads/Data_Brain/Data_Neftel2019_Brain.h5ad",
    "/home/ubuntu/Downloads/Data_Brain/Data_Tirosh2016_Brain.h5ad",
    "/home/ubuntu/Downloads/Data_Brain/Data_Venteicher2017_Brain.h5ad",
    "/home/ubuntu/Downloads/Data_Brain/Data_Wang2019_Brain.h5ad",
    "/home/ubuntu/Downloads/Data_Brain/Data_Yuan2018_Brain.h5ad"
]

# Load datasets
adatas = [sc.read(file) for file in files]

# Merge all AnnData objects
adata_merged = anndata.concat(adatas, join="outer", fill_value=0)

# Fix non-string columns (e.g. 'sample') to avoid h5py write errors
for col in adata_merged.obs.columns:
    if adata_merged.obs[col].dtype == 'object':
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

# Save merged dataset
output_path = "/home/ubuntu/Downloads/Data_Brain/Brain_Combined.h5ad"
adata_merged.write(output_path)

print(f"✅ Merged and saved to: {output_path}")


  concat_annot = pd.concat(
  utils.warn_names_duplicates("obs")


✅ Merged and saved to: /home/ubuntu/Downloads/Data_Brain/Brain_Combined.h5ad


In [193]:
adata_merged.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,source,cancer_type,technology,n_cells,age,disease_extent,sex,sample_primary_met,treated_naive,site,study,category
0,MSC1_AAACCCACAATATCCG,MSC1,1,Malignant,Meningioma cells,952,,Meningioma,Meningioma,10x,2538,44.3,,,,,brain,Choudhury2022_Brain,Brain
1,MSC1_AAACCCACATCTTAGG,MSC1,1,Malignant,Meningioma cells,940,,Meningioma,Meningioma,10x,2538,44.3,,,,,brain,Choudhury2022_Brain,Brain
2,MSC1_AAACGAACAACCTATG,MSC1,1,Malignant,G1 phase meningioma cells,7134,Not cycling,Meningioma,Meningioma,10x,2538,44.3,,,,,brain,Choudhury2022_Brain,Brain
3,MSC1_AAACGAACACCGCTAG,MSC1,1,Malignant,ECM remodeling meningioma cells,3521,Not cycling,Meningioma,Meningioma,10x,2538,44.3,,,,,brain,Choudhury2022_Brain,Brain
4,MSC1_AAACGCTCATCTCATT,MSC1,1,Monocyte,CD163 monocytes,1855,Not cycling,Meningioma,Meningioma,10x,2538,44.3,,,,,brain,Choudhury2022_Brain,Brain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23788,23789,PJ048,PJ048,Malignant,,361,,,Glioblastoma,Microwell array-based platform,3084,,local,M,primary,naive,right parietal,Yuan2018_Brain,Brain
23789,23790,PJ048,PJ048,Malignant,,348,,,Glioblastoma,Microwell array-based platform,3084,,local,M,primary,naive,right parietal,Yuan2018_Brain,Brain
23790,23791,PJ048,PJ048,Malignant,,413,,,Glioblastoma,Microwell array-based platform,3084,,local,M,primary,naive,right parietal,Yuan2018_Brain,Brain
23791,23792,PJ048,PJ048,Malignant,,370,,,Glioblastoma,Microwell array-based platform,3084,,local,M,primary,naive,right parietal,Yuan2018_Brain,Brain
