In [1]:
import os
import pandas as pd
import scanpy as sc
import anndata
import gc

## Colorectal

#### 1. Chen2021_Colorectal

In [5]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Colorectal/Data_Chen2021_Colorectal"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [3]:
adata

AnnData object with n_obs × n_vars = 57723 × 31412
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'source_x', 'HTAN.Parent.Data.File.ID', 'Polyp_Type_x', 'cancer_type', 'technology', 'n_cells', 'patient_y', 'source_y', 'Polyp_Type_y'

In [6]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'HTAN.Parent.Data.File.ID', 
            'patient_y', 'source_y', 'Polyp_Type_y'        
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [9]:
adata.obs['source_x'].value_counts()

source_x
normal            34008
adenoma           11138
serrated_polyp     8671
unconfirmed        3906
Name: count, dtype: int64

In [17]:
adata.obs

Unnamed: 0,cell_name,sample,patient_x,cell_type,cell_subtype,complexity,cell_cycle_phase,source_x,cancer_type,technology,n_cells,sex,age,disease_extent,sample_primary_met,site,treated_naive,study,category
0,CCAACCGTATATACCT_0_5607,HTA11_10167_2000001011,HTA11_10167,Epithelial,ABS,6713,Not cycling,serrated_polyp,Premalignant,modified inDrop platform,543,,,,,Colon polyps,,Data_Chen2021_Colorectal,Colorectal
1,AATGGATTAAAACCTCC_0_5607,HTA11_10167_2000001011,HTA11_10167,Epithelial,ABS,5368,Not cycling,serrated_polyp,Premalignant,modified inDrop platform,543,,,,,Colon polyps,,Data_Chen2021_Colorectal,Colorectal
2,TGAAGGAGCTTACCACGCT_0_5607,HTA11_10167_2000001011,HTA11_10167,Epithelial,ABS,5076,Not cycling,serrated_polyp,Premalignant,modified inDrop platform,543,,,,,Colon polyps,,Data_Chen2021_Colorectal,Colorectal
3,TGATTCGCTGGATACCCAG_0_5607,HTA11_10167_2000001011,HTA11_10167,Serrated_specific_cells,SSC,5384,Not cycling,serrated_polyp,Premalignant,modified inDrop platform,543,,,,,Colon polyps,,Data_Chen2021_Colorectal,Colorectal
4,TGATCGACGGTTTGGACTT_0_5607,HTA11_10167_2000001011,HTA11_10167,Epithelial,ABS,4578,Not cycling,serrated_polyp,Premalignant,modified inDrop platform,543,,,,,Colon polyps,,Data_Chen2021_Colorectal,Colorectal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57718,ACCTTGCCTTGATTCT_22_5898,HTA11_8504_2000002011,HTA11_8504,Epithelial,GOB,798,,normal,Normal,modified inDrop platform,511,,,,,Colon polyps,,Data_Chen2021_Colorectal,Colorectal
57719,TGACAATACTTGCAAAGCC_22_5898,HTA11_8504_2000002011,HTA11_8504,Epithelial,ABS,773,,normal,Normal,modified inDrop platform,511,,,,,Colon polyps,,Data_Chen2021_Colorectal,Colorectal
57720,TGATACGTGCTATCCGCTA_22_5898,HTA11_8504_2000002011,HTA11_8504,Epithelial,GOB,979,,normal,Normal,modified inDrop platform,511,,,,,Colon polyps,,Data_Chen2021_Colorectal,Colorectal
57721,TGACAAGTCATTAGTCGCA_22_5898,HTA11_8504_2000002011,HTA11_8504,Epithelial,ABS,850,,normal,Normal,modified inDrop platform,511,,,,,Colon polyps,,Data_Chen2021_Colorectal,Colorectal


In [18]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [19]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [8]:
adata.obs['sex'] = 'NaN'

In [9]:
adata.obs['age'] = 'NaN'

In [10]:
adata.obs['disease_extent'] = 'NaN'

In [11]:
adata.obs['sample_primary_met'] = 'NaN'

In [12]:
adata.obs['site'] = 'Colon polyps'

In [13]:
del adata.obs['Polyp_Type_x']

In [14]:
adata.obs['treated_naive'] = 'NaN'

In [15]:
adata.obs['study'] = 'Data_Chen2021_Colorectal'

In [16]:
adata.obs['category'] = 'Colorectal'

In [20]:
output_path = "/home/ubuntu/Downloads/Data_Colorectal/Data_Chen2021_Colorectal.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Colorectal/Data_Chen2021_Colorectal.h5ad


#### 2.Lee2020_Colorectal

In [21]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Colorectal/Data_Lee2020_Colorectal"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [22]:
adata

AnnData object with n_obs × n_vars = 21657 × 22276
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS'

In [23]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [24]:
adata.obs['treated_naive'].value_counts()

treated_naive
naive    21657
Name: count, dtype: int64

In [37]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [39]:
adata.obs = adata.obs.rename(columns={"type_x": "source"})

In [41]:
adata.obs = adata.obs.rename(columns={"source_x": "site"})

In [43]:
adata.obs = adata.obs.rename(columns={"treatment_x": "treated_naive"})

In [31]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_subtype,category,source,study
0,SMC01-T_AAACCTGAGAAGGTTT,SMC01,T_cell,1400,Not cycling,10x,1742,SMC01,Colorectal Cancer,F,64,local,primary,rectum,naive,,Colorectal,tumor,Data_Lee2020_Colorectal
1,SMC01-T_AAACCTGAGGTAGCTG,SMC01,T_cell,1249,Not cycling,10x,1742,SMC01,Colorectal Cancer,F,64,local,primary,rectum,naive,,Colorectal,tumor,Data_Lee2020_Colorectal
2,SMC01-T_AAACCTGCATACGCCG,SMC01,Malignant,4787,Intermediate,10x,1742,SMC01,Colorectal Cancer,F,64,local,primary,rectum,naive,,Colorectal,tumor,Data_Lee2020_Colorectal
3,SMC01-T_AAACCTGGTCGCATAT,SMC01,Malignant,5175,Not cycling,10x,1742,SMC01,Colorectal Cancer,F,64,local,primary,rectum,naive,,Colorectal,tumor,Data_Lee2020_Colorectal
4,SMC01-T_AAACCTGGTTCCTCCA,SMC01,T_cell,1245,Not cycling,10x,1742,SMC01,Colorectal Cancer,F,64,local,primary,rectum,naive,,Colorectal,tumor,Data_Lee2020_Colorectal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21652,SMC25-T_TTTGCGCAGACACGAC,SMC25,Malignant,5016,Not cycling,10x,673,SMC25,Colorectal Cancer,F,57,metastatic,primary,sigmoid,naive,,Colorectal,tumor,Data_Lee2020_Colorectal
21653,SMC25-T_TTTGCGCCATGGAATA,SMC25,,3376,,10x,673,SMC25,Colorectal Cancer,F,57,metastatic,primary,sigmoid,naive,,Colorectal,tumor,Data_Lee2020_Colorectal
21654,SMC25-T_TTTGGTTCAACGCACC,SMC25,B_cell,1745,Not cycling,10x,673,SMC25,Colorectal Cancer,F,57,metastatic,primary,sigmoid,naive,,Colorectal,tumor,Data_Lee2020_Colorectal
21655,SMC25-T_TTTGGTTGTAGGGTAC,SMC25,Epithelial,2765,G2/M,10x,673,SMC25,Colorectal Cancer,F,57,metastatic,primary,sigmoid,naive,,Colorectal,tumor,Data_Lee2020_Colorectal


In [26]:
adata.obs['cell_subtype'] = 'NaN'

In [28]:
adata.obs['source'] = 'tumor'

In [29]:
adata.obs['study'] = 'Data_Lee2020_Colorectal'

In [27]:
adata.obs['category'] = 'Colorectal'

In [30]:
output_path = "/home/ubuntu/Downloads/Data_Colorectal/Data_Lee2020_Colorectal.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Colorectal/Data_Lee2020_Colorectal.h5ad


#### 3.Li2017_Colorectal

In [62]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Colorectal/Data_Li2017_Colorectal"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_TPM.mtx"))
adata = adata.transpose()  # Shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Step 4: Ensure columns are strings
cells['sample'] = cells['sample'].astype(str).str.strip()
samples['sample'] = samples['sample'].astype(str).str.strip()

# Step 5: Extract base sample ID (e.g., 'CRC04' from 'CRC04_tumour')
cells['sample_base'] = cells['sample'].str.extract(r'^(CRC\d+)')

# Step 6: Merge on base sample ID
cells_merged = cells.merge(samples, left_on='sample_base', right_on='sample', how='left')

# Step 7: Drop helper column used for merge
cells_merged = cells_merged.drop(columns=['sample_base'])

# Step 8: Assign metadata to AnnData
adata.obs = cells_merged


In [63]:
adata

AnnData object with n_obs × n_vars = 590 × 55017
    obs: 'cell_name', 'sample_x', 'patient_x', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'source', 'sample_y', 'technology', 'n_cells', 'patient_y', 'cancer_type', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS'

In [66]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 'sample_x'     
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [46]:
adata.obs['technology'].value_counts()

Series([], Name: count, dtype: int64)

In [68]:
adata.obs['cell_subtype'] = 'NaN'

In [69]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [70]:
adata.obs = adata.obs.rename(columns={"sample_y": "sample"})

In [71]:
adata.obs['category'] = 'Colorectal'

In [72]:
adata.obs['study'] = 'Data_Li2017_Colorectal'

In [73]:
adata.obs

Unnamed: 0,cell_name,patient,cell_type,complexity,cell_cycle_phase,source,sample,technology,n_cells,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_subtype,category,study
0,RHC3546,CRC01,,4544,,tumour,CRC01,SmartSeq2,44,Colorectal Cancer,M,52,local,primary,Rectosigmoid,NAIVE,,Colorectal,Data_Li2017_Colorectal
1,RHC3552,CRC01,Malignant,1868,,tumour,CRC01,SmartSeq2,44,Colorectal Cancer,M,52,local,primary,Rectosigmoid,NAIVE,,Colorectal,Data_Li2017_Colorectal
2,RHC3553,CRC01,Malignant,2004,,tumour,CRC01,SmartSeq2,44,Colorectal Cancer,M,52,local,primary,Rectosigmoid,NAIVE,,Colorectal,Data_Li2017_Colorectal
3,RHC3555,CRC01,,2311,,tumour,CRC01,SmartSeq2,44,Colorectal Cancer,M,52,local,primary,Rectosigmoid,NAIVE,,Colorectal,Data_Li2017_Colorectal
4,RHC3556,CRC01,Malignant,4305,,tumour,CRC01,SmartSeq2,44,Colorectal Cancer,M,52,local,primary,Rectosigmoid,NAIVE,,Colorectal,Data_Li2017_Colorectal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,RHL2871,CRC08,Fibroblast,2560,,normal mucosa,CRC08,SmartSeq2,63,Colorectal Cancer,M,75,local,primary,Lower Rectum,NAIVE,,Colorectal,Data_Li2017_Colorectal
586,RHL2873,CRC08,Epithelial,3190,Not cycling,normal mucosa,CRC08,SmartSeq2,63,Colorectal Cancer,M,75,local,primary,Lower Rectum,NAIVE,,Colorectal,Data_Li2017_Colorectal
587,RHL2875,CRC08,Epithelial,3176,Not cycling,normal mucosa,CRC08,SmartSeq2,63,Colorectal Cancer,M,75,local,primary,Lower Rectum,NAIVE,,Colorectal,Data_Li2017_Colorectal
588,RHL2879,CRC08,B_cell,3535,,normal mucosa,CRC08,SmartSeq2,63,Colorectal Cancer,M,75,local,primary,Lower Rectum,NAIVE,,Colorectal,Data_Li2017_Colorectal


In [74]:
output_path = "/home/ubuntu/Downloads/Data_Colorectal/Data_Li2017_Colorectal.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Colorectal/Data_Li2017_Colorectal.h5ad


#### 4. Zhang2018_Colorectal

In [75]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Colorectal/Data_Zhang2018_Colorectal"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_TPM.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [96]:
adata

AnnData object with n_obs × n_vars = 11138 × 22902
    obs: 'cell_name', 'sample', 'patient', 'source', 'cell_type', 'cell_subtype', 'complexity', 'n_cells', 'technology', 'cancer_type', 'age', 'sex'

In [86]:
adata.obs['msi_status'].value_counts()

msi_status
MSS    6777
MSI    4361
Name: count, dtype: int64

In [95]:
for col in ['histology', 'tnm_stage', 'stage', 'size', 'grade', 'msi_status',
            'patient_y', 'source_y', 'sorting', 'msi_status', 'cluster'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [89]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [90]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [105]:
adata.obs

Unnamed: 0,cell_name,sample,patient,source,cell_type,cell_subtype,complexity,n_cells,technology,cancer_type,age,sex,cell_cycle_phase,disease_extent,sample_primary_met,site,treated_naive,category,study
0,NP71-20180123,P0123_N,P0123,Adjacent normal,T_cell,CD4,2182,360,Smart-seq2,Colorectal Cancer,65,Female,,,,,,Colorectal,Data_Zhang2018_Colorectal
1,NP710-20180123,P0123_N,P0123,Adjacent normal,T_cell,CD4,4759,360,Smart-seq2,Colorectal Cancer,65,Female,,,,,,Colorectal,Data_Zhang2018_Colorectal
2,NP711-20180123,P0123_N,P0123,Adjacent normal,T_cell,double-negative,1851,360,Smart-seq2,Colorectal Cancer,65,Female,,,,,,Colorectal,Data_Zhang2018_Colorectal
3,NP712-20180123,P0123_N,P0123,Adjacent normal,T_cell,,1675,360,Smart-seq2,Colorectal Cancer,65,Female,,,,,,Colorectal,Data_Zhang2018_Colorectal
4,NP713-20180123,P0123_N,P0123,Adjacent normal,T_cell,double-negative,2107,360,Smart-seq2,Colorectal Cancer,65,Female,,,,,,Colorectal,Data_Zhang2018_Colorectal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11133,TTR92-20161228,P1228_T,P1228,Tumor,T_cell,,3626,390,Smart-seq2,Colorectal Cancer,77,Female,,,,,,Colorectal,Data_Zhang2018_Colorectal
11134,TTR93-20161228,P1228_T,P1228,Tumor,T_cell,,3902,390,Smart-seq2,Colorectal Cancer,77,Female,,,,,,Colorectal,Data_Zhang2018_Colorectal
11135,TTR94-20161228,P1228_T,P1228,Tumor,T_cell,,3996,390,Smart-seq2,Colorectal Cancer,77,Female,,,,,,Colorectal,Data_Zhang2018_Colorectal
11136,TTR95-20161228,P1228_T,P1228,Tumor,T_cell,CD4,4740,390,Smart-seq2,Colorectal Cancer,77,Female,,,,,,Colorectal,Data_Zhang2018_Colorectal


In [98]:
adata.obs['cell_cycle_phase'] = 'NaN'

In [99]:
adata.obs['disease_extent'] = 'NaN'

In [100]:
adata.obs['sample_primary_met'] = 'NaN'

In [101]:
adata.obs['site'] = 'NaN'

In [102]:
adata.obs['treated_naive'] = 'NaN'

In [103]:
adata.obs['category'] = 'Colorectal'

In [104]:
adata.obs['study'] = 'Data_Zhang2018_Colorectal'

In [106]:
output_path = "/home/ubuntu/Downloads/Data_Colorectal/Data_Zhang2018_Colorectal.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Colorectal/Data_Zhang2018_Colorectal.h5ad


#### 5.Pelka2021_Colorectal

In [133]:
import os
import pandas as pd
import scanpy as sc
import scipy.io
import gc
from scipy.sparse import vstack

# Base path
base_path = "/home/ubuntu/Downloads/Data_Colorectal/Data_Pelka2021_Colorectal"

# Step 1: Read gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
gene_names = genes[0].values

# Step 2: Read and concatenate expression matrices and cell metadata
all_exprs = []
all_cells = []

for i in range(1, 5):  # Group1 to Group4
    group_path = os.path.join(base_path, f"Group{i}")
    
    # Load matrix
    mtx_path = os.path.join(group_path, f"Exp_data_UMIcounts{i}.mtx")
    expr = sc.read_mtx(mtx_path).T  # transpose: cells x genes
    all_exprs.append(expr.X)
    
    # Load cell metadata
    cells_path = os.path.join(group_path, f"Cells{i}.csv")
    cells_df = pd.read_csv(cells_path)
    all_cells.append(cells_df)

# Combine all groups
combined_expr = vstack(all_exprs)
combined_cells = pd.concat(all_cells, ignore_index=True)

# Create AnnData object
adata = sc.AnnData(X=combined_expr)
adata.var_names = gene_names
adata.var_names_make_unique()
adata.obs = combined_cells

# Step 3: Read and merge sample metadata
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))
adata.obs = adata.obs.merge(samples, on="sample", how="left")

gc.collect()


  cells_df = pd.read_csv(cells_path)


1315

In [134]:
adata

AnnData object with n_obs × n_vars = 370115 × 43113
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'sample_type_x', 'HistologicTypeSimple_x', 'MMR_IHC_x', 'MMRStatus_x', 'MLH1Status_x', 'MMRMLH1Tumor_x', 'TissueSite_detailed_x', 'TissueSiteSimple_x', 'HistologicGrade_detailed_x', 'HistologicGradeSimple_x', 'TumorStage_x', 'NodeStatus_detailed_x', 'NodeStatusSimple_x', 'MetastasisStatus_x', 'TumorSize_x', 'SizeQuantile_x', 'PID_x', 'Sex_x', 'Age_x', 'Ethnicity_x', 'Race_x', 'patient', 'cancer_type', 'technology', 'n_cells', 'sample_type_y', 'HistologicTypeSimple_y', 'MSIStatus', 'MMR_IHC_y', 'MMRStatus_y', 'MLH1Status_y', 'MMRMLH1Tumor_y', 'TissueSite_detailed_y', 'TissueSiteSimple_y', 'HistologicGrade_detailed_y', 'HistologicGradeSimple_y', 'TumorStage_y', 'NodeStatus_detailed_y', 'NodeStatusSimple_y', 'MetastasisStatus_y', 'TumorSize_y', 'SizeQuantile_

In [118]:
adata.obs['MetastasisStatus_x'].value_counts()

MetastasisStatus_x
not entered (Mx)                                                                                                                 247098
M1a                                                                                                                                4085
pM1c (Metastases the peritoneal surface, alone or with other site or organ metastases): Sites involved: Liver and peritoneum.      3091
M1c                                                                                                                                2977
Name: count, dtype: int64

In [141]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'MMR_IHC_x', 'MMRStatus_x', 'MLH1Status_x', 'MMRMLH1Tumor_x',
            'HistologicGrade_detailed_x', 'HistologicGradeSimple_x', 'TumorStage_x',
            'NodeStatus_detailed_x', 'NodeStatusSimple_x', 'TumorSize_x', 'SizeQuantile_x', 'PID_x',
            'Ethnicity_x', 'Race_x', 'sample_type_y', 'HistologicTypeSimple_y', 'MSIStatus', 'MMR_IHC_y',
            'MMRStatus_y', 'MLH1Status_y', 'MMRMLH1Tumor_y', 'TissueSite_detailed_y', 'TissueSiteSimple_y',
            'HistologicGrade_detailed_y', 'HistologicGradeSimple_y', 'TumorStage_y', 'NodeStatus_detailed_y',
            'NodeStatusSimple_y', 'MetastasisStatus_y', 'TumorSize_y', 'SizeQuantile_y', 'PID_y', 'Sex_y',
            'Age_y', 'Ethnicity_y', 'Race_y', 'HistologicTypeSimple_x', 'TissueSiteSimple_x'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [154]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,source,site,disease_extent,sex,age,patient,cancer_type,technology,n_cells,sample_primary_met,treated_naive,study,category
0,C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT,C103_T,Malignant,cE01 (Stem/TA-like),2950,Not cycling,Tumor,LEFT colon (sigmoid colon),not entered (Mx),M,45,C103,Colorectal Cancer,10x,6096,,,Pelka2021_Colorectal,Colorectal
1,C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT,C103_T,Malignant,cE01 (Stem/TA-like),406,,Tumor,LEFT colon (sigmoid colon),not entered (Mx),M,45,C103,Colorectal Cancer,10x,6096,,,Pelka2021_Colorectal,Colorectal
2,C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG,C103_T,Malignant,cE03 (Stem/TA-like prolif),6509,G1/S,Tumor,LEFT colon (sigmoid colon),not entered (Mx),M,45,C103,Colorectal Cancer,10x,6096,,,Pelka2021_Colorectal,Colorectal
3,C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT,C103_T,Malignant,cE01 (Stem/TA-like),3357,Not cycling,Tumor,LEFT colon (sigmoid colon),not entered (Mx),M,45,C103,Colorectal Cancer,10x,6096,,,Pelka2021_Colorectal,Colorectal
4,C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC,C103_T,Malignant,cE01 (Stem/TA-like),6147,Not cycling,Tumor,LEFT colon (sigmoid colon),not entered (Mx),M,45,C103,Colorectal Cancer,10x,6096,,,Pelka2021_Colorectal,Colorectal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370110,C173_T_0_0_0_c1_v3_id-TTTGGAGTCATCGGGC,C173_T,Macrophage,cM02 (Macrophage-like),3006,Not cycling,Tumor,LEFT(descending),not entered (Mx),F,49,C173,Colorectal Cancer,10x,2649,,,Pelka2021_Colorectal,Colorectal
370111,C173_T_0_0_0_c1_v3_id-TTTGGAGTCTAGTGTG,C173_T,Malignant,cE03 (Stem/TA-like prolif),4216,G1/S,Tumor,LEFT(descending),not entered (Mx),F,49,C173,Colorectal Cancer,10x,2649,,,Pelka2021_Colorectal,Colorectal
370112,C173_T_0_0_0_c1_v3_id-TTTGTTGCAGCAATTC,C173_T,Malignant,cE03 (Stem/TA-like prolif),772,,Tumor,LEFT(descending),not entered (Mx),F,49,C173,Colorectal Cancer,10x,2649,,,Pelka2021_Colorectal,Colorectal
370113,C173_T_0_0_0_c1_v3_id-TTTGTTGGTTCTGAGT,C173_T,T_cell,cTNI08 (CD4+ Treg),504,,Tumor,LEFT(descending),not entered (Mx),F,49,C173,Colorectal Cancer,10x,2649,,,Pelka2021_Colorectal,Colorectal


In [139]:
adata.obs = adata.obs.rename(columns={"sample_type_x": "source"})

In [150]:
adata.obs = adata.obs.rename(columns={"Sex_x": "sex"})

In [151]:
adata.obs = adata.obs.rename(columns={"Age_x": "age"})

In [140]:
adata.obs = adata.obs.rename(columns={"TissueSite_detailed_x": "site"})

In [143]:
adata.obs = adata.obs.rename(columns={"MetastasisStatus_x": "disease_extent"})

In [145]:
adata.obs['sample_primary_met'] = 'NaN'

In [146]:
adata.obs['treated_naive'] = 'NaN'

In [147]:
adata.obs['study'] = 'Pelka2021_Colorectal'

In [148]:
adata.obs['category'] = 'Colorectal'

In [153]:
output_path = "/home/ubuntu/Downloads/Data_Colorectal/Data_Pelka2021_Colorectal.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Colorectal/Data_Pelka2021_Colorectal.h5ad


#### Data Merging

In [19]:
import scanpy as sc
import anndata
import os

In [4]:

# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Colorectal/Data_Chen2021_Colorectal.h5ad",
    "/home/ubuntu/Downloads/Data_Colorectal/Data_Lee2020_Colorectal.h5ad",
    "/home/ubuntu/Downloads/Data_Colorectal/Data_Li2017_Colorectal.h5ad",
    "/home/ubuntu/Downloads/Data_Colorectal/Data_Zhang2018_Colorectal.h5ad",
    "/home/ubuntu/Downloads/Data_Colorectal/Data_Pelka2021_Colorectal.h5ad"
]

# Load datasets
adatas = [sc.read(file) for file in files]

gc.collect()
# Merge all AnnData objects
adata_merged = anndata.concat(adatas, join="outer", fill_value=0)

gc.collect()
# Fix non-string columns (e.g. 'sample') to avoid h5py write errors
for col in adata_merged.obs.columns:
    if adata_merged.obs[col].dtype == 'object':
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

gc.collect()
# Save merged dataset
output_path = "/home/ubuntu/Downloads/Data_Colorectal/Colorectal_Combined.h5ad"
adata_merged.write(output_path)

print(f"✅ Merged and saved to: {output_path}")


  utils.warn_names_duplicates("obs")


✅ Merged and saved to: /home/ubuntu/Downloads/Data_Colorectal/Colorectal_Combined.h5ad
