In [2]:
import os
import pandas as pd
import scanpy as sc
import anndata
import gc
import scipy.io
from scipy.sparse import vstack

## Pancreas

#### 1. Hwang2022_Pancreas

In [20]:
import os
import gc
import pandas as pd
import scanpy as sc
from scipy.sparse import vstack, csr_matrix

# Base path
base_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Hwang2022_Pancreas"

# Step 1: Read gene names
genes_path = os.path.join(base_path, "genes.txt")
genes = pd.read_csv(genes_path, header=None)
gene_names = genes[0].values

# Step 2: Read and concatenate expression matrices and cell metadata
all_exprs = []
all_cells = []

for i in range(1, 4):  # Group1 to Group3
    group_path = os.path.join(base_path, f"Group{i}")
    
    # Load expression matrix (Matrix Market format assumed)
    mtx_path = os.path.join(group_path, f"Exp_data_UMIcounts{i}.mtx")
    expr = sc.read_mtx(mtx_path).T  # Transpose to shape: cells x genes
    all_exprs.append(expr.X)
    
    # Load cell metadata
    cells_path = os.path.join(group_path, f"Cells{i}.csv")
    cells_df = pd.read_csv(cells_path)
    all_cells.append(cells_df)

# Combine all expression matrices and metadata
combined_expr = vstack(all_exprs)
combined_cells = pd.concat(all_cells, ignore_index=True)

# Step 3: Create AnnData object
adata = sc.AnnData(X=combined_expr)
adata.var_names = gene_names
adata.var_names_make_unique()
adata.obs = combined_cells

gc.collect()

# Step 4: Merge with sample metadata
samples_path = os.path.join(base_path, "Samples.csv")
samples = pd.read_csv(samples_path)

# Merge on the 'sample' column
adata.obs = adata.obs.merge(samples, on="sample", how="left")

# Optional: Check for non-string types that could break .write()
for col in adata.obs.columns:
    if adata.obs[col].dtype.name not in ["category", "object", "bool"]:
        adata.obs[col] = adata.obs[col].astype(str)

# Optional: Save as h5ad
#output_path = os.path.join(base_path, "Data_Pelka2021_Colorectal.h5ad")
#adata.write(output_path)
#print(f"✅ AnnData object saved to: {output_path}")

gc.collect()


0

In [21]:
adata

AnnData object with n_obs × n_vars = 224988 × 22164
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'disease', 'source_x', 'cancer_type', 'source_y', 'no_cells', 'technology'

In [22]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'HTAN.Parent.Data.File.ID', 'source_y','disease',
            'patient_y', 'source_y', 'Polyp_Type_y'        
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [16]:
adata.obs['technology'].value_counts()

technology
10X    224988
Name: count, dtype: int64

In [40]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,source,cancer_type,no_cells,technology,patient,sex,age,disease_extent,sample_primary_met,site,treated_naive,study,category
0,AGAACCTTCTGGGCAC-1-0,T25,Epithelial,Ductal (atypical),3590,Not cycling,Pancreas Primary,Pancreatic Ductal Adenocarcinoma,2921,10X,T25,,,,,Pancreas,,Hwang2022_Pancreas,Pancreas
1,AAAGGTATCTCTCTAA-1-0,T25,Malignant,Malignant,3533,Not cycling,Pancreas Primary,Pancreatic Ductal Adenocarcinoma,2921,10X,T25,,,,,Pancreas,,Hwang2022_Pancreas,Pancreas
2,TGAATGCGTCGCTCGA-1-0,T25,Malignant,Malignant,3800,Not cycling,Pancreas Primary,Pancreatic Ductal Adenocarcinoma,2921,10X,T25,,,,,Pancreas,,Hwang2022_Pancreas,Pancreas
3,CGTGCTTTCACTGATG-1-0,T25,Malignant,Malignant,3722,Not cycling,Pancreas Primary,Pancreatic Ductal Adenocarcinoma,2921,10X,T25,,,,,Pancreas,,Hwang2022_Pancreas,Pancreas
4,TTGGGCGAGTTGCTCA-1-0,T25,Malignant,Malignant,3859,Not cycling,Pancreas Primary,Pancreatic Ductal Adenocarcinoma,2921,10X,T25,,,,,Pancreas,,Hwang2022_Pancreas,Pancreas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224983,CTGTAGAAGATCGACG-1-45,T9,Epithelial,Ductal,130,,Pancreas Primary,Pancreatic Ductal Adenocarcinoma,3276,10X,T9,,,,,Pancreas,,Hwang2022_Pancreas,Pancreas
224984,GTGGAAGTCAGAGTTC-1-45,T9,Epithelial,Ductal,143,,Pancreas Primary,Pancreatic Ductal Adenocarcinoma,3276,10X,T9,,,,,Pancreas,,Hwang2022_Pancreas,Pancreas
224985,CTAACCCAGCTCGAAG-1-45,T9,Epithelial,Ductal,111,,Pancreas Primary,Pancreatic Ductal Adenocarcinoma,3276,10X,T9,,,,,Pancreas,,Hwang2022_Pancreas,Pancreas
224986,CTCCTTTCACAATGAA-1-45,T9,Epithelial,Ductal,149,,Pancreas Primary,Pancreatic Ductal Adenocarcinoma,3276,10X,T9,,,,,Pancreas,,Hwang2022_Pancreas,Pancreas


In [23]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [41]:
adata.obs = adata.obs.rename(columns={"no_cells": "n_cells"})

In [25]:
adata.obs['patient'] = adata.obs['sample'] 

In [26]:
adata.obs['sex'] = 'NaN'

In [27]:
adata.obs['age'] = 'NaN'

In [28]:
adata.obs['disease_extent'] = 'NaN'

In [29]:
adata.obs['sample_primary_met'] = 'NaN'

In [30]:
adata.obs['site'] = 'Pancreas'

In [31]:
adata.obs['treated_naive'] = 'NaN'

In [32]:
adata.obs['study'] = 'Hwang2022_Pancreas'

In [33]:
adata.obs['category'] = 'Pancreas'

In [42]:
output_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Hwang2022_Pancreas.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Pancreas/Data_Hwang2022_Pancreas.h5ad


#### 2.Lin2020_Pancreas

In [43]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Lin2020_Pancreas"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [45]:
adata

AnnData object with n_obs × n_vars = 14926 × 22217
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [44]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [46]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive
0,P03:1,P03,Malignant,Malignant-EMT,3781,Not cycling,10x,830,P03,Pancreatic Ductal Adenocarcinoma,M,72,node positive,primary,Pancreas,
1,P03:2,P03,Malignant,Malignant-EMT,2055,Not cycling,10x,830,P03,Pancreatic Ductal Adenocarcinoma,M,72,node positive,primary,Pancreas,
2,P03:3,P03,Malignant,Malignant-EMT,1397,Not cycling,10x,830,P03,Pancreatic Ductal Adenocarcinoma,M,72,node positive,primary,Pancreas,
3,P03:4,P03,Fibroblast,Fibroblast,3468,Not cycling,10x,830,P03,Pancreatic Ductal Adenocarcinoma,M,72,node positive,primary,Pancreas,
4,P03:5,P03,Malignant,Malignant-EMT,6130,Intermediate,10x,830,P03,Pancreatic Ductal Adenocarcinoma,M,72,node positive,primary,Pancreas,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14921,MET06:2490,MET06,Malignant,Malignant-epithelial,2264,Not cycling,10x,2494,MET06,Pancreatic Ductal Adenocarcinoma,M,67,metastatic,met,Liver,
14922,MET06:2491,MET06,Malignant,Malignant-epithelial,1177,Not cycling,10x,2494,MET06,Pancreatic Ductal Adenocarcinoma,M,67,metastatic,met,Liver,
14923,MET06:2492,MET06,Malignant,Malignant-epithelial,1619,Not cycling,10x,2494,MET06,Pancreatic Ductal Adenocarcinoma,M,67,metastatic,met,Liver,
14924,MET06:2493,MET06,Malignant,Malignant-epithelial,1996,Not cycling,10x,2494,MET06,Pancreatic Ductal Adenocarcinoma,M,67,metastatic,met,Liver,


In [47]:
adata.obs['source'] = 'NaN'

In [48]:
adata.obs['study'] = 'Lin2020_Pancreas'

In [49]:
adata.obs['category'] = 'Pancreas'

In [50]:
output_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Lin2020_Pancreas.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Pancreas/Data_Lin2020_Pancreas.h5ad


#### 3.Moncada2020_Pancreas

In [51]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Moncada2020_Pancreas"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [54]:
adata

AnnData object with n_obs × n_vars = 3659 × 19738
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [53]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 'sample_x'     
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [59]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,source,category,study
0,PDAC_A_1,PDAC_A,Epithelial,Acinar,1951,Not cycling,inDrop,1926,PDAC_A,Pancreatic Ductal Adenocarcinoma,,,,primary,pancreas,naive,,Pancreas,Moncada2020_Pancreas
1,PDAC_A_2,PDAC_A,Epithelial,Ductal - terminal ductal like,4266,Not cycling,inDrop,1926,PDAC_A,Pancreatic Ductal Adenocarcinoma,,,,primary,pancreas,naive,,Pancreas,Moncada2020_Pancreas
2,PDAC_A_3,PDAC_A,Epithelial,Ductal - terminal ductal like,3948,Intermediate,inDrop,1926,PDAC_A,Pancreatic Ductal Adenocarcinoma,,,,primary,pancreas,naive,,Pancreas,Moncada2020_Pancreas
3,PDAC_A_4,PDAC_A,Epithelial,Ductal - CRISP3 high/centroacinar like,4314,Not cycling,inDrop,1926,PDAC_A,Pancreatic Ductal Adenocarcinoma,,,,primary,pancreas,naive,,Pancreas,Moncada2020_Pancreas
4,PDAC_A_5,PDAC_A,Malignant,Malignant,3542,Not cycling,inDrop,1926,PDAC_A,Pancreatic Ductal Adenocarcinoma,,,,primary,pancreas,naive,,Pancreas,Moncada2020_Pancreas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3654,PDAC_B_1729,PDAC_B,Monocyte,Monocyte,590,,inDrop,1733,PDAC_B,Pancreatic Ductal Adenocarcinoma,,,,primary,pancreas,naive,,Pancreas,Moncada2020_Pancreas
3655,PDAC_B_1730,PDAC_B,Malignant,Malignant,791,,inDrop,1733,PDAC_B,Pancreatic Ductal Adenocarcinoma,,,,primary,pancreas,naive,,Pancreas,Moncada2020_Pancreas
3656,PDAC_B_1731,PDAC_B,Epithelial,Ductal - terminal ductal like,785,,inDrop,1733,PDAC_B,Pancreatic Ductal Adenocarcinoma,,,,primary,pancreas,naive,,Pancreas,Moncada2020_Pancreas
3657,PDAC_B_1732,PDAC_B,Monocyte,Monocyte,524,,inDrop,1733,PDAC_B,Pancreatic Ductal Adenocarcinoma,,,,primary,pancreas,naive,,Pancreas,Moncada2020_Pancreas


In [56]:
adata.obs['source'] = 'NaN'

In [57]:
adata.obs['category'] = 'Pancreas'

In [58]:
adata.obs['study'] = 'Moncada2020_Pancreas'

In [60]:
output_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Moncada2020_Pancreas.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Pancreas/Data_Moncada2020_Pancreas.h5ad


#### 4. Peng2019_Pancreas

In [61]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Peng2019_Pancreas"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [64]:
adata

AnnData object with n_obs × n_vars = 27953 × 18302
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [63]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 'sample_x'     
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [70]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,source,category,study
0,T11_AAACCTGAGTCATCCA,T11,Fibroblast,Fibroblast,3940,Not cycling,10x,3133,T11,Pancreatic Ductal Adenocarcinoma,M,51,node positive,Primary,body and tail,NAIVE,,Pancreas,Peng2019_Pancreas
1,T11_AAACCTGCATGCAACT,T11,Epithelial,Ductal_1,2189,Not cycling,10x,3133,T11,Pancreatic Ductal Adenocarcinoma,M,51,node positive,Primary,body and tail,NAIVE,,Pancreas,Peng2019_Pancreas
2,T11_AAACCTGGTATATGGA,T11,Fibroblast,Fibroblast,2434,Not cycling,10x,3133,T11,Pancreatic Ductal Adenocarcinoma,M,51,node positive,Primary,body and tail,NAIVE,,Pancreas,Peng2019_Pancreas
3,T11_AAACCTGGTGAGTGAC,T11,,,1248,Not cycling,10x,3133,T11,Pancreatic Ductal Adenocarcinoma,M,51,node positive,Primary,body and tail,NAIVE,,Pancreas,Peng2019_Pancreas
4,T11_AAACCTGGTGCACCAC,T11,Macrophage,Macrophage,2422,Not cycling,10x,3133,T11,Pancreatic Ductal Adenocarcinoma,M,51,node positive,Primary,body and tail,NAIVE,,Pancreas,Peng2019_Pancreas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27948,T9_TTTGTCAGTCTCCACT,T9,Malignant,Malignant,4828,Not cycling,10x,2354,T9,Pancreatic Ductal Adenocarcinoma,M,36,local,Primary,head,NAIVE,,Pancreas,Peng2019_Pancreas
27949,T9_TTTGTCAGTTACCAGT,T9,Macrophage,Macrophage,1744,Not cycling,10x,2354,T9,Pancreatic Ductal Adenocarcinoma,M,36,local,Primary,head,NAIVE,,Pancreas,Peng2019_Pancreas
27950,T9_TTTGTCAGTTGCGCAC,T9,Epithelial,Ductal_1,2146,Not cycling,10x,2354,T9,Pancreatic Ductal Adenocarcinoma,M,36,local,Primary,head,NAIVE,,Pancreas,Peng2019_Pancreas
27951,T9_TTTGTCATCCGCATAA,T9,Malignant,Malignant,2505,Not cycling,10x,2354,T9,Pancreatic Ductal Adenocarcinoma,M,36,local,Primary,head,NAIVE,,Pancreas,Peng2019_Pancreas


In [66]:
adata.obs['source'] = 'NaN'

In [67]:
adata.obs['category'] = 'Pancreas'

In [68]:
adata.obs['study'] = 'Peng2019_Pancreas'

In [69]:
output_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Peng2019_Pancreas.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Pancreas/Data_Peng2019_Pancreas.h5ad


#### 5.Raghavan2021_Pancreas

In [71]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Raghavan2021_Pancreas"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [82]:
adata

AnnData object with n_obs × n_vars = 24412 × 16814
    obs: 'cell_name', 'sample', 'patient', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'disease', 'source', 'cancer_type', 'technology', 'n_cells'

In [118]:
adata.obs['MetastasisStatus_x'].value_counts()

MetastasisStatus_x
not entered (Mx)                                                                                                                 247098
M1a                                                                                                                                4085
pM1c (Metastases the peritoneal surface, alone or with other site or organ metastases): Sites involved: Liver and peritoneum.      3091
M1c                                                                                                                                2977
Name: count, dtype: int64

In [84]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 'patient_y',
            'mp_assignment', 'MMR_IHC_x', 'MMRStatus_x', 'MLH1Status_x', 'MMRMLH1Tumor_x',
            'HistologicGrade_detailed_x', 'HistologicGradeSimple_x', 'TumorStage_x', 'disease',
            'NodeStatus_detailed_x', 'NodeStatusSimple_x', 'TumorSize_x', 'SizeQuantile_x', 'PID_x',
            'Ethnicity_x', 'Race_x', 'sample_type_y', 'HistologicTypeSimple_y', 'MSIStatus', 'MMR_IHC_y',
            'MMRStatus_y', 'MLH1Status_y', 'MMRMLH1Tumor_y', 'TissueSite_detailed_y', 'TissueSiteSimple_y',
            'HistologicGrade_detailed_y', 'HistologicGradeSimple_y', 'TumorStage_y', 'NodeStatus_detailed_y',
            'NodeStatusSimple_y', 'MetastasisStatus_y', 'TumorSize_y', 'SizeQuantile_y', 'PID_y', 'Sex_y',
            'Age_y', 'Ethnicity_y', 'Race_y', 'HistologicTypeSimple_x', 'TissueSiteSimple_x'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [95]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,source,cancer_type,technology,n_cells,sex,age,disease_extent,sample_primary_met,treated_naive,study,category,site
0,Biopsy_PANFR0383_T2_1digest_AAACAATCTAGT,PANFR0383_Biopsy_None,PANFR0383,Malignant,Malignant,499,,Liver,Pancreatic Ductal Adenocarcinoma,Seq-Well,1008,,,,,,Raghavan2021_Pancreas,Pancreas,Liver
1,Biopsy_PANFR0383_T2_1digest_AAACACAGCTTT,PANFR0383_Biopsy_None,PANFR0383,Malignant,Malignant,1960,Not cycling,Liver,Pancreatic Ductal Adenocarcinoma,Seq-Well,1008,,,,,,Raghavan2021_Pancreas,Pancreas,Liver
2,Biopsy_PANFR0383_T2_1digest_AAACGTTGAGTG,PANFR0383_Biopsy_None,PANFR0383,Malignant,Malignant,1317,Not cycling,Liver,Pancreatic Ductal Adenocarcinoma,Seq-Well,1008,,,,,,Raghavan2021_Pancreas,Pancreas,Liver
3,Biopsy_PANFR0383_T2_1digest_AAAGCAGCATCC,PANFR0383_Biopsy_None,PANFR0383,Macrophage,Macrophage,614,,Liver,Pancreatic Ductal Adenocarcinoma,Seq-Well,1008,,,,,,Raghavan2021_Pancreas,Pancreas,Liver
4,Biopsy_PANFR0383_T2_1digest_AAATACAAGGAC,PANFR0383_Biopsy_None,PANFR0383,Malignant,Malignant,1336,Not cycling,Liver,Pancreatic Ductal Adenocarcinoma,Seq-Well,1008,,,,,,Raghavan2021_Pancreas,Pancreas,Liver
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24407,Biopsy_PANFR0473_T3_1_TGTCCTACTACC,PANFR0473_Biopsy_None,PANFR0473,T_cell,T_NK,918,,Lung,Pancreatic Ductal Adenocarcinoma,Seq-Well,1370,,,,,,Raghavan2021_Pancreas,Pancreas,Lung
24408,Biopsy_PANFR0473_T3_1_TGTGGATCCTAG,PANFR0473_Biopsy_None,PANFR0473,Plasma,Plasma,1229,Not cycling,Lung,Pancreatic Ductal Adenocarcinoma,Seq-Well,1370,,,,,,Raghavan2021_Pancreas,Pancreas,Lung
24409,Biopsy_PANFR0473_T3_1_TTATCGGTAACA,PANFR0473_Biopsy_None,PANFR0473,T_cell,T_NK,743,,Lung,Pancreatic Ductal Adenocarcinoma,Seq-Well,1370,,,,,,Raghavan2021_Pancreas,Pancreas,Lung
24410,Biopsy_PANFR0473_T3_1_TTCACCTAATGT,PANFR0473_Biopsy_None,PANFR0473,B_cell,B_cell,1554,Not cycling,Lung,Pancreatic Ductal Adenocarcinoma,Seq-Well,1370,,,,,,Raghavan2021_Pancreas,Pancreas,Lung


In [76]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [94]:
adata.obs['site']=adata.obs['source'].copy()

In [86]:
adata.obs['sex'] = 'NaN'

In [87]:
adata.obs['age'] = 'NaN'

In [88]:
adata.obs['disease_extent'] = 'NaN'

In [89]:
adata.obs['sample_primary_met'] = 'NaN'

In [90]:
adata.obs['treated_naive'] = 'NaN'

In [91]:
adata.obs['study'] = 'Raghavan2021_Pancreas'

In [92]:
adata.obs['category'] = 'Pancreas'

In [96]:
output_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Raghavan2021_Pancreas.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Pancreas/Data_Raghavan2021_Pancreas.h5ad


#### 6.Steele2020_Pancreas

In [97]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Steele2020_Pancreas"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [100]:
adata

AnnData object with n_obs × n_vars = 48570 × 32738
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'additional_tumor_characterisics', 'treated_naive'

In [99]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [105]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,additional_tumor_characterisics,treated_naive,source,category,study
0,PDAC_TISSUE_1_AAACGAAAGTGGAAAG-1,PDAC_TISSUE_1,,267,,10x,1633,PDAC_TISSUE_1,Pancreatic Ductal Adenocarcinoma,,,,,,,,,Pancreas,Steele2020_Pancreas
1,PDAC_TISSUE_1_AAACGAAGTAGGGTAC-1,PDAC_TISSUE_1,Malignant,4630,Not cycling,10x,1633,PDAC_TISSUE_1,Pancreatic Ductal Adenocarcinoma,,,,,,,,,Pancreas,Steele2020_Pancreas
2,PDAC_TISSUE_1_AAACGAAGTCATAGTC-1,PDAC_TISSUE_1,Malignant,7961,Not cycling,10x,1633,PDAC_TISSUE_1,Pancreatic Ductal Adenocarcinoma,,,,,,,,,Pancreas,Steele2020_Pancreas
3,PDAC_TISSUE_1_AAACGCTGTAATCAGA-1,PDAC_TISSUE_1,,158,,10x,1633,PDAC_TISSUE_1,Pancreatic Ductal Adenocarcinoma,,,,,,,,,Pancreas,Steele2020_Pancreas
4,PDAC_TISSUE_1_AAAGAACCATTAAAGG-1,PDAC_TISSUE_1,Macrophage,4032,Not cycling,10x,1633,PDAC_TISSUE_1,Pancreatic Ductal Adenocarcinoma,,,,,,,,,Pancreas,Steele2020_Pancreas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48565,PDAC_TISSUE_16_TTTCCTCTCTGGTATG-1,PDAC_TISSUE_16,Macrophage,2726,Not cycling,10x,962,PDAC_TISSUE_16,Pancreatic Ductal Adenocarcinoma,,,,,,,,,Pancreas,Steele2020_Pancreas
48566,PDAC_TISSUE_16_TTTGCGCGTAGCGTCC-1,PDAC_TISSUE_16,Malignant,6013,Not cycling,10x,962,PDAC_TISSUE_16,Pancreatic Ductal Adenocarcinoma,,,,,,,,,Pancreas,Steele2020_Pancreas
48567,PDAC_TISSUE_16_TTTGGTTCATGCATGT-1,PDAC_TISSUE_16,Fibroblast,3309,Not cycling,10x,962,PDAC_TISSUE_16,Pancreatic Ductal Adenocarcinoma,,,,,,,,,Pancreas,Steele2020_Pancreas
48568,PDAC_TISSUE_16_TTTGGTTTCAACGGCC-1,PDAC_TISSUE_16,Malignant,3680,Not cycling,10x,962,PDAC_TISSUE_16,Pancreatic Ductal Adenocarcinoma,,,,,,,,,Pancreas,Steele2020_Pancreas


In [102]:
adata.obs['source'] = 'NaN'

In [103]:
adata.obs['category'] = 'Pancreas'

In [104]:
adata.obs['study'] = 'Steele2020_Pancreas'

In [106]:
output_path = "/home/ubuntu/Downloads/Data_Pancreas/Data_Steele2020_Pancreas.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Pancreas/Data_Steele2020_Pancreas.h5ad


#### Data Merging

In [19]:
import scanpy as sc
import anndata
import os

In [3]:

# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Pancreas/Data_Hwang2022_Pancreas.h5ad",
    "/home/ubuntu/Downloads/Data_Pancreas/Data_Lin2020_Pancreas.h5ad",
    "/home/ubuntu/Downloads/Data_Pancreas/Data_Moncada2020_Pancreas.h5ad",
    "/home/ubuntu/Downloads/Data_Pancreas/Data_Peng2019_Pancreas.h5ad",
    "/home/ubuntu/Downloads/Data_Pancreas/Data_Raghavan2021_Pancreas.h5ad",
    "/home/ubuntu/Downloads/Data_Pancreas/Data_Steele2020_Pancreas.h5ad"
]

# Load datasets
adatas = [sc.read(file) for file in files]

gc.collect()
# Merge all AnnData objects
adata_merged = anndata.concat(adatas, join="outer", fill_value=0)

gc.collect()
# Fix non-string columns (e.g. 'sample') to avoid h5py write errors
for col in adata_merged.obs.columns:
    if adata_merged.obs[col].dtype == 'object':
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

gc.collect()
# Save merged dataset
output_path = "/home/ubuntu/Downloads/Data_Pancreas/Pancreas_Combined.h5ad"
adata_merged.write(output_path)

print(f"✅ Merged and saved to: {output_path}")


  utils.warn_names_duplicates("obs")


✅ Merged and saved to: /home/ubuntu/Downloads/Data_Pancreas/Pancreas_Combined.h5ad


In [7]:
adata_merged

AnnData object with n_obs × n_vars = 344508 × 36388
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source', 'cancer_type', 'n_cells', 'technology', 'patient', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive', 'study', 'category'

In [6]:
del adata_merged.obs['additional_tumor_characterisics']

In [8]:
# Save merged dataset
output_path = "/home/ubuntu/Downloads/Data_Pancreas/Pancreas_Combined.h5ad"
adata_merged.write(output_path)