In [2]:
import os
import pandas as pd
import scanpy as sc
import anndata
import gc
import scipy
from scipy.sparse import vstack, csr_matrix

## Neuroendocrine

#### 1. Dong2020_Neuroendocrine

In [10]:
import os
import pandas as pd
import scanpy as sc
import scipy.io
import numpy as np
import gc

# Base path
base_path = "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Dong2020_Neuroendocrine"

# Paths to Group1 and Group2
group_paths = {
    "Group1": os.path.join(base_path, "Group1"),
    "Group2": os.path.join(base_path, "Group2"),
}

# Initialize lists to collect group-wise data
adatas = []
for group_name, group_path in group_paths.items():
    # Read mtx and transpose (cells x genes)
    adata = sc.read_mtx(os.path.join(group_path, f"Exp_data_UMIcounts{group_name[-1]}.mtx")).transpose()
    
    # Add gene names
    genes = pd.read_csv(os.path.join(group_path, f"Genes{group_name[-1]}.txt"), header=None)
    adata.var_names = genes[0].values
    adata.var_names_make_unique()
    
    # Add cell metadata
    cells = pd.read_csv(os.path.join(group_path, f"Cells{group_name[-1]}.csv"))
    adata.obs = cells
    
    # Tag group info
    adata.obs["group"] = group_name

    # Collect this group's AnnData
    adatas.append(adata)

# Concatenate all groups into one AnnData object
adata = adatas[0].concatenate(adatas[1:], join='outer', index_unique=None)

# Read sample metadata
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge adata.obs with sample metadata on 'sample'
adata.obs = adata.obs.merge(samples, on="sample", how="left")

# Optional: garbage collection
gc.collect()


  adata = adatas[0].concatenate(adatas[1:], join='outer', index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


2296

In [11]:
adata

AnnData object with n_obs × n_vars = 55190 × 33972
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'group', 'batch', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS'

In [16]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'treatment_exposure', 'treatment_response', 'cell_lineage',
            'smoking_status', 'PY', 'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 'group', 'batch'      
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [17]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,source
0,AAACCTGTCTTTAGTC.1,Tumor_10,Malignant,2248,G1/S,10x,886,Tumor_10,Neuroblastoma,M,3Y3M,metastatic,primary,adrenal,naive,
1,AAACGGGGTTTCCACC.1,Tumor_10,Malignant,3165,Not cycling,10x,886,Tumor_10,Neuroblastoma,M,3Y3M,metastatic,primary,adrenal,naive,
2,AAAGATGCAGAGTGTG.1,Tumor_10,Malignant,3442,G2/M,10x,886,Tumor_10,Neuroblastoma,M,3Y3M,metastatic,primary,adrenal,naive,
3,AAAGATGGTACTTCTT.1,Tumor_10,Malignant,3531,Not cycling,10x,886,Tumor_10,Neuroblastoma,M,3Y3M,metastatic,primary,adrenal,naive,
4,AAAGCAAAGAGCTTCT.1,Tumor_10,Malignant,3106,G1/S,10x,886,Tumor_10,Neuroblastoma,M,3Y3M,metastatic,primary,adrenal,naive,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55185,TTTGTTGCATGATAGA-1,Tumor_230,Malignant,2620,Not cycling,10x,12125,Tumor_230,Neuroblastoma,M,4Y8M,local,primary,adrenal,naive,
55186,TTTGTTGCATTGCTTT-1,Tumor_230,Malignant,2764,Not cycling,10x,12125,Tumor_230,Neuroblastoma,M,4Y8M,local,primary,adrenal,naive,
55187,TTTGTTGGTAACTTCG-1,Tumor_230,Malignant,4465,G1/S,10x,12125,Tumor_230,Neuroblastoma,M,4Y8M,local,primary,adrenal,naive,
55188,TTTGTTGGTAGAGATT-1,Tumor_230,,2255,,10x,12125,Tumor_230,Neuroblastoma,M,4Y8M,local,primary,adrenal,naive,


In [14]:
adata.obs['source'].value_counts()

source
kidney        22879
abdomen        4637
lung           3744
lymph node     3066
Name: count, dtype: int64

In [22]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,source,study,category,cell-subtype
0,AAACCTGTCTTTAGTC.1,Tumor_10,Malignant,2248,G1/S,10x,886,Tumor_10,Neuroblastoma,M,3Y3M,metastatic,primary,adrenal,naive,,Dong2020_Neuroendocrine,Neuroendocrine,
1,AAACGGGGTTTCCACC.1,Tumor_10,Malignant,3165,Not cycling,10x,886,Tumor_10,Neuroblastoma,M,3Y3M,metastatic,primary,adrenal,naive,,Dong2020_Neuroendocrine,Neuroendocrine,
2,AAAGATGCAGAGTGTG.1,Tumor_10,Malignant,3442,G2/M,10x,886,Tumor_10,Neuroblastoma,M,3Y3M,metastatic,primary,adrenal,naive,,Dong2020_Neuroendocrine,Neuroendocrine,
3,AAAGATGGTACTTCTT.1,Tumor_10,Malignant,3531,Not cycling,10x,886,Tumor_10,Neuroblastoma,M,3Y3M,metastatic,primary,adrenal,naive,,Dong2020_Neuroendocrine,Neuroendocrine,
4,AAAGCAAAGAGCTTCT.1,Tumor_10,Malignant,3106,G1/S,10x,886,Tumor_10,Neuroblastoma,M,3Y3M,metastatic,primary,adrenal,naive,,Dong2020_Neuroendocrine,Neuroendocrine,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55185,TTTGTTGCATGATAGA-1,Tumor_230,Malignant,2620,Not cycling,10x,12125,Tumor_230,Neuroblastoma,M,4Y8M,local,primary,adrenal,naive,,Dong2020_Neuroendocrine,Neuroendocrine,
55186,TTTGTTGCATTGCTTT-1,Tumor_230,Malignant,2764,Not cycling,10x,12125,Tumor_230,Neuroblastoma,M,4Y8M,local,primary,adrenal,naive,,Dong2020_Neuroendocrine,Neuroendocrine,
55187,TTTGTTGGTAACTTCG-1,Tumor_230,Malignant,4465,G1/S,10x,12125,Tumor_230,Neuroblastoma,M,4Y8M,local,primary,adrenal,naive,,Dong2020_Neuroendocrine,Neuroendocrine,
55188,TTTGTTGGTAGAGATT-1,Tumor_230,,2255,,10x,12125,Tumor_230,Neuroblastoma,M,4Y8M,local,primary,adrenal,naive,,Dong2020_Neuroendocrine,Neuroendocrine,


In [15]:
adata.obs['source'] = 'NaN'

In [21]:
adata.obs['cell-subtype'] = 'NaN'

In [18]:
adata.obs['study'] = 'Dong2020_Neuroendocrine'

In [19]:
adata.obs['category'] = 'Neuroendocrine'

In [24]:
output_path = "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Dong2020_Neuroendocrine.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Neuroendocrine/Data_Dong2020_Neuroendocrine.h5ad


#### 2.Jansky2021_Neuroendocrine

In [25]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Jansky2021_Neuroendocrine"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect



<function gc.collect(generation=2)>

In [29]:
adata

AnnData object with n_obs × n_vars = 64769 × 26344
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'authors_cell_type', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [41]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'treatment_exposure', 'treatment_response', 'cell_lineage',
            'smoking_status', 'PY', 'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 'group', 'batch' , 'authors_cell_type'     
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [40]:
adata.obs['authors_cell_type'].value_counts()

authors_cell_type
Tumor cells          59312
Immune cells          2655
Mesenchymal cells     1297
Liver cells            669
Endothelial cells      559
Schwann cells          277
Name: count, dtype: int64

In [97]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,source,cell_subtype,study,category
0,AAACCTGAGCAATATG_1,NB03,Malignant,1165,Not cycling,10x,5052,NB03,Neuroblastoma,F,> 18 months,,primary,thorax/abdomen,naive,,,Jansky2021_Neuroendocrine,Neuroendocrine
1,AAACCTGAGCGCTTAT_1,NB03,,901,,10x,5052,NB03,Neuroblastoma,F,> 18 months,,primary,thorax/abdomen,naive,,,Jansky2021_Neuroendocrine,Neuroendocrine
2,AAACCTGAGGCACATG_1,NB03,Malignant,1414,Not cycling,10x,5052,NB03,Neuroblastoma,F,> 18 months,,primary,thorax/abdomen,naive,,,Jansky2021_Neuroendocrine,Neuroendocrine
3,AAACCTGAGTGGCACA_1,NB03,Malignant,1051,Not cycling,10x,5052,NB03,Neuroblastoma,F,> 18 months,,primary,thorax/abdomen,naive,,,Jansky2021_Neuroendocrine,Neuroendocrine
4,AAACCTGAGTTCGCAT_1,NB03,,958,,10x,5052,NB03,Neuroblastoma,F,> 18 months,,primary,thorax/abdomen,naive,,,Jansky2021_Neuroendocrine,Neuroendocrine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64764,TTTGTCATCACGATGT_14,NB14,,380,,10x,7964,NB14,Neuroblastoma,M,> 18 months,metastatic,met,occipital subcutaneous bone metastasis,,,,Jansky2021_Neuroendocrine,Neuroendocrine
64765,TTTGTCATCAGTGTTG_14,NB14,Malignant,612,,10x,7964,NB14,Neuroblastoma,M,> 18 months,metastatic,met,occipital subcutaneous bone metastasis,,,,Jansky2021_Neuroendocrine,Neuroendocrine
64766,TTTGTCATCCACGCAG_14,NB14,,443,,10x,7964,NB14,Neuroblastoma,M,> 18 months,metastatic,met,occipital subcutaneous bone metastasis,,,,Jansky2021_Neuroendocrine,Neuroendocrine
64767,TTTGTCATCCTAGAAC_14,NB14,Malignant,519,,10x,7964,NB14,Neuroblastoma,M,> 18 months,metastatic,met,occipital subcutaneous bone metastasis,,,,Jansky2021_Neuroendocrine,Neuroendocrine


In [31]:
adata.obs['source'] = 'NaN'

In [32]:
adata.obs['cell_subtype'] = 'NaN'

In [33]:
adata.obs['study'] = 'Jansky2021_Neuroendocrine'

In [34]:
adata.obs['category'] = 'Neuroendocrine'

In [43]:
output_path = "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Jansky2021_Neuroendocrine.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Neuroendocrine/Data_Jansky2021_Neuroendocrine.h5ad


#### 3.Rao2020_Neuroendocrine

In [44]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Rao2020_Neuroendocrine"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [47]:
adata

AnnData object with n_obs × n_vars = 4739 × 25006
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'complexity', 'patient_y', 'n_cells', 'technology', 'cancer_type', 'age', 'sex', 'disease_extent', 'sample_primary_met', 'site', 'additional_tumor_characterisics', 'treated_naive'

In [50]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 'additional_tumor_characterisics' , 'patient_y'     
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [52]:
adata.obs['cell_cycle_phase'] = 'NaN'

In [53]:
adata.obs['cell_subtype'] = 'NaN'

In [56]:
adata.obs['source'] = 'NaN'

In [54]:
adata.obs['category'] = 'Neuroendocrine'

In [55]:
adata.obs['study'] = 'Rao2020_Neuroendocrine'

In [57]:
adata.obs

Unnamed: 0,cell_name,sample,patient_x,cell_type,complexity,n_cells,technology,cancer_type,age,sex,disease_extent,sample_primary_met,site,treated_naive,cell_cycle_phase,cell_subtype,category,study,source
0,PriNET_AAACCTGAGAAACGCC-1,PriNET,NET,Fibroblast,959,1144,10x,Neuroendocrine Tumor,58,female,metastatic,primary,small intestine,naive,,,Neuroendocrine,Rao2020_Neuroendocrine,
1,PriNET_AAACCTGAGTGGGATC-1,PriNET,NET,Malignant,4707,1144,10x,Neuroendocrine Tumor,58,female,metastatic,primary,small intestine,naive,,,Neuroendocrine,Rao2020_Neuroendocrine,
2,PriNET_AAACCTGAGTGTACGG-1,PriNET,NET,,540,1144,10x,Neuroendocrine Tumor,58,female,metastatic,primary,small intestine,naive,,,Neuroendocrine,Rao2020_Neuroendocrine,
3,PriNET_AAACCTGTCTATCCCG-1,PriNET,NET,Endothelial,1027,1144,10x,Neuroendocrine Tumor,58,female,metastatic,primary,small intestine,naive,,,Neuroendocrine,Rao2020_Neuroendocrine,
4,PriNET_AAACGGGAGCCACCTG-1,PriNET,NET,Macrophage,1152,1144,10x,Neuroendocrine Tumor,58,female,metastatic,primary,small intestine,naive,,,Neuroendocrine,Rao2020_Neuroendocrine,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4734,livMET_TTTGTCAGTCCGAGTC-1,livMET,NET,,450,3595,10x,Neuroendocrine Tumor,58,female,metastatic,met,liver,naive,,,Neuroendocrine,Rao2020_Neuroendocrine,
4735,livMET_TTTGTCAGTTATCACG-1,livMET,NET,Malignant,856,3595,10x,Neuroendocrine Tumor,58,female,metastatic,met,liver,naive,,,Neuroendocrine,Rao2020_Neuroendocrine,
4736,livMET_TTTGTCATCATACGGT-1,livMET,NET,Fibroblast,707,3595,10x,Neuroendocrine Tumor,58,female,metastatic,met,liver,naive,,,Neuroendocrine,Rao2020_Neuroendocrine,
4737,livMET_TTTGTCATCCGTTGTC-1,livMET,NET,,333,3595,10x,Neuroendocrine Tumor,58,female,metastatic,met,liver,naive,,,Neuroendocrine,Rao2020_Neuroendocrine,


In [98]:
output_path = "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Rao2020_Neuroendocrine.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Neuroendocrine/Data_Rao2020_Neuroendocrine.h5ad


#### 4. Kildisiute2021_Neuroendocrine

In [60]:

# Set base path
base_path = "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Kildisiute2021_Neuroendocrine"

# === Load 10X Data ===
path_10x = os.path.join(base_path, "10X")
adata_10x = sc.read_mtx(os.path.join(path_10x, "Exp_data_UMIcounts.mtx")).T  # Transpose to cells x genes
genes_10x = pd.read_csv(os.path.join(path_10x, "Genes.txt"), header=None)[0].tolist()
cells_10x = pd.read_csv(os.path.join(path_10x, "Cells.csv"), index_col=0)

adata_10x.var_names = genes_10x
adata_10x.obs = cells_10x
adata_10x.obs['technology'] = '10X'

# === Load CEL-seq2 Data ===
path_ss2 = os.path.join(base_path, "CEL-seq2")
adata_ss2 = sc.read_mtx(os.path.join(path_ss2, "Exp_data_TPM.mtx")).T
genes_ss2 = pd.read_csv(os.path.join(path_ss2, "Genes.txt"), header=None)[0].tolist()
cells_ss2 = pd.read_csv(os.path.join(path_ss2, "Cells.csv"), index_col=0)

adata_ss2.var_names = genes_ss2
adata_ss2.obs = cells_ss2
adata_ss2.obs['technology'] = 'CEL-seq2'

# === Align by common genes ===
common_genes = adata_10x.var_names.intersection(adata_ss2.var_names)
adata_10x = adata_10x[:, common_genes].copy()
adata_ss2 = adata_ss2[:, common_genes].copy()

# === Concatenate ===
adata_combined = adata_10x.concatenate(
    adata_ss2,
    batch_key='batch',
    batch_categories=['10X', 'CEL-seq2'],
    index_unique=None
)

# === Check uniqueness ===
assert adata_combined.obs_names.is_unique, "Cell names are not unique after concatenation"

# === Merge sample metadata ===
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv")).drop_duplicates(subset=['sample'])

# Preserve and reset index
adata_combined.obs = adata_combined.obs.reset_index()
original_index = adata_combined.obs.columns[0]

# Merge with sample metadata
adata_combined.obs = adata_combined.obs.merge(samples_df, how='left', on='sample')
assert adata_combined.obs.shape[0] == adata_combined.shape[0], "Row count mismatch after metadata merge"

# Restore index
adata_combined.obs = adata_combined.obs.set_index(original_index)
adata_combined.obs.index.name = None



  adata_combined = adata_10x.concatenate(


In [67]:
adata = adata_combined

In [71]:
adata

AnnData object with n_obs × n_vars = 19723 × 32118
    obs: 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology_x', 'batch', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [82]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'cell_QCpass', 'technology_y', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 'batch'  
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [73]:
adata.obs['cell_name'] = adata.obs_names

In [76]:
adata.obs['cell_cycle_phase'] = 'NaN'

In [77]:
adata.obs['cell_subtype'] = 'NaN'

In [78]:
adata.obs['source'] = 'NaN'

In [79]:
adata.obs['category'] = 'Neuroendocrine'

In [80]:
adata.obs['study'] = 'Kildisiute2021_Neuroendocrine'

In [92]:
adata.obs = adata.obs.rename(columns={"technology_x": "technology"})

In [93]:
adata.obs

Unnamed: 0,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_name,cell_subtype,source,category,study
STDY7685340_AAACCTGCACATCCGG,PD42184,T_cell,1319,,10X,3721,PD42184,Normal,,,,primary,,Naive,STDY7685340_AAACCTGCACATCCGG,,,Neuroendocrine,Kildisiute2021_Neuroendocrine
STDY7685340_AAACGGGGTTGAACTC,PD42184,Fibroblast,3222,,10X,3721,PD42184,Normal,,,,primary,,Naive,STDY7685340_AAACGGGGTTGAACTC,,,Neuroendocrine,Kildisiute2021_Neuroendocrine
STDY7685340_AAACGGGGTTGTCGCG,PD42184,Fibroblast,2161,,10X,3721,PD42184,Normal,,,,primary,,Naive,STDY7685340_AAACGGGGTTGTCGCG,,,Neuroendocrine,Kildisiute2021_Neuroendocrine
STDY7685340_AAACGGGTCCGCTGTT,PD42184,Fibroblast,1646,,10X,3721,PD42184,Normal,,,,primary,,Naive,STDY7685340_AAACGGGTCCGCTGTT,,,Neuroendocrine,Kildisiute2021_Neuroendocrine
STDY7685340_AACACGTAGCCCTAAT,PD42184,Endothelial,3756,,10X,3721,PD42184,Normal,,,,primary,,Naive,STDY7685340_AACACGTAGCCCTAAT,,,Neuroendocrine,Kildisiute2021_Neuroendocrine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TM231-P7,000CGH,Malignant,8472,,CEL-seq2,584,000CGH,Neuroblastoma,,,metastatic,primary,,Treated,TM231-P7,,,Neuroendocrine,Kildisiute2021_Neuroendocrine
TM231-P9,000CGH,Malignant,6243,,CEL-seq2,584,000CGH,Neuroblastoma,,,metastatic,primary,,Treated,TM231-P9,,,Neuroendocrine,Kildisiute2021_Neuroendocrine
TM231-P10,000CGH,Malignant,3043,,CEL-seq2,584,000CGH,Neuroblastoma,,,metastatic,primary,,Treated,TM231-P10,,,Neuroendocrine,Kildisiute2021_Neuroendocrine
TM231-P11,000CGH,Malignant,5161,,CEL-seq2,584,000CGH,Neuroblastoma,,,metastatic,primary,,Treated,TM231-P11,,,Neuroendocrine,Kildisiute2021_Neuroendocrine


In [94]:
output_path = "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Kildisiute2021_Neuroendocrine.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Neuroendocrine/Data_Kildisiute2021_Neuroendocrine.h5ad


#### Data Merging

In [None]:
import scanpy as sc
import anndata
import os

In [99]:

# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Kildisiute2021_Neuroendocrine.h5ad",
    "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Dong2020_Neuroendocrine.h5ad",
    "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Jansky2021_Neuroendocrine.h5ad",
    "/home/ubuntu/Downloads/Data_Neuroendocrine/Data_Rao2020_Neuroendocrine.h5ad"
]

# Load datasets
adatas = [sc.read(file) for file in files]

# Merge all AnnData objects
adata_merged = anndata.concat(adatas, join="outer", fill_value=0)

# Fix non-string columns (e.g. 'sample') to avoid h5py write errors
for col in adata_merged.obs.columns:
    if adata_merged.obs[col].dtype == 'object':
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

# Save merged dataset
output_path = "/home/ubuntu/Downloads/Data_Neuroendocrine/Neuroendocrine_Combined.h5ad"
adata_merged.write(output_path)

print(f"✅ Merged and saved to: {output_path}")


  concat_annot = pd.concat(
  utils.warn_names_duplicates("obs")


✅ Merged and saved to: /home/ubuntu/Downloads/Data_Neuroendocrine/Neuroendocrine_Combined.h5ad
