In [2]:
import os
import pandas as pd
import scanpy as sc
import anndata
import gc

## Liver-Biliary

#### 1. Ma2019

In [3]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Ma2019_Liver-Biliary"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [4]:
adata

AnnData object with n_obs × n_vars = 3018 × 17917
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'disease', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'genetic_hormonal_features', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS'

In [5]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [13]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,study,category,source,cell_subtype
0,AAACCTGAGATAGCAT-3,C26,Endothelial,1247,Not cycling,10x,277,C26,Cholangiocarcinoma,male,63,metastatic,primary,liver,treated,Data_Ma2019_Liver-Biliary,Liver-Biliary,,
1,AAACCTGAGATGTAAC-9,H38,Malignant,3524,Not cycling,10x,1001,H38,Hepatocellular Carcinoma,male,74,local,primary,liver,naive,Data_Ma2019_Liver-Biliary,Liver-Biliary,,
2,AAACCTGAGGAATTAC-9,H38,Fibroblast,1562,Not cycling,10x,1001,H38,Hepatocellular Carcinoma,male,74,local,primary,liver,naive,Data_Ma2019_Liver-Biliary,Liver-Biliary,,
3,AAACCTGAGGTACTCT-22,C66,Malignant,2446,Not cycling,10x,388,C66,Cholangiocarcinoma,female,71,metastatic,primary,liver,treated,Data_Ma2019_Liver-Biliary,Liver-Biliary,,
4,AAACCTGAGTCCAGGA-4,C25,Malignant,1144,Not cycling,10x,159,C25,Cholangiocarcinoma,female,47,metastatic,primary,liver,treated,Data_Ma2019_Liver-Biliary,Liver-Biliary,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3013,TTTGTCAAGCTGAACG-22,C66,Malignant,1397,Not cycling,10x,388,C66,Cholangiocarcinoma,female,71,metastatic,primary,liver,treated,Data_Ma2019_Liver-Biliary,Liver-Biliary,,
3014,TTTGTCAAGTGGTAAT-9,H38,Malignant,2200,Not cycling,10x,1001,H38,Hepatocellular Carcinoma,male,74,local,primary,liver,naive,Data_Ma2019_Liver-Biliary,Liver-Biliary,,
3015,TTTGTCACAATAACGA-9,H38,Endothelial,2895,Not cycling,10x,1001,H38,Hepatocellular Carcinoma,male,74,local,primary,liver,naive,Data_Ma2019_Liver-Biliary,Liver-Biliary,,
3016,TTTGTCACAGCTGCTG-3,C26,Malignant,4514,Not cycling,10x,277,C26,Cholangiocarcinoma,male,63,metastatic,primary,liver,treated,Data_Ma2019_Liver-Biliary,Liver-Biliary,,


In [7]:
adata.obs['study'] = 'Data_Ma2019_Liver-Biliary'

In [8]:
adata.obs['category'] = 'Liver-Biliary'

In [9]:
adata.obs['source'] = 'NaN'

In [10]:
adata.obs['cell_subtype'] = 'NaN'

In [12]:
del adata.obs['disease']

In [14]:
output_path = "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Ma2019_Liver-Biliary.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Liver-Biliary/Data_Ma2019_Liver-Biliary.h5ad


#### 2.Sharma2020

In [15]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Sharma2020_Liver-Biliary"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [19]:
adata

AnnData object with n_obs × n_vars = 73589 × 33694
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source_x', 'cancer_type', 'technology', 'n_cells'

In [18]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'viral_x', 'patient_y', 'source_y', 'viral_y'       
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [17]:
adata.obs['viral_x'].value_counts()

viral_x
Negative    57588
Positive    16001
Name: count, dtype: int64

In [33]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,source_x,cancer_type,technology,n_cells,age,sex,disease_extent,sample_primary_met,site,treated_naive,study,category
0,AAACCTGGTCGGGTCT-1,P1_N,P1,T_cell,CD8+ T cell,502,,normal,Hepatocellular Carcinoma,10x,401,,,,,liver,,Data_Sharma2020_Liver-Biliary,Liver-Biliary
1,AAAGCAAGTTCGAATC-1,P1_N,P1,T_cell,CD4+ T cell,414,,normal,Hepatocellular Carcinoma,10x,401,,,,,liver,,Data_Sharma2020_Liver-Biliary,Liver-Biliary
2,AACCATGAGTAATCCC-1,P1_N,P1,NK_cell,NK cell,1276,Not cycling,normal,Hepatocellular Carcinoma,10x,401,,,,,liver,,Data_Sharma2020_Liver-Biliary,Liver-Biliary
3,AACCGCGAGCTCCTCT-1,P1_N,P1,T_cell,CD8+ T cell,787,,normal,Hepatocellular Carcinoma,10x,401,,,,,liver,,Data_Sharma2020_Liver-Biliary,Liver-Biliary
4,AACCGCGCACATGTGT-1,P1_N,P1,T_cell,CD4+ T cell,591,,normal,Hepatocellular Carcinoma,10x,401,,,,,liver,,Data_Sharma2020_Liver-Biliary,Liver-Biliary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73584,TTTGGTTTCCGCATCT-58,P15_T4T5,P15,Hepatocyte,Hepatocyte,2197,Not cycling,tumor,Hepatocellular Carcinoma,10x,1865,,,,,liver,,Data_Sharma2020_Liver-Biliary,Liver-Biliary
73585,TTTGGTTTCTTCAACT-58,P15_T4T5,P15,Hepatocyte,Hepatocyte,428,,tumor,Hepatocellular Carcinoma,10x,1865,,,,,liver,,Data_Sharma2020_Liver-Biliary,Liver-Biliary
73586,TTTGTCAAGGATGCGT-58,P15_T4T5,P15,Hepatocyte,Hepatocyte,477,,tumor,Hepatocellular Carcinoma,10x,1865,,,,,liver,,Data_Sharma2020_Liver-Biliary,Liver-Biliary
73587,TTTGTCATCGAATGCT-58,P15_T4T5,P15,Hepatocyte,Hepatocyte,894,,tumor,Hepatocellular Carcinoma,10x,1865,,,,,liver,,Data_Sharma2020_Liver-Biliary,Liver-Biliary


In [21]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [111]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [23]:
adata.obs['age'] = 'NaN'

In [24]:
adata.obs['sex'] = 'NaN'

In [27]:
adata.obs['disease_extent'] = 'NaN'

In [28]:
adata.obs['sample_primary_met'] = 'NaN'

In [29]:
adata.obs['site'] = 'liver'

In [30]:
adata.obs['treated_naive'] = 'NaN'

In [31]:
adata.obs['study'] = 'Data_Sharma2020_Liver-Biliary'

In [32]:
adata.obs['category'] = 'Liver-Biliary'

In [112]:
output_path = "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Sharma2020_Liver-Biliary.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Liver-Biliary/Data_Sharma2020_Liver-Biliary.h5ad


#### 3.Sun2021

In [35]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Sun2021_Liver-Biliary"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [40]:
adata

AnnData object with n_obs × n_vars = 16498 × 19744
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'source', 'disease', 'cell_subtype', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [37]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [39]:
adata.obs = adata.obs.rename(columns={"cell_subtype_clusters": "cell_subtype"})

In [41]:
adata.obs['category'] = 'Liver-Biliary'

In [42]:
adata.obs['study'] = 'Sun2021_Liver-Biliary'

In [44]:
del adata.obs['disease']

In [45]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,source,cell_subtype,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,category,study
0,P01_T_0001,P01,Malignant,8444,Not cycling,Tumor,C10_Tumor,10x,119,P01,Hepatocellular Carcinoma,F,58,,,,,Liver-Biliary,Sun2021_Liver-Biliary
1,P01_T_0003,P01,Malignant,6481,Not cycling,Tumor,C10_Tumor,10x,119,P01,Hepatocellular Carcinoma,F,58,,,,,Liver-Biliary,Sun2021_Liver-Biliary
2,P01_T_0004,P01,NK_cell,7517,Not cycling,Tumor,C4_NK,10x,119,P01,Hepatocellular Carcinoma,F,58,,,,,Liver-Biliary,Sun2021_Liver-Biliary
3,P01_T_0007,P01,Malignant,6625,Not cycling,Tumor,C10_Tumor,10x,119,P01,Hepatocellular Carcinoma,F,58,,,,,Liver-Biliary,Sun2021_Liver-Biliary
4,P01_T_0008,P01,Endothelial,7891,Not cycling,Tumor,C17_Endo.,10x,119,P01,Hepatocellular Carcinoma,F,58,,,,,Liver-Biliary,Sun2021_Liver-Biliary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16493,P19_T_0138,P19,T_cell,8611,Not cycling,Tumor,C1_Tcell,10x,147,P19,Hepatocellular Carcinoma,M,45,,,,,Liver-Biliary,Sun2021_Liver-Biliary
16494,P19_T_0139,P19,Malignant,8255,Not cycling,Tumor,C14_Tumor,10x,147,P19,Hepatocellular Carcinoma,M,45,,,,,Liver-Biliary,Sun2021_Liver-Biliary
16495,P19_T_0143,P19,Malignant,9421,Not cycling,Tumor,C14_Tumor,10x,147,P19,Hepatocellular Carcinoma,M,45,,,,,Liver-Biliary,Sun2021_Liver-Biliary
16496,P19_T_0144,P19,T_cell,8692,Not cycling,Tumor,C3_Tcell,10x,147,P19,Hepatocellular Carcinoma,M,45,,,,,Liver-Biliary,Sun2021_Liver-Biliary


In [46]:
output_path = "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Sun2021_Liver-Biliary.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Liver-Biliary/Data_Sun2021_Liver-Biliary.h5ad


#### 4. Zheng2017

In [48]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Zheng2017_Liver-Biliary"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [62]:
adata

AnnData object with n_obs × n_vars = 5063 × 22902
    obs: 'cell_name', 'sample', 'patient', 'source', 'cell_type', 'cell_subtype', 'complexity', 'technology', 'n_cells', 'cancer_type', 'sample_primary_met', 'age', 'sex'

In [57]:
adata.obs[''stage''].value_counts()

sorting
CD3+, CD8+            1843
CD3+, CD4+, CD25-     1613
CD3+, CD4+, CD25hi    1607
Name: count, dtype: int64

In [61]:
for col in ['patient_y', 'source_y', 'sorting','cluster', 'stage'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [58]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [59]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [64]:
adata.obs['cell_cycle_phase'] = 'Not cycling'

In [65]:
adata.obs['disease_extent'] = 'NaN'

In [66]:
adata.obs['site'] = 'liver'

In [67]:
adata.obs['treated_naive'] = 'NaN'

In [71]:
adata.obs

Unnamed: 0,cell_name,sample,patient,source,cell_type,cell_subtype,complexity,technology,n_cells,cancer_type,sample_primary_met,age,sex,cell_cycle_phase,disease_extent,site,treated_naive,category,study
0,PTC03-0205,P0205_P,P0205,Peripheral blood,T_cell,CD8+,2761,Smart-seq2,318,Hepatocellular Carcinoma,primary,53,Male,Not cycling,,liver,,Liver-Biliary,Zheng2017_Liver-Biliary
1,PTC10-0205,P0205_P,P0205,Peripheral blood,,,3200,Smart-seq2,318,Hepatocellular Carcinoma,primary,53,Male,Not cycling,,liver,,Liver-Biliary,Zheng2017_Liver-Biliary
2,PTC100-0205,P0205_P,P0205,Peripheral blood,T_cell,CD8+,3224,Smart-seq2,318,Hepatocellular Carcinoma,primary,53,Male,Not cycling,,liver,,Liver-Biliary,Zheng2017_Liver-Biliary
3,PTC103-0205,P0205_P,P0205,Peripheral blood,T_cell,CD8+,4850,Smart-seq2,318,Hepatocellular Carcinoma,primary,53,Male,Not cycling,,liver,,Liver-Biliary,Zheng2017_Liver-Biliary
4,PTC104-0205,P0205_P,P0205,Peripheral blood,T_cell,CD8+,3166,Smart-seq2,318,Hepatocellular Carcinoma,primary,53,Male,Not cycling,,liver,,Liver-Biliary,Zheng2017_Liver-Biliary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5058,TTS47-1202,P1202t_T,P1202t,Tumor,,,1935,Tang2010 protocol,86,Hepatocellular Carcinoma,primary,26,Female,Not cycling,,liver,,Liver-Biliary,Zheng2017_Liver-Biliary
5059,TTS48-1202,P1202t_T,P1202t,Tumor,,,5254,Tang2010 protocol,86,Hepatocellular Carcinoma,primary,26,Female,Not cycling,,liver,,Liver-Biliary,Zheng2017_Liver-Biliary
5060,TTS5-1202,P1202t_T,P1202t,Tumor,,,2435,Tang2010 protocol,86,Hepatocellular Carcinoma,primary,26,Female,Not cycling,,liver,,Liver-Biliary,Zheng2017_Liver-Biliary
5061,TTS7-1202,P1202t_T,P1202t,Tumor,,,1145,Tang2010 protocol,86,Hepatocellular Carcinoma,primary,26,Female,Not cycling,,liver,,Liver-Biliary,Zheng2017_Liver-Biliary


In [69]:
adata.obs['category'] = 'Liver-Biliary'

In [70]:
adata.obs['study'] = 'Zheng2017_Liver-Biliary'

In [72]:
output_path = "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Zheng2017_Liver-Biliary.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Liver-Biliary/Data_Zheng2017_Liver-Biliary.h5ad


#### 5.Zhang2019

In [74]:

# Set base path
base_path = "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Zhang2019_Liver-Biliary"

# === Load 10X Data ===
path_10x = os.path.join(base_path, "10X")
adata_10x = sc.read_mtx(os.path.join(path_10x, "Exp_data_UMIcounts.mtx")).T  # Transpose to cells x genes
genes_10x = pd.read_csv(os.path.join(path_10x, "Genes.txt"), header=None)[0].tolist()
cells_10x = pd.read_csv(os.path.join(path_10x, "Cells.csv"), index_col=0)

adata_10x.var_names = genes_10x
adata_10x.obs = cells_10x
adata_10x.obs['technology'] = '10X'

# === Load SmartSeq2 Data ===
path_ss2 = os.path.join(base_path, "SmartSeq2")
adata_ss2 = sc.read_mtx(os.path.join(path_ss2, "Exp_data_UMIcounts.mtx")).T
genes_ss2 = pd.read_csv(os.path.join(path_ss2, "Genes.txt"), header=None)[0].tolist()
cells_ss2 = pd.read_csv(os.path.join(path_ss2, "Cells.csv"), index_col=0)

adata_ss2.var_names = genes_ss2
adata_ss2.obs = cells_ss2
adata_ss2.obs['technology'] = 'SmartSeq2'

# === Align by common genes ===
common_genes = adata_10x.var_names.intersection(adata_ss2.var_names)
adata_10x = adata_10x[:, common_genes].copy()
adata_ss2 = adata_ss2[:, common_genes].copy()

# === Concatenate ===
adata_combined = adata_10x.concatenate(
    adata_ss2,
    batch_key='batch',
    batch_categories=['10X', 'SmartSeq2'],
    index_unique=None
)

# === Check uniqueness ===
assert adata_combined.obs_names.is_unique, "Cell names are not unique after concatenation"

# === Merge sample metadata ===
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv")).drop_duplicates(subset=['sample'])

# Preserve and reset index
adata_combined.obs = adata_combined.obs.reset_index()
original_index = adata_combined.obs.columns[0]

# Merge with sample metadata
adata_combined.obs = adata_combined.obs.merge(samples_df, how='left', on='sample')
assert adata_combined.obs.shape[0] == adata_combined.shape[0], "Row count mismatch after metadata merge"

# Restore index
adata_combined.obs = adata_combined.obs.set_index(original_index)
adata_combined.obs.index.name = None

# === Save final object ===
#output_path = os.path.join(base_path, "Data_Jerby-Arnon2021_Sarcoma.h5ad")
#adata_combined.write(output_path)
#print(f"✅ AnnData object saved to: {output_path}")


  adata_combined = adata_10x.concatenate(


In [75]:
gc.collect()

2007

In [78]:
adata = adata_combined

In [102]:
adata

AnnData object with n_obs × n_vars = 73261 × 24011
    obs: 'sample', 'patient', 'source', 'site', 'cancer_type', 'cell_type', 'cell_subtype', 'complexity', 'technology', 'sorting', 'sex', 'age', 'sample_primary_met', 'treated_naive', 'cell_name', 'cell_cycle_phase', 'category', 'study'

In [80]:
adata.obs['cell_name'] = adata.obs_names

In [84]:
adata.obs = adata.obs.rename(columns={"location_x": "site"})

In [86]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [87]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [89]:
adata.obs = adata.obs.rename(columns={"cancer_type_x": "cancer_type"})

In [90]:
adata.obs = adata.obs.rename(columns={"technology_x": "technology"})

In [83]:
adata.obs['cell_cycle_phase'] = 'Not cycling'

In [97]:
for col in ['patient_y', 'source_y', 'location_y', 'cancer_type_y', 'n_cells', 
            'technology_y', 'histology', 'hbv_infection', 'stage',
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [99]:
adata.obs['category'] = 'Liver-Biliary'

In [100]:
adata.obs['study'] = 'Zhang2019_Liver-Biliary'

In [95]:
del adata.obs['batch']

In [103]:
del adata.obs['sorting']

In [105]:
adata.obs['disease_extent'] = 'NaN'

In [106]:
adata.obs

Unnamed: 0,sample,patient,source,site,cancer_type,cell_type,cell_subtype,complexity,technology,sex,age,sample_primary_met,treated_naive,cell_name,cell_cycle_phase,category,study,disease_extent
AAACCTGAGAGCCCAA-1-D20171109_A,D20171109_A,DSN09,ascites,ascites,Hepatocellular Carcinoma,Macrophage,Mac,890,10X,male,54,,naive,AAACCTGAGAGCCCAA-1-D20171109_A,Not cycling,Liver-Biliary,Zhang2019_Liver-Biliary,
AAACCTGGTGCTAGCC-1-D20171109_A,D20171109_A,DSN09,ascites,ascites,Hepatocellular Carcinoma,T_cell,CD8,931,10X,male,54,,naive,AAACCTGGTGCTAGCC-1-D20171109_A,Not cycling,Liver-Biliary,Zhang2019_Liver-Biliary,
AAACGGGAGAGACTAT-1-D20171109_A,D20171109_A,DSN09,ascites,ascites,Hepatocellular Carcinoma,T_cell,CD4/CD8,944,10X,male,54,,naive,AAACGGGAGAGACTAT-1-D20171109_A,Not cycling,Liver-Biliary,Zhang2019_Liver-Biliary,
AAACGGGAGCGTAATA-1-D20171109_A,D20171109_A,DSN09,ascites,ascites,Hepatocellular Carcinoma,Macrophage,Mac,1592,10X,male,54,,naive,AAACGGGAGCGTAATA-1-D20171109_A,Not cycling,Liver-Biliary,Zhang2019_Liver-Biliary,
AAACGGGAGCTGAAAT-1-D20171109_A,D20171109_A,DSN09,ascites,ascites,Hepatocellular Carcinoma,Dendritic,DC,1841,10X,male,54,,naive,AAACGGGAGCTGAAAT-1-D20171109_A,Not cycling,Liver-Biliary,Zhang2019_Liver-Biliary,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TP9-88-20171109,D20171109_T,DSN09,tumor,tumor edge,Hepatocellular Carcinoma,Macrophage,Mac,4275,SmartSeq2,male,54,primary,naive,TP9-88-20171109,Not cycling,Liver-Biliary,Zhang2019_Liver-Biliary,
TP9-93-20171109,D20171109_T,DSN09,tumor,tumor edge,Hepatocellular Carcinoma,NK_cell,NK,5482,SmartSeq2,male,54,primary,naive,TP9-93-20171109,Not cycling,Liver-Biliary,Zhang2019_Liver-Biliary,
TP9-94-20171109,D20171109_T,DSN09,tumor,tumor edge,Hepatocellular Carcinoma,Macrophage,Mac,5421,SmartSeq2,male,54,primary,naive,TP9-94-20171109,Not cycling,Liver-Biliary,Zhang2019_Liver-Biliary,
TP9-97-20171109,D20171109_T,DSN09,tumor,tumor edge,Hepatocellular Carcinoma,Macrophage,Mac,3310,SmartSeq2,male,54,primary,naive,TP9-97-20171109,Not cycling,Liver-Biliary,Zhang2019_Liver-Biliary,


In [107]:
output_path = "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Zhang2019_Liver-Biliary.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Liver-Biliary/Data_Zhang2019_Liver-Biliary.h5ad


#### Data Merging

In [None]:
import scanpy as sc
import anndata
import os

In [113]:

# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Ma2019_Liver-Biliary.h5ad",
    "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Sharma2020_Liver-Biliary.h5ad",
    "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Sun2021_Liver-Biliary.h5ad",
    "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Zhang2019_Liver-Biliary.h5ad",
    "/home/ubuntu/Downloads/Data_Liver-Biliary/Data_Zheng2017_Liver-Biliary.h5ad"
]

# Load datasets
adatas = [sc.read(file) for file in files]

# Merge all AnnData objects
adata_merged = anndata.concat(adatas, join="outer", fill_value=0)

# Fix non-string columns (e.g. 'sample') to avoid h5py write errors
for col in adata_merged.obs.columns:
    if adata_merged.obs[col].dtype == 'object':
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

# Save merged dataset
output_path = "/home/ubuntu/Downloads/Data_Liver-Biliary/Liver-Biliary_Combined.h5ad"
adata_merged.write(output_path)

print(f"✅ Merged and saved to: {output_path}")


  utils.warn_names_duplicates("obs")


✅ Merged and saved to: /home/ubuntu/Downloads/Data_Liver-Biliary/Liver-Biliary_Combined.h5ad
