In [4]:
import os
import pandas as pd
import scanpy as sc
import anndata
import gc

## Kidney

#### 1. Bi2021

In [3]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Kidney/Data_Bi2021_Kidney"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [23]:
adata

AnnData object with n_obs × n_vars = 34326 × 32718
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source', 'gender', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive', 'study', 'category'

In [11]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'treatment_exposure', 'treatment_response', 'cell_lineage',
            'smoking_status', 'PY', 'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [19]:
adata.obs['source'].value_counts()

source
kidney        22879
abdomen        4637
lung           3744
lymph node     3066
Name: count, dtype: int64

In [25]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,source,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,study,category
0,AAACCTGAGAATAGGG.p55,P55,T_cell,41BB-Hi CD8+ T cell,768,,abdomen,10x,4637,P55,Clear Cell Renal Cell Carcinoma,M,57,metastatic,met,Abdominal Mass,treated,Data_Bi2021_Kidney,Kidney
1,AAACCTGAGGCTAGGT.p55,P55,T_cell,41BB-Hi CD8+ T cell,864,,abdomen,10x,4637,P55,Clear Cell Renal Cell Carcinoma,M,57,metastatic,met,Abdominal Mass,treated,Data_Bi2021_Kidney,Kidney
2,AAACCTGCACTGTGTA.p55,P55,T_cell,41BB-Hi CD8+ T cell,982,,abdomen,10x,4637,P55,Clear Cell Renal Cell Carcinoma,M,57,metastatic,met,Abdominal Mass,treated,Data_Bi2021_Kidney,Kidney
3,AAACCTGCAGTCCTTC.p55,P55,T_cell,MitoHigh T-Helper,367,,abdomen,10x,4637,P55,Clear Cell Renal Cell Carcinoma,M,57,metastatic,met,Abdominal Mass,treated,Data_Bi2021_Kidney,Kidney
4,AAACCTGGTAAATGTG.p55,P55,T_cell,41BB-Lo CD8+ T cell,1185,Not cycling,abdomen,10x,4637,P55,Clear Cell Renal Cell Carcinoma,M,57,metastatic,met,Abdominal Mass,treated,Data_Bi2021_Kidney,Kidney
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34321,TTGGAACGTGAGGGAG.p916,P916,T_cell,Effector T-Helper,893,,lymph node,10x,316,P916,Clear Cell Renal Cell Carcinoma,M,61,metastatic,met,Lymph node,naive,Data_Bi2021_Kidney,Kidney
34322,TTGTAGGGTATGAAAC.p916,P916,T_cell,41BB-Lo CD8+ T cell,2280,Not cycling,lymph node,10x,316,P916,Clear Cell Renal Cell Carcinoma,M,61,metastatic,met,Lymph node,naive,Data_Bi2021_Kidney,Kidney
34323,TTTACTGCACACATGT.p916,P916,T_cell,41BB-Hi CD8+ T cell,1961,Not cycling,lymph node,10x,316,P916,Clear Cell Renal Cell Carcinoma,M,61,metastatic,met,Lymph node,naive,Data_Bi2021_Kidney,Kidney
34324,TTTGTCAAGAGCAATT.p916,P916,T_cell,Effector T-Helper,1363,Not cycling,lymph node,10x,316,P916,Clear Cell Renal Cell Carcinoma,M,61,metastatic,met,Lymph node,naive,Data_Bi2021_Kidney,Kidney


In [14]:
del adata.obs['disease']

In [21]:
del adata.obs['sample_type']

In [24]:
del adata.obs['gender']

In [15]:
adata.obs['study'] = 'Data_Bi2021_Kidney'

In [16]:
adata.obs['category'] = 'Kidney'

In [26]:
output_path = "/home/ubuntu/Downloads/Data_Kidney/Data_Bi2021_Kidney.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Kidney/Data_Bi2021_Kidney.h5ad


#### 2.Sharma2020

In [27]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Kidney/Data_Krishna2021_Kidney"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [55]:
adata

AnnData object with n_obs × n_vars = 167283 × 15588
    obs: 'cell_name', 'sample', 'patient', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'site', 'source', 'treated_naive', 'cancer_type', 'technology', 'n_cells', 'age', 'sex', 'disease_extent', 'sample_primary_met', 'study', 'category'

In [35]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'cluster', 'patient_y', 'source_y', 'type_y', 'treatment_y'    
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [45]:
adata.obs['treated_naive'].value_counts()

treated_naive
Ipi/Nivo mixed response       38660
Ipi/Nivo resistant            35427
Nivo-exposed                  30547
Untreated 1                   26177
Ipi/Nivo complete response    23259
Untreated 2                   13213
Name: count, dtype: int64

In [37]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [39]:
adata.obs = adata.obs.rename(columns={"type_x": "source"})

In [41]:
adata.obs = adata.obs.rename(columns={"source_x": "site"})

In [43]:
adata.obs = adata.obs.rename(columns={"treatment_x": "treated_naive"})

In [54]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,site,source,treated_naive,cancer_type,technology,n_cells,age,sex,disease_extent,sample_primary_met,study,category
0,AAACCTGAGCGTTGCC_1_UT1_Center,UT1_Center,UT1,T_cell,CD8A+ NK-like,1513,Not cycling,Center,Tumor,Untreated 1,Clear Cell Renal Cell Carcinoma,10x,5545,,,,,Data_Krishna2021_Kidney,Kidney
1,AAACCTGAGCTGCCCA_1_UT1_Center,UT1_Center,UT1,T_cell,CD8A+ Proliferating,1652,Not cycling,Center,Tumor,Untreated 1,Clear Cell Renal Cell Carcinoma,10x,5545,,,,,Data_Krishna2021_Kidney,Kidney
2,AAACCTGAGTGGACGT_1_UT1_Center,UT1_Center,UT1,T_cell,CD8A+ Exhausted IEG,1356,Not cycling,Center,Tumor,Untreated 1,Clear Cell Renal Cell Carcinoma,10x,5545,,,,,Data_Krishna2021_Kidney,Kidney
3,AAACCTGCAGGTCGTC_1_UT1_Center,UT1_Center,UT1,NK_cell,Conventional NK,1016,Not cycling,Center,Tumor,Untreated 1,Clear Cell Renal Cell Carcinoma,10x,5545,,,,,Data_Krishna2021_Kidney,Kidney
4,AAACCTGCAGGTCTCG_1_UT1_Center,UT1_Center,UT1,T_cell,CD8A+ Tissue-resident,902,,Center,Tumor,Untreated 1,Clear Cell Renal Cell Carcinoma,10x,5545,,,,,Data_Krishna2021_Kidney,Kidney
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167278,TTTGTCATCCATGAAC_1_t4_PBMC,t4_PBMC,t4,T_cell,CD4+ Naive,582,,PBMC,PBMC,Ipi/Nivo complete response,Clear Cell Renal Cell Carcinoma,10x,10860,,,,,Data_Krishna2021_Kidney,Kidney
167279,TTTGTCATCGCGGATC_1_t4_PBMC,t4_PBMC,t4,NK_cell,Conventional NK,622,,PBMC,PBMC,Ipi/Nivo complete response,Clear Cell Renal Cell Carcinoma,10x,10860,,,,,Data_Krishna2021_Kidney,Kidney
167280,TTTGTCATCTCAAACG_1_t4_PBMC,t4_PBMC,t4,T_cell,CD4+ Naive,641,,PBMC,PBMC,Ipi/Nivo complete response,Clear Cell Renal Cell Carcinoma,10x,10860,,,,,Data_Krishna2021_Kidney,Kidney
167281,TTTGTCATCTGTCTCG_1_t4_PBMC,t4_PBMC,t4,NK_cell,Conventional NK,495,,PBMC,PBMC,Ipi/Nivo complete response,Clear Cell Renal Cell Carcinoma,10x,10860,,,,,Data_Krishna2021_Kidney,Kidney


In [48]:
adata.obs['age'] = 'NaN'

In [49]:
adata.obs['sex'] = 'NaN'

In [50]:
adata.obs['disease_extent'] = 'NaN'

In [51]:
adata.obs['sample_primary_met'] = 'NaN'

In [52]:
adata.obs['study'] = 'Data_Krishna2021_Kidney'

In [53]:
adata.obs['category'] = 'Kidney'

In [56]:
output_path = "/home/ubuntu/Downloads/Data_Kidney/Data_Krishna2021_Kidney.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Kidney/Data_Krishna2021_Kidney.h5ad


#### 3.Obradovic2021

In [57]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Kidney/Data_Obradovic2021_Kidney"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [60]:
adata

AnnData object with n_obs × n_vars = 19781 × 19234
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'source', 'clusters_by_authors', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [59]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [62]:
adata.obs['site'].value_counts()

site
kidney     18348
kidney      1433
Name: count, dtype: int64

In [64]:
adata.obs['cell_subtype'] = 'NaN'

In [65]:
adata.obs['category'] = 'Kidney'

In [66]:
adata.obs['study'] = 'Data_Obradovic2021_Kidney'

In [68]:
del adata.obs['clusters_by_authors']

In [69]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,source,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_subtype,category,study
0,AAACCTGGTAGCACGA_2_1_1_1,Patient1,Endothelial,2988,Not cycling,Tumor,10x,957,Patient1,Clear Cell Renal Cell Carcinoma,,,,primary,kidney,naive,,Kidney,Data_Obradovic2021_Kidney
1,AAACGGGCACCTCGGA_2_1_1_1,Patient1,Malignant,746,,Tumor,10x,957,Patient1,Clear Cell Renal Cell Carcinoma,,,,primary,kidney,naive,,Kidney,Data_Obradovic2021_Kidney
2,AAAGCAAGTGTTAAGA_2_1_1_1,Patient1,Endothelial,1767,Not cycling,Tumor,10x,957,Patient1,Clear Cell Renal Cell Carcinoma,,,,primary,kidney,naive,,Kidney,Data_Obradovic2021_Kidney
3,AAAGTAGGTCCCTTGT_2_1_1_1,Patient1,Malignant,1481,Not cycling,Tumor,10x,957,Patient1,Clear Cell Renal Cell Carcinoma,,,,primary,kidney,naive,,Kidney,Data_Obradovic2021_Kidney
4,AAAGTAGGTCTCTTAT_2_1_1_1,Patient1,Fibroblast,1772,Not cycling,Tumor,10x,957,Patient1,Clear Cell Renal Cell Carcinoma,,,,primary,kidney,naive,,Kidney,Data_Obradovic2021_Kidney
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19776,TTTCGATGTAGTCCTA_13_1,Patient8,Endothelial,1582,Not cycling,Normal,10x,1433,Patient8,Clear Cell Renal Cell Carcinoma,,,,primary,kidney,naive,,Kidney,Data_Obradovic2021_Kidney
19777,TTTCGATTCCTTCTAA_13_1,Patient8,Endothelial,2245,Not cycling,Normal,10x,1433,Patient8,Clear Cell Renal Cell Carcinoma,,,,primary,kidney,naive,,Kidney,Data_Obradovic2021_Kidney
19778,TTTGGAGAGCCGCACT_13_1,Patient8,Endothelial,2122,Not cycling,Normal,10x,1433,Patient8,Clear Cell Renal Cell Carcinoma,,,,primary,kidney,naive,,Kidney,Data_Obradovic2021_Kidney
19779,TTTGTTGAGTGGGAAA_13_1,Patient8,Endothelial,2702,Not cycling,Normal,10x,1433,Patient8,Clear Cell Renal Cell Carcinoma,,,,primary,kidney,naive,,Kidney,Data_Obradovic2021_Kidney


In [70]:
output_path = "/home/ubuntu/Downloads/Data_Kidney/Data_Obradovic2021_Kidney.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Kidney/Data_Obradovic2021_Kidney.h5ad


#### 4. Young2018

In [71]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Kidney/Data_Young2018_Kidney"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [83]:
adata

AnnData object with n_obs × n_vars = 125139 × 33694
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'is_tumor', 'cell_compartment', 'cluster_assignment', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive', 'source'

In [84]:
adata.obs['source'].value_counts()

source
Indistinct    52638
normal        48347
tumor         24154
Name: count, dtype: int64

In [77]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'cell_QCpass', 'technology_y', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [82]:
adata.obs['source'] = adata.obs['is_tumor'].map({
    'NO': 'normal',
    'YES': 'tumor',
    'Indistinct': 'Indistinct'
})

In [86]:
del adata.obs['is_tumor']

In [88]:
del adata.obs['cell_compartment']

In [89]:
del adata.obs['cluster_assignment']

In [92]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,source,category,study
0,4602STDY7018923___AAACGGGGTCCAACTA,Wilms3_Kid_N_C_ldc_1_1,Epithelial,Nephron_epithelium,346,,10x,6595,Wilms3,Wilms Tumor,,2 years 6 months,,primary,kidney,,normal,Kidney,Data_Young2018_Kidney
1,4602STDY7018923___AAAGATGAGGAGTACC,Wilms3_Kid_N_C_ldc_1_1,Endothelial,Endothelium,1376,Not cycling,10x,6595,Wilms3,Wilms Tumor,,2 years 6 months,,primary,kidney,,normal,Kidney,Data_Young2018_Kidney
2,4602STDY7018923___AAAGATGGTGAACCTT,Wilms3_Kid_N_C_ldc_1_1,Epithelial,Nephron_epithelium,238,,10x,6595,Wilms3,Wilms Tumor,,2 years 6 months,,primary,kidney,,normal,Kidney,Data_Young2018_Kidney
3,4602STDY7018923___AAAGTAGTCCCTTGTG,Wilms3_Kid_N_C_ldc_1_1,Epithelial,Nephron_epithelium,3048,Not cycling,10x,6595,Wilms3,Wilms Tumor,,2 years 6 months,,primary,kidney,,normal,Kidney,Data_Young2018_Kidney
4,4602STDY7018923___AAATGCCCAGTAAGAT,Wilms3_Kid_N_C_ldc_1_1,Epithelial,Nephron_epithelium,2032,Not cycling,10x,6595,Wilms3,Wilms Tumor,,2 years 6 months,,primary,kidney,,normal,Kidney,Data_Young2018_Kidney
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125134,4834STDY7002886___TCTCTAAGTGGACGAT,F17_Kid_N_neg_1_1,,,172,,10x,1732,,Normal,,,,,,,Indistinct,Kidney,Data_Young2018_Kidney
125135,4834STDY7002886___TTCTTAGAGTTTGCGT,F17_Kid_N_neg_1_1,,,151,,10x,1732,,Normal,,,,,,,Indistinct,Kidney,Data_Young2018_Kidney
125136,4834STDY7002886___TTGGCAATCAACACGT,F17_Kid_N_neg_1_1,,,198,,10x,1732,,Normal,,,,,,,Indistinct,Kidney,Data_Young2018_Kidney
125137,4834STDY7002886___TTTATGCGTAACGCGA,F17_Kid_N_neg_1_1,,,2175,,10x,1732,,Normal,,,,,,,Indistinct,Kidney,Data_Young2018_Kidney


In [90]:
adata.obs['category'] = 'Kidney'

In [91]:
adata.obs['study'] = 'Data_Young2018_Kidney'

In [93]:
output_path = "/home/ubuntu/Downloads/Data_Kidney/Data_Young2018_Kidney.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Kidney/Data_Young2018_Kidney.h5ad


#### 5.Zhang202

In [5]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Kidney/Data_Zhang2021_Kidney"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect

# Final check
#print(adata)
#print(adata.obs.head())


<function gc.collect(generation=2)>

In [6]:
gc.collect()

28

In [7]:
adata

AnnData object with n_obs × n_vars = 29474 × 33694
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'disease', 'patient_y', 'n_cells', 'technology', 'cancer_type', 'sex', 'age', 'smoking_status', 'PY', 'diagnosis_recurrence', 'disease_extent', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage', 'sample_primary_met', 'size', 'site', 'histology', 'additional_tumor_characterisics', 'grade', 'KI67', 'treated_naive', 'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed', 'targeted_rx_response', 'ICB_exposed', 'ICB_response', 'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed', 'post_sampling_rx_response', 'PFS_DFS', 'OS', 'source'

In [97]:
adata.obs['source'].value_counts()

source
tumor     23328
normal     6146
Name: count, dtype: int64

In [8]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'patient_y', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS'         
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [15]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,n_cells,technology,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,source,study,category
0,SI_18854_AAACCTGCAAGTAGTA-1,SI_18854,SS_2005,Malignant,Malignant,3255,Not cycling,1665,10x,Clear Cell Renal Cell Carcinoma,Male,71,,primary,,,tumor,Data_Zhang2021_Kidney,Kidney
1,SI_18854_AAACCTGTCCACTGGG-1,SI_18854,SS_2005,Malignant,Malignant,3807,G1/S,1665,10x,Clear Cell Renal Cell Carcinoma,Male,71,,primary,,,tumor,Data_Zhang2021_Kidney,Kidney
2,SI_18854_AAACCTGTCCTTTCTC-1,SI_18854,SS_2005,Malignant,Malignant,2770,Not cycling,1665,10x,Clear Cell Renal Cell Carcinoma,Male,71,,primary,,,tumor,Data_Zhang2021_Kidney,Kidney
3,SI_18854_AAACGGGCAAACTGCT-1,SI_18854,SS_2005,Macrophage,Macrophage,2185,Not cycling,1665,10x,Clear Cell Renal Cell Carcinoma,Male,71,,primary,,,tumor,Data_Zhang2021_Kidney,Kidney
4,SI_18854_AAACGGGCAAGGTTTC-1,SI_18854,SS_2005,Malignant,Malignant,2741,Not cycling,1665,10x,Clear Cell Renal Cell Carcinoma,Male,71,,primary,,,tumor,Data_Zhang2021_Kidney,Kidney
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29469,SI_23843_TTTGTCAAGTTGTCGT-1,SI_23843,SS_2026,Endothelial,Endo_ACKR1,1355,Not cycling,3551,10x,Clear Cell Renal Cell Carcinoma,Male,71,,primary,,,tumor,Data_Zhang2021_Kidney,Kidney
29470,SI_23843_TTTGTCACAAGTCTAC-1,SI_23843,SS_2026,Macrophage,Macrophage,2326,Not cycling,3551,10x,Clear Cell Renal Cell Carcinoma,Male,71,,primary,,,tumor,Data_Zhang2021_Kidney,Kidney
29471,SI_23843_TTTGTCAGTTCCGTCT-1,SI_23843,SS_2026,Endothelial,Endo_PLVAP,1603,Not cycling,3551,10x,Clear Cell Renal Cell Carcinoma,Male,71,,primary,,,tumor,Data_Zhang2021_Kidney,Kidney
29472,SI_23843_TTTGTCATCTCTTATG-1,SI_23843,SS_2026,Malignant,Malignant,3070,Not cycling,3551,10x,Clear Cell Renal Cell Carcinoma,Male,71,,primary,,,tumor,Data_Zhang2021_Kidney,Kidney


In [10]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [11]:
del adata.obs['disease']

In [12]:
del adata.obs['additional_tumor_characterisics']

In [13]:
adata.obs['study'] = 'Data_Zhang2021_Kidney'

In [14]:
adata.obs['category'] = 'Kidney'

In [16]:
output_path = "/home/ubuntu/Downloads/Data_Kidney/Data_Zhang2021_Kidney.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Kidney/Data_Zhang2021_Kidney.h5ad


#### Data Merging

In [19]:
import scanpy as sc
import anndata
import os

In [21]:

# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Kidney/Data_Bi2021_Kidney.h5ad",
    "/home/ubuntu/Downloads/Data_Kidney/Data_Krishna2021_Kidney.h5ad",
    "/home/ubuntu/Downloads/Data_Kidney/Data_Obradovic2021_Kidney.h5ad",
    "/home/ubuntu/Downloads/Data_Kidney/Data_Young2018_Kidney.h5ad",
    "/home/ubuntu/Downloads/Data_Kidney/Data_Zhang2021_Kidney.h5ad"
]

# Load datasets
adatas = [sc.read(file) for file in files]

# Merge all AnnData objects
adata_merged = anndata.concat(adatas, join="outer", fill_value=0)

# Fix non-string columns (e.g. 'sample') to avoid h5py write errors
for col in adata_merged.obs.columns:
    if adata_merged.obs[col].dtype == 'object':
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

# Save merged dataset
output_path = "/home/ubuntu/Downloads/Data_Kidney/Kidney_Combined.h5ad"
adata_merged.write(output_path)

print(f"✅ Merged and saved to: {output_path}")


  utils.warn_names_duplicates("obs")


✅ Merged and saved to: /home/ubuntu/Downloads/Data_Kidney/Kidney_Combined.h5ad
