In [1]:
import os
import pandas as pd
import scanpy as sc
import anndata
import gc
import scipy.io
from scipy.sparse import vstack

## Skin

#### 1. Jerby-Arnon2018_Skin

In [9]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Skin/Data_Jerby-Arnon2018_Skin"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_TPM.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect()

33451

In [12]:
adata

AnnData object with n_obs × n_vars = 7186 × 23686
    obs: 'cell_name', 'sample', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'source', 'cell_cohort', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [14]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 'cell_cohort'      
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [6]:
adata.obs['subtype'].value_counts()

subtype
DIF    30619
MES     4590
IMR     4078
PRO     1744
Name: count, dtype: int64

In [19]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,cell_subtype,complexity,cell_cycle_phase,source,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,study,category
0,cy78_CD45_neg_1_B04_S496_comb,Mel78,Malignant,Malignant,8258,Intermediate,post.treatment,SmartSeq2,124,Mel78,Melanoma,M,73,metastatic,met,Small bowel,Post-ICI (resistant),Jerby-Arnon2018_Skin,Skin
1,cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb,Mel79,Malignant,Malignant,2047,Not cycling,treatment.naive,SmartSeq2,894,Mel79,Melanoma,M,74,metastatic,met,Axillary lymph node,Untreated,Jerby-Arnon2018_Skin,Skin
2,CY88_5_B10_S694_comb,Mel88,Malignant,Malignant,5375,Not cycling,post.treatment,SmartSeq2,352,Mel88,Melanoma,F,54,metastatic,met,Cutanoues met,Post-ICI (resistant),Jerby-Arnon2018_Skin,Skin
3,cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb,Mel79,Malignant,Malignant,5648,Not cycling,treatment.naive,SmartSeq2,894,Mel79,Melanoma,M,74,metastatic,met,Axillary lymph node,Untreated,Jerby-Arnon2018_Skin,Skin
4,cy78_CD45_neg_3_H06_S762_comb,Mel78,Malignant,Malignant,7409,Not cycling,post.treatment,SmartSeq2,124,Mel78,Melanoma,M,73,metastatic,met,Small bowel,Post-ICI (resistant),Jerby-Arnon2018_Skin,Skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7181,CY75_1_CD45_CD8_3__S168_comb_BCD8,Mel75,T_cell,T_Cell_CD8,3530,Not cycling,post.treatment,SmartSeq2,506,Mel75,Melanoma,M,80,metastatic,met,Subcutaneous leg lesion,Post-ICI (resistant),Jerby-Arnon2018_Skin,Skin
7182,CY75_1_CD45_CD8_8__S338_comb_BCD8,Mel75,T_cell,T_Cell_CD8,3872,Not cycling,post.treatment,SmartSeq2,506,Mel75,Melanoma,M,80,metastatic,met,Subcutaneous leg lesion,Post-ICI (resistant),Jerby-Arnon2018_Skin,Skin
7183,monika_D7_S132_comb_BCD8_3,Mel75,T_cell,T_Cell_CD8,4589,Not cycling,post.treatment,SmartSeq2,506,Mel75,Melanoma,M,80,metastatic,met,Subcutaneous leg lesion,Post-ICI (resistant),Jerby-Arnon2018_Skin,Skin
7184,CY75_1_CD45_CD8_8__S289_comb_BCD8,Mel75,T_cell,T_Cell_CD8,4614,Not cycling,post.treatment,SmartSeq2,506,Mel75,Melanoma,M,80,metastatic,met,Subcutaneous leg lesion,Post-ICI (resistant),Jerby-Arnon2018_Skin,Skin


In [16]:
adata.obs['study'] = 'Jerby-Arnon2018_Skin'

In [17]:
adata.obs['category'] = 'Skin'

In [18]:
output_path = "/home/ubuntu/Downloads/Data_Skin/Data_Jerby-Arnon2018_Skin.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Skin/Data_Jerby-Arnon2018_Skin.h5ad


#### 2.Biermann2022_Skin

In [20]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Skin/Data_Biermann2022_Skin"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [23]:
adata

AnnData object with n_obs × n_vars = 136973 × 35652
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'disease', 'source', 'technology', 'n_cells', 'patient_y', 'cancer_type', 'sample_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'mutation_hormonal_status', 'treated_naive'

In [31]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 'disease', 'patient_y', 'mutation_hormonal_status', 'sample_type'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [34]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,source,technology,n_cells,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive
0,CTCGTACCAAGCTGGA-1_1,MBM01_sc,MBM01,Plasma,Plasma cells,4017,Not cycling,Brain_Metastasis,10X,3515,Melanoma,,,,met,brain metastasis,naive
1,AGATTGCAGTGGGTTG-1_1,MBM01_sc,MBM01,Plasma,Plasma cells,4269,Not cycling,Brain_Metastasis,10X,3515,Melanoma,,,,met,brain metastasis,naive
2,TCGGGACTCAACACTG-1_1,MBM01_sc,MBM01,Malignant,Tumor cells,8319,G1/S,Brain_Metastasis,10X,3515,Melanoma,,,,met,brain metastasis,naive
3,CTCGGAGCAGCTATTG-1_1,MBM01_sc,MBM01,Malignant,Tumor cells,8229,Not cycling,Brain_Metastasis,10X,3515,Melanoma,,,,met,brain metastasis,naive
4,TGGGAAGAGAACAACT-1_1,MBM01_sc,MBM01,Plasma,Plasma cells,4396,Not cycling,Brain_Metastasis,10X,3515,Melanoma,,,,met,brain metastasis,naive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136968,TACTGCCCATCGAGCC-1_32,MPM11_sn,MPM11,Malignant,Tumor cells,770,,Peripheral_Metastasis,10X,8789,Melanoma,,,,met,brain metastasis,treated
136969,GCACGTGTCGTAATGC-1_32,MPM11_sn,MPM11,Malignant,Tumor cells,718,,Peripheral_Metastasis,10X,8789,Melanoma,,,,met,brain metastasis,treated
136970,ATGGAGGAGTTTGTCG-1_32,MPM11_sn,MPM11,Malignant,Tumor cells,862,,Peripheral_Metastasis,10X,8789,Melanoma,,,,met,brain metastasis,treated
136971,ACCATTTTCCAAGCAT-1_32,MPM11_sn,MPM11,Malignant,Tumor cells,705,,Peripheral_Metastasis,10X,8789,Melanoma,,,,met,brain metastasis,treated


In [33]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [35]:
adata.obs['study'] = 'Biermann2022_Skin'

In [36]:
adata.obs['category'] = 'Skin'

In [39]:
output_path = "/home/ubuntu/Downloads/Data_Skin/Data_Biermann2022_Skin.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Skin/Data_Biermann2022_Skin.h5ad


#### 3.Ferrari de Andrade2019_Skin

In [40]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Skin/Data_Ferrari de Andrade2019_Skin"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [44]:
adata

AnnData object with n_obs × n_vars = 40823 × 24770
    obs: 'cell_name', 'sample', 'patient_x', 'source_x', 'cell_type', 'complexity', 'n_cells', 'cancer_type', 'technology', 'disease_extent', 'age', 'sex'

In [43]:
for col in ['patient_y', 'source_y',    
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [57]:
adata.obs

Unnamed: 0,cell_name,sample,patient,source,cell_type,complexity,n_cells,cancer_type,technology,disease_extent,age,sex,cell_subtype,site,cell_cycle_phase,sample_primary_met,treated_naive,category,study
0,CY155_blood_AAACCTGCATGATCCA-1,CY155_blood,CY155,blood,NK_cell,1150,3169,Melanoma,10x,metastatic,66,Female,,blood,,,,Skin,Ferrari de Andrade2019_Skin
1,CY155_blood_AAACCTGGTAAGTGGC-1,CY155_blood,CY155,blood,NK_cell,1735,3169,Melanoma,10x,metastatic,66,Female,,blood,,,,Skin,Ferrari de Andrade2019_Skin
2,CY155_blood_AAACCTGGTACTTCTT-1,CY155_blood,CY155,blood,NK_cell,1117,3169,Melanoma,10x,metastatic,66,Female,,blood,,,,Skin,Ferrari de Andrade2019_Skin
3,CY155_blood_AAACGGGAGCTATGCT-1,CY155_blood,CY155,blood,NK_cell,2090,3169,Melanoma,10x,metastatic,66,Female,,blood,,,,Skin,Ferrari de Andrade2019_Skin
4,CY155_blood_AAACGGGAGCTTATCG-1,CY155_blood,CY155,blood,NK_cell,964,3169,Melanoma,10x,metastatic,66,Female,,blood,,,,Skin,Ferrari de Andrade2019_Skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40818,CY129-2-Nodule_TTTGGTTAGTTGAGTA-1,CY129-2-Nodule,CY129,nodule,NK_cell,2427,1178,Melanoma,10x,metastatic,65,Male,,nodule,,,,Skin,Ferrari de Andrade2019_Skin
40819,CY129-2-Nodule_TTTGGTTTCAACCATG-1,CY129-2-Nodule,CY129,nodule,NK_cell,2270,1178,Melanoma,10x,metastatic,65,Male,,nodule,,,,Skin,Ferrari de Andrade2019_Skin
40820,CY129-2-Nodule_TTTGGTTTCGTTTGCC-1,CY129-2-Nodule,CY129,nodule,NK_cell,2094,1178,Melanoma,10x,metastatic,65,Male,,nodule,,,,Skin,Ferrari de Andrade2019_Skin
40821,CY129-2-Nodule_TTTGTCAAGTGCCAGA-1,CY129-2-Nodule,CY129,nodule,NK_cell,1668,1178,Melanoma,10x,metastatic,65,Male,,nodule,,,,Skin,Ferrari de Andrade2019_Skin


In [46]:
adata.obs['cell_subtype'] = 'NaN'

In [48]:
adata.obs['cell_cycle_phase'] = 'NaN'

In [49]:
adata.obs['sample_primary_met'] = 'NaN'

In [50]:
adata.obs['site'] = adata.obs['source_x'].copy()

In [51]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [53]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [54]:
adata.obs['treated_naive'] = 'NaN'

In [55]:
adata.obs['category'] = 'Skin'

In [56]:
adata.obs['study'] = 'Ferrari de Andrade2019_Skin'

In [58]:
output_path = "/home/ubuntu/Downloads/Data_Skin/Data_Ferrari de Andrade2019_Skin.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Skin/Data_Ferrari de Andrade2019_Skin.h5ad


#### 4. Ji2020_Skin

In [59]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Skin/Data_Ji2020_Skin"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [62]:
adata

AnnData object with n_obs × n_vars = 48164 × 32738
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'source', 'cell_subtype_level1', 'cell_subtype_level2', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [64]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'technology_y', 'smoking_status', 'PY', 'patient_y',
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 'cell_subtype_level2'    
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [72]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,source,cell_subtype,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,category,study
0,P1_Tumor_AAACCTGAGTCAAGCG,P1,Malignant,5206,Not cycling,,Tumor_KC_Diff,10x,2382,P1,Cutaneous Squamous Cell Carcinoma,M,83,,primary,Rt. Dorsal Hand,naive,Skin,Ji2020_Skin
1,P1_Tumor_AAACCTGCAAATTGCC,P1,Epithelial,3468,Not cycling,,Keratinocyte,10x,2382,P1,Cutaneous Squamous Cell Carcinoma,M,83,,primary,Rt. Dorsal Hand,naive,Skin,Ji2020_Skin
2,P1_Tumor_AAACCTGGTAGGAGTC,P1,Malignant,4154,Not cycling,,Tumor_KC_Basal,10x,2382,P1,Cutaneous Squamous Cell Carcinoma,M,83,,primary,Rt. Dorsal Hand,naive,Skin,Ji2020_Skin
3,P1_Tumor_AAACGGGAGATGTAAC,P1,Malignant,4334,Not cycling,,Tumor_KC_Basal,10x,2382,P1,Cutaneous Squamous Cell Carcinoma,M,83,,primary,Rt. Dorsal Hand,naive,Skin,Ji2020_Skin
4,P1_Tumor_AAACGGGAGCTGCAAG,P1,Malignant,4822,Intermediate,,Tumor_KC_Cyc,10x,2382,P1,Cutaneous Squamous Cell Carcinoma,M,83,,primary,Rt. Dorsal Hand,naive,Skin,Ji2020_Skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48159,P10_Normal_TTTGTCAAGAGTCTGG,P10,Epithelial,1659,Not cycling,,Normal_KC_Diff,10x,4569,P10,Cutaneous Squamous Cell Carcinoma,M,71,,primary,Rt. Tragus,,Skin,Ji2020_Skin
48160,P10_Normal_TTTGTCAAGTCGTTTG,P10,Langerhans,3541,G1/S,,Langerhans_cell,10x,4569,P10,Cutaneous Squamous Cell Carcinoma,M,71,,primary,Rt. Tragus,,Skin,Ji2020_Skin
48161,P10_Normal_TTTGTCACAGTGGGAT,P10,Langerhans,2540,Not cycling,,Langerhans_cell,10x,4569,P10,Cutaneous Squamous Cell Carcinoma,M,71,,primary,Rt. Tragus,,Skin,Ji2020_Skin
48162,P10_Normal_TTTGTCATCCAGGGCT,P10,Langerhans,1849,Not cycling,,Langerhans_cell,10x,4569,P10,Cutaneous Squamous Cell Carcinoma,M,71,,primary,Rt. Tragus,,Skin,Ji2020_Skin


In [71]:
adata.obs = adata.obs.rename(columns={"cell_subtype_level1": "cell_subtype"})

In [66]:
adata.obs['source'] = 'NaN'

In [67]:
adata.obs['category'] = 'Skin'

In [68]:
adata.obs['study'] = 'Ji2020_Skin'

In [73]:
output_path = "/home/ubuntu/Downloads/Data_Skin/Data_Ji2020_Skin.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Skin/Data_Ji2020_Skin.h5ad


#### 5.Li2019_Skin

In [2]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Skin/Data_Li2019_Skin"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [3]:
adata

AnnData object with n_obs × n_vars = 78336 × 40979
    obs: 'cell_name', 'sample', 'patient_x', 'amp_batch', 'cell_type', 'cell_subtype', 'location_x', 'stage_x', 'complexity', 'patient_y', 'cancer_type', 'technology', 'n_cells', 'location_y', 'stage_y', 'prior_treatment', 'on_treatment'

In [79]:
adata.obs['on_treatment'].value_counts()

on_treatment
aPD1    7457
Name: count, dtype: int64

In [19]:
for col in ['amp_batch','stage_x','location_y', 'stage_y', 'prior_treatment', 'on_treatment', 'patient_y'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [20]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,site,complexity,cancer_type,technology,n_cells,source,cell_cycle_phase,sex,age,disease_extent,sample_primary_met,treated_naive,study,category
0,W461969,,,,,,340,,,,,,,,,,,Li2019_Skin,Skin
1,W461970,p2,p2,T_cell,transitional,lymph node,591,Melanoma,MARS-seq,2270.0,lymph node,,,,,,,Li2019_Skin,Skin
2,W461971,p2,p2,T_cell,transitional,lymph node,1074,Melanoma,MARS-seq,2270.0,lymph node,,,,,,,Li2019_Skin,Skin
3,W461972,p2,p2,T_cell,transitional,lymph node,620,Melanoma,MARS-seq,2270.0,lymph node,,,,,,,Li2019_Skin,Skin
4,W461973,p2,p2,T_cell,transitional,lymph node,390,Melanoma,MARS-seq,2270.0,lymph node,,,,,,,Li2019_Skin,Skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78331,WMC810396,,,,,,640,,,,,,,,,,,Li2019_Skin,Skin
78332,WMC810397,,,,,,702,,,,,,,,,,,Li2019_Skin,Skin
78333,WMC810398,,,,,,563,,,,,,,,,,,Li2019_Skin,Skin
78334,WMC810399,,,,,,2106,,,,,,,,,,,Li2019_Skin,Skin


In [6]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [7]:
adata.obs = adata.obs.rename(columns={"location_x": "site"})

In [8]:
adata.obs['source']=adata.obs['site'].copy()

In [9]:
adata.obs['cell_cycle_phase'] = 'NaN'

In [10]:
adata.obs['sex'] = 'NaN'

In [11]:
adata.obs['age'] = 'NaN'

In [12]:
adata.obs['disease_extent'] = 'NaN'

In [13]:
adata.obs['sample_primary_met'] = 'NaN'

In [14]:
adata.obs['treated_naive'] = 'NaN'

In [15]:
adata.obs['study'] = 'Li2019_Skin'

In [16]:
adata.obs['category'] = 'Skin'

In [21]:
output_path = "/home/ubuntu/Downloads/Data_Skin/Data_Li2019_Skin.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Skin/Data_Li2019_Skin.h5ad


#### 6.Mahuron2020_Skin

In [22]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Skin/Data_Mahuron2020_Skin"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [24]:
adata

AnnData object with n_obs × n_vars = 27935 × 24892
    obs: 'cell_name', 'sample', 'patient_x', 'source_x', 'cell_type', 'complexity', 'patient_y', 'cancer_type', 'technology', 'n_cells', 'source_y', 'location', 'age', 'sex', 'disease_extent', 'treated_naive', 'sample_primary_met'

In [26]:
for col in ['patient_y',  'source_y',      
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [30]:
adata.obs

Unnamed: 0,cell_name,sample,patient,source,cell_type,complexity,cancer_type,technology,n_cells,site,age,sex,disease_extent,treated_naive,sample_primary_met
0,K383_LN_GEX_AAACCTGAGGACGAAA-1,K383_LN_GEX,K383,lymph node,T_cell,1268,Melanoma,10x,3116,inguinal LN,45,male,metastatic,naive,
1,K383_LN_GEX_AAACCTGAGGGTGTTG-1,K383_LN_GEX,K383,lymph node,T_cell,2735,Melanoma,10x,3116,inguinal LN,45,male,metastatic,naive,
2,K383_LN_GEX_AAACCTGCAAGGACTG-1,K383_LN_GEX,K383,lymph node,T_cell,1186,Melanoma,10x,3116,inguinal LN,45,male,metastatic,naive,
3,K383_LN_GEX_AAACCTGCAATCTGCA-1,K383_LN_GEX,K383,lymph node,B_cell,818,Melanoma,10x,3116,inguinal LN,45,male,metastatic,naive,
4,K383_LN_GEX_AAACCTGCACTGTTAG-1,K383_LN_GEX,K383,lymph node,B_cell,1778,Melanoma,10x,3116,inguinal LN,45,male,metastatic,naive,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27930,K411_LN_GEX_TTTGTCAGTTGGTAAA-1,K411_LN_GEX,K411,lymph node,T_cell,1650,Melanoma,10x,4032,neck LN,58,female,metastatic,naive,
27931,K411_LN_GEX_TTTGTCATCACAAACC-1,K411_LN_GEX,K411,lymph node,T_cell,1557,Melanoma,10x,4032,neck LN,58,female,metastatic,naive,
27932,K411_LN_GEX_TTTGTCATCTCGATGA-1,K411_LN_GEX,K411,lymph node,T_cell,2281,Melanoma,10x,4032,neck LN,58,female,metastatic,naive,
27933,K411_LN_GEX_TTTGTCATCTGACCTC-1,K411_LN_GEX,K411,lymph node,T_cell,1139,Melanoma,10x,4032,neck LN,58,female,metastatic,naive,


In [27]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [28]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [29]:
adata.obs = adata.obs.rename(columns={"location": "site"})

In [31]:
adata.obs['cell_subtype'] = 'NaN'

In [32]:
adata.obs['cell_cycle_phase'] = 'NaN'

In [33]:
adata.obs['category'] = 'Skin'

In [34]:
adata.obs['study'] = 'Mahuron2020_Skin'

In [35]:
output_path = "/home/ubuntu/Downloads/Data_Skin/Data_Mahuron2020_Skin.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Skin/Data_Mahuron2020_Skin.h5ad


#### 7.Paulson2020_Skin

In [36]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Skin/Data_Paulson2020_Skin"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_UMIcounts.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [42]:
adata

AnnData object with n_obs × n_vars = 31376 × 16373
    obs: 'cell_name', 'sample', 'patient_x', 'source_x', 'cell_type', 'complexity', 'technology', 'n_cells', 'cancer_type', 'disease_extent', 'response', 'age', 'sex'

In [41]:
for col in ['timepoint_x', 'patient_y', 'source_y', 'timepoint_y',    
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [43]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [44]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [45]:
adata.obs = adata.obs.rename(columns={"response": "treated_naive"})

In [53]:
adata.obs

Unnamed: 0,cell_name,sample,patient,source,cell_type,complexity,technology,n_cells,cancer_type,disease_extent,treated_naive,age,sex,cell_subtype,cell_cycle_phase,sample_primary_met,site,study,category
0,2586-4_Tumor_Before_AAACCTGAGGATGTAT-1,2586-4_Tumor_Before,2586-4,Tumor,Malignant,1125,10x,2243,Merkel Cell Carcinoma,metastatic,,59,male,,,,Skin,Paulson2020_Skin,Skin
1,2586-4_Tumor_Before_AAACCTGCAGCGATCC-1,2586-4_Tumor_Before,2586-4,Tumor,Malignant,1537,10x,2243,Merkel Cell Carcinoma,metastatic,,59,male,,,,Skin,Paulson2020_Skin,Skin
2,2586-4_Tumor_Before_AAACCTGGTACGAAAT-1,2586-4_Tumor_Before,2586-4,Tumor,Malignant,854,10x,2243,Merkel Cell Carcinoma,metastatic,,59,male,,,,Skin,Paulson2020_Skin,Skin
3,2586-4_Tumor_Before_AAACGGGAGCTGGAAC-1,2586-4_Tumor_Before,2586-4,Tumor,Malignant,819,10x,2243,Merkel Cell Carcinoma,metastatic,,59,male,,,,Skin,Paulson2020_Skin,Skin
4,2586-4_Tumor_Before_AAACGGGAGGAGTTGC-1,2586-4_Tumor_Before,2586-4,Tumor,Malignant,1131,10x,2243,Merkel Cell Carcinoma,metastatic,,59,male,,,,Skin,Paulson2020_Skin,Skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31371,9245-3_Tumor_TTTGTCAGTCGGCTCA-2,9245-3_Tumor,9245-3,Tumor,,2602,10x,5323,Merkel Cell Carcinoma,metastatic,Acquired resistance,59,male,,,,Skin,Paulson2020_Skin,Skin
31372,9245-3_Tumor_TTTGTCAGTGCAGACA-2,9245-3_Tumor,9245-3,Tumor,Malignant,4309,10x,5323,Merkel Cell Carcinoma,metastatic,Acquired resistance,59,male,,,,Skin,Paulson2020_Skin,Skin
31373,9245-3_Tumor_TTTGTCAGTGGTTTCA-2,9245-3_Tumor,9245-3,Tumor,Malignant,4978,10x,5323,Merkel Cell Carcinoma,metastatic,Acquired resistance,59,male,,,,Skin,Paulson2020_Skin,Skin
31374,9245-3_Tumor_TTTGTCAGTTCCCGAG-2,9245-3_Tumor,9245-3,Tumor,Malignant,3291,10x,5323,Merkel Cell Carcinoma,metastatic,Acquired resistance,59,male,,,,Skin,Paulson2020_Skin,Skin


In [47]:
adata.obs['cell_subtype'] = 'NaN'

In [48]:
adata.obs['cell_cycle_phase'] = 'NaN'

In [49]:
adata.obs['sample_primary_met'] = 'NaN'

In [50]:
adata.obs['site'] = 'Skin'

In [51]:
adata.obs['study'] = 'Paulson2020_Skin'

In [52]:
adata.obs['category'] = 'Skin'

In [54]:
output_path = "/home/ubuntu/Downloads/Data_Skin/Data_Paulson2020_Skin.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Skin/Data_Paulson2020_Skin.h5ad


#### 8.Sade-Feldman2018_Skin

In [56]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Skin/Data_Sade-Feldman2018_Skin"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_TPM.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [63]:
adata

AnnData object with n_obs × n_vars = 16291 × 50513
    obs: 'cell_name', 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'n_cells', 'technology', 'cancer_type', 'sex', 'age', 'disease_extent', 'source', 'site', 'treatment'

In [62]:
for col in ['timepoint_x', 'cluster', 'cluster_cd8_2', 'cluster_cd8_6', 'patient_y', 'timepoint_y',
            'days_from_baseline', 'response_lesion', 'response_patient',
            'overall_survival_days', 'vital_status', 'mutations_indels_drivers', 'mutations_indels_immune'
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [71]:
adata.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,n_cells,technology,cancer_type,sex,age,disease_extent,source,site,treated_naive,cell_cycle_phase,sample_primary_met,category,study
0,A10_P3_M11,Pre_P1,P1,T_cell,T cell,2050,229,Smart-seq2,Melanoma,M,49,metastatic,tumor,right chest,anti-CTLA4,,,Skin,Sade-Feldman2018_Skin
1,A11_P1_M11,Pre_P1,P1,T_cell,T cell,1573,229,Smart-seq2,Melanoma,M,49,metastatic,tumor,right chest,anti-CTLA4,,,Skin,Sade-Feldman2018_Skin
2,A11_P3_M11,Pre_P1,P1,T_cell,T cell,1591,229,Smart-seq2,Melanoma,M,49,metastatic,tumor,right chest,anti-CTLA4,,,Skin,Sade-Feldman2018_Skin
3,A11_P4_M11,Pre_P1,P1,T_cell,Regulatory T cell,2909,229,Smart-seq2,Melanoma,M,49,metastatic,tumor,right chest,anti-CTLA4,,,Skin,Sade-Feldman2018_Skin
4,A12_P3_M11,Pre_P1,P1,T_cell,T cell,1211,229,Smart-seq2,Melanoma,M,49,metastatic,tumor,right chest,anti-CTLA4,,,Skin,Sade-Feldman2018_Skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16286,H5_P5_M67_L001_T_enriched,Post_P6,P6,T_cell,Exhausted T cell,4355,400,Smart-seq2,Melanoma,F,66,metastatic,tumor,right colectomy mass in cecum,anti-PD1,,,Skin,Sade-Feldman2018_Skin
16287,H6_P5_M67_L001_T_enriched,Post_P6,P6,T_cell,Exhausted T cell,4948,400,Smart-seq2,Melanoma,F,66,metastatic,tumor,right colectomy mass in cecum,anti-PD1,,,Skin,Sade-Feldman2018_Skin
16288,H7_P5_M67_L001_T_enriched,Post_P6,P6,T_cell,Exhausted T cell,3208,400,Smart-seq2,Melanoma,F,66,metastatic,tumor,right colectomy mass in cecum,anti-PD1,,,Skin,Sade-Feldman2018_Skin
16289,H8_P5_M67_L001_T_enriched,Post_P6,P6,T_cell,Exhausted T cell,1355,400,Smart-seq2,Melanoma,F,66,metastatic,tumor,right colectomy mass in cecum,anti-PD1,,,Skin,Sade-Feldman2018_Skin


In [65]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [66]:
adata.obs = adata.obs.rename(columns={"treatment": "treated_naive"})

In [67]:
adata.obs['cell_cycle_phase'] = 'NaN'

In [68]:
adata.obs['sample_primary_met'] = 'NaN'

In [69]:
adata.obs['category'] = 'Skin'

In [70]:
adata.obs['study'] = 'Sade-Feldman2018_Skin'

In [72]:
output_path = "/home/ubuntu/Downloads/Data_Skin/Data_Sade-Feldman2018_Skin.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Skin/Data_Sade-Feldman2018_Skin.h5ad


#### 9.Tirosh2016_Skin

In [73]:

# Base path
base_path = "/home/ubuntu/Downloads/Data_Skin/Data_Tirosh2016_Skin"

# Step 1: Read expression matrix
adata = sc.read_mtx(os.path.join(base_path, "Exp_data_TPM.mtx"))
adata = adata.transpose()  # Transpose to shape: cells × genes

# Step 2: Add gene names
genes = pd.read_csv(os.path.join(base_path, "Genes.txt"), header=None)
adata.var_names = genes[0].values
adata.var_names_make_unique()

# Step 3: Read and merge cell + sample metadata
cells = pd.read_csv(os.path.join(base_path, "Cells.csv"))
samples = pd.read_csv(os.path.join(base_path, "Samples.csv"))

# Merge on the shared 'sample' column
cells_merged = cells.merge(samples, on="sample", how="left")

# Assign merged metadata to AnnData
adata.obs = cells_merged

gc.collect


<function gc.collect(generation=2)>

In [76]:
adata

AnnData object with n_obs × n_vars = 4645 × 23686
    obs: 'cell_name', 'sample', 'cell_type', 'complexity', 'cell_cycle_phase', 'technology', 'n_cells', 'patient', 'cancer_type', 'sex', 'age', 'disease_extent', 'sample_primary_met', 'site', 'treated_naive'

In [75]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment', 'smoking_status', 'PY', 
            'diagnosis_recurrence', 'AJCC_T', 'AJCC_N', 'AJCC_M', 'AJCC_stage',
            'size',  'histology', 'genetic_hormonal_features', 'grade', 'KI67',
            'chemotherapy_exposed', 'chemotherapy_response', 'targeted_rx_exposed',
            'targeted_rx_response', 'ICB_exposed', 'ICB_response',
            'ET_exposed', 'ET_response', 'time_end_of_rx_to_sampling', 'post_sampling_rx_exposed',
            'post_sampling_rx_response', 'PFS_DFS', 'OS', 
           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [82]:
adata.obs

Unnamed: 0,cell_name,sample,cell_type,complexity,cell_cycle_phase,technology,n_cells,patient,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,cell_subtype,source,category,study
0,Cy72_CD45_H02_S758_comb,72,B_cell,3365,Not cycling,SmartSeq2,181,72,Melanoma,F,,metastatic,met,External iliac lymph node,treated,,,Skin,Tirosh2016_Skin
1,CY58_1_CD45_B02_S974_comb,58,T_cell,3637,Not cycling,SmartSeq2,142,58,Melanoma,F,,metastatic,met,Subcutaneous leg lesion,treated,,,Skin,Tirosh2016_Skin
2,Cy71_CD45_D08_S524_comb,71,Malignant,4660,Not cycling,SmartSeq2,89,71,Melanoma,M,,metastatic,met,Transverse colon,naive,,,Skin,Tirosh2016_Skin
3,Cy81_FNA_CD45_B01_S301_comb,81,Malignant,6387,Not cycling,SmartSeq2,205,81,Melanoma,F,,metastatic,met,Axillary lymph node,naive,,,Skin,Tirosh2016_Skin
4,Cy80_II_CD45_B07_S883_comb,80,Malignant,5913,Not cycling,SmartSeq2,480,80,Melanoma,F,,metastatic,met,Axillary lymph node,naive,,,Skin,Tirosh2016_Skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4640,CY75_1_CD45_CD8_7__S223_comb,75,T_cell,5385,Not cycling,SmartSeq2,344,75,Melanoma,M,,metastatic,met,Subcutaneous leg lesion,treated,,,Skin,Tirosh2016_Skin
4641,CY75_1_CD45_CD8_1__S65_comb,75,T_cell,4769,Not cycling,SmartSeq2,344,75,Melanoma,M,,metastatic,met,Subcutaneous leg lesion,treated,,,Skin,Tirosh2016_Skin
4642,CY75_1_CD45_CD8_1__S93_comb,75,T_cell,4727,Not cycling,SmartSeq2,344,75,Melanoma,M,,metastatic,met,Subcutaneous leg lesion,treated,,,Skin,Tirosh2016_Skin
4643,CY75_1_CD45_CD8_1__S76_comb,75,T_cell,4976,Not cycling,SmartSeq2,344,75,Melanoma,M,,metastatic,met,Subcutaneous leg lesion,treated,,,Skin,Tirosh2016_Skin


In [78]:
adata.obs['cell_subtype'] = 'NaN'

In [79]:
adata.obs['source'] = 'NaN'

In [80]:
adata.obs['category'] = 'Skin'

In [81]:
adata.obs['study'] = 'Tirosh2016_Skin'

In [83]:
output_path = "/home/ubuntu/Downloads/Data_Skin/Data_Tirosh2016_Skin.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Skin/Data_Tirosh2016_Skin.h5ad


#### 10.Yost2019_Skin

In [86]:
import os
import pandas as pd
import scanpy as sc

# Set base path
base_path = "/home/ubuntu/Downloads/Data_Skin/Data_Yost2019_Skin"

# Define cancer type folders and labels
cancer_types = {
    "BCC": "Basal Cell Carcinoma",
    "SCC": "Squamous Cell Carcinoma"
}

# Load sample metadata
samples_df = pd.read_csv(os.path.join(base_path, "Samples.csv")).drop_duplicates(subset=['sample'])

adatas = []

for folder, label in cancer_types.items():
    path = os.path.join(base_path, folder)
    
    # Load expression matrix
    adata = sc.read_mtx(os.path.join(path, "Exp_data_UMIcounts.mtx")).T  # Transpose to cells x genes
    
    # Load gene and cell information
    genes = pd.read_csv(os.path.join(path, "Genes.txt"), header=None)[0].tolist()
    cells = pd.read_csv(os.path.join(path, "Cells.csv"), index_col=0)

    # Assign gene names and cell metadata
    adata.var_names = genes
    adata.obs = cells
    adata.obs['cancer_folder'] = folder
    adata.obs['cancer_type'] = label
    
    adatas.append(adata)

# Find common genes
common_genes = adatas[0].var_names
for ad in adatas[1:]:
    common_genes = common_genes.intersection(ad.var_names)

# Subset to common genes
adatas = [ad[:, common_genes].copy() for ad in adatas]

# Concatenate all datasets
adata_combined = adatas[0].concatenate(
    *adatas[1:],
    batch_key='batch',
    batch_categories=list(cancer_types.keys()),
    index_unique=None
)

# Ensure unique cell names
assert adata_combined.obs_names.is_unique, "Cell names are not unique after concatenation"

# Merge sample metadata
adata_combined.obs = adata_combined.obs.reset_index()
original_index = adata_combined.obs.columns[0]

# Merge by 'sample' column
adata_combined.obs = adata_combined.obs.merge(samples_df, how='left', on='sample')
assert adata_combined.shape[0] == adata_combined.obs.shape[0], "Mismatch after sample merge"

# Restore index
adata_combined.obs = adata_combined.obs.set_index(original_index)
adata_combined.obs.index.name = None


  adata_combined = adatas[0].concatenate(


In [88]:
adata = adata_combined

In [93]:
adata

AnnData object with n_obs × n_vars = 79046 × 18189
    obs: 'sample', 'patient_x', 'cell_type', 'cell_subtype', 'complexity', 'cell_cycle_phase', 'treatment_x', 'source_x', 'sort', 'cancer_folder', 'cancer_type_x', 'technology', 'n_cells'

In [95]:
for col in ['umap1', 'umap2', 'g1s_score', 'g2m_score', 'mp_top_score', 'mp_top', 
            'mp_assignment',  'disease', 'time_point_x', 'batch', 'patient_y', 'cancer_type_y',
            'time_point_y', 'treatment_y', 'source_y','sort', 'cancer_folder'

           ]:
    if col in adata.obs.columns:
        del adata.obs[col]

In [113]:
adata.obs

Unnamed: 0,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,treated_naive,source,cancer_type,technology,n_cells,cell_name,sex,age,disease_extent,sample_primary_met,site,category,study
bcc.su001.post.tcell_AAACCTGAGCTTCGCG,su001_post,su001,T_cell,CD8_mem_T_cell,1390,Not cycling,Pembrolizumab,L arm,Basal Cell Carcinoma,10x,10429,bcc.su001.post.tcell_AAACCTGAGCTTCGCG,,,,,L arm tumor,Skin,Yost2019_Skin
bcc.su001.post.tcell_AAACCTGAGGACATTA,su001_post,su001,T_cell,CD8_ex_T_cell,1887,Not cycling,Pembrolizumab,L arm,Basal Cell Carcinoma,10x,10429,bcc.su001.post.tcell_AAACCTGAGGACATTA,,,,,L arm tumor,Skin,Yost2019_Skin
bcc.su001.post.tcell_AAACCTGCACGCATCG,su001_post,su001,T_cell,CD8_mem_T_cell,1505,Not cycling,Pembrolizumab,L arm,Basal Cell Carcinoma,10x,10429,bcc.su001.post.tcell_AAACCTGCACGCATCG,,,,,L arm tumor,Skin,Yost2019_Skin
bcc.su001.post.tcell_AAACCTGCAGATGGGT,su001_post,su001,T_cell,CD8_mem_T_cell,1748,Not cycling,Pembrolizumab,L arm,Basal Cell Carcinoma,10x,10429,bcc.su001.post.tcell_AAACCTGCAGATGGGT,,,,,L arm tumor,Skin,Yost2019_Skin
bcc.su001.post.tcell_AAACCTGCAGTGGAGT,su001_post,su001,T_cell,Treg,1662,Not cycling,Pembrolizumab,L arm,Basal Cell Carcinoma,10x,10429,bcc.su001.post.tcell_AAACCTGCAGTGGAGT,,,,,L arm tumor,Skin,Yost2019_Skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
scc.su014.pre_TTTGTCAAGTGTTGAA,su014_pre,su014,T_cell,Th17,3352,Not cycling,,R neck,Squamous Cell Carcinoma,10x,2808,scc.su014.pre_TTTGTCAAGTGTTGAA,,,,,R neck tumor,Skin,Yost2019_Skin
scc.su014.pre_TTTGTCACAAATACAG,su014_pre,su014,T_cell,Th17,1730,Not cycling,,R neck,Squamous Cell Carcinoma,10x,2808,scc.su014.pre_TTTGTCACAAATACAG,,,,,R neck tumor,Skin,Yost2019_Skin
scc.su014.pre_TTTGTCAGTCTAACGT,su014_pre,su014,T_cell,CD8_eff_T_cell,1531,Not cycling,,R neck,Squamous Cell Carcinoma,10x,2808,scc.su014.pre_TTTGTCAGTCTAACGT,,,,,R neck tumor,Skin,Yost2019_Skin
scc.su014.pre_TTTGTCAGTTCAGTAC,su014_pre,su014,T_cell,Treg,2619,Not cycling,,R neck,Squamous Cell Carcinoma,10x,2808,scc.su014.pre_TTTGTCAGTTCAGTAC,,,,,R neck tumor,Skin,Yost2019_Skin


In [102]:
adata.obs['cell_name'] = adata.obs_names

In [97]:
adata.obs = adata.obs.rename(columns={"patient_x": "patient"})

In [98]:
adata.obs = adata.obs.rename(columns={"source_x": "source"})

In [99]:
adata.obs = adata.obs.rename(columns={"cancer_type_x": "cancer_type"})

In [100]:
adata.obs = adata.obs.rename(columns={"treatment_x": "treated_naive"})

In [104]:
adata.obs['sex'] = 'NaN'

In [105]:
adata.obs['age'] = 'NaN'

In [106]:
adata.obs['disease_extent'] = 'NaN'

In [107]:
adata.obs['sample_primary_met'] = 'NaN'

In [109]:
adata_combined.obs['site'] = adata_combined.obs['source'].astype(str) + " tumor"

In [111]:
adata.obs['category'] = 'Skin'

In [112]:
adata.obs['study'] = 'Yost2019_Skin'

In [114]:
output_path = "/home/ubuntu/Downloads/Data_Skin/Data_Yost2019_Skin.h5ad"
adata.write(output_path)
print(f"✅ data saved to: {output_path}")

✅ data saved to: /home/ubuntu/Downloads/Data_Skin/Data_Yost2019_Skin.h5ad


#### Data Merging

In [19]:
import scanpy as sc
import anndata
import os

In [115]:

# Define file paths
files = [
    "/home/ubuntu/Downloads/Data_Skin/Data_Biermann2022_Skin.h5ad",
    "/home/ubuntu/Downloads/Data_Skin/Data_Ferrari de Andrade2019_Skin.h5ad",
    "/home/ubuntu/Downloads/Data_Skin/Data_Jerby-Arnon2018_Skin.h5ad",
    "/home/ubuntu/Downloads/Data_Skin/Data_Ji2020_Skin.h5ad",
    "/home/ubuntu/Downloads/Data_Skin/Data_Li2019_Skin.h5ad",
    "/home/ubuntu/Downloads/Data_Skin/Data_Mahuron2020_Skin.h5ad",
    "/home/ubuntu/Downloads/Data_Skin/Data_Paulson2020_Skin.h5ad",
    "/home/ubuntu/Downloads/Data_Skin/Data_Sade-Feldman2018_Skin.h5ad",
    "/home/ubuntu/Downloads/Data_Skin/Data_Tirosh2016_Skin.h5ad",
    "/home/ubuntu/Downloads/Data_Skin/Data_Yost2019_Skin.h5ad"
]

# Load datasets
adatas = [sc.read(file) for file in files]

gc.collect()
# Merge all AnnData objects
adata_merged = anndata.concat(adatas, join="outer", fill_value=0)

gc.collect()
# Fix non-string columns (e.g. 'sample') to avoid h5py write errors
for col in adata_merged.obs.columns:
    if adata_merged.obs[col].dtype == 'object':
        adata_merged.obs[col] = adata_merged.obs[col].astype(str)

gc.collect()
# Save merged dataset
output_path = "/home/ubuntu/Downloads/Data_Skin/Skin_Combined.h5ad"
adata_merged.write(output_path)

print(f"✅ Merged and saved to: {output_path}")


  utils.warn_names_duplicates("obs")


✅ Merged and saved to: /home/ubuntu/Downloads/Data_Skin/Skin_Combined.h5ad


In [117]:
adata_merged.obs

Unnamed: 0,cell_name,sample,patient,cell_type,cell_subtype,complexity,cell_cycle_phase,source,technology,n_cells,cancer_type,sex,age,disease_extent,sample_primary_met,site,treated_naive,study,category
0,CTCGTACCAAGCTGGA-1_1,MBM01_sc,MBM01,Plasma,Plasma cells,4017,Not cycling,Brain_Metastasis,10X,3515.0,Melanoma,,,,met,brain metastasis,naive,Biermann2022_Skin,Skin
1,AGATTGCAGTGGGTTG-1_1,MBM01_sc,MBM01,Plasma,Plasma cells,4269,Not cycling,Brain_Metastasis,10X,3515.0,Melanoma,,,,met,brain metastasis,naive,Biermann2022_Skin,Skin
2,TCGGGACTCAACACTG-1_1,MBM01_sc,MBM01,Malignant,Tumor cells,8319,G1/S,Brain_Metastasis,10X,3515.0,Melanoma,,,,met,brain metastasis,naive,Biermann2022_Skin,Skin
3,CTCGGAGCAGCTATTG-1_1,MBM01_sc,MBM01,Malignant,Tumor cells,8229,Not cycling,Brain_Metastasis,10X,3515.0,Melanoma,,,,met,brain metastasis,naive,Biermann2022_Skin,Skin
4,TGGGAAGAGAACAACT-1_1,MBM01_sc,MBM01,Plasma,Plasma cells,4396,Not cycling,Brain_Metastasis,10X,3515.0,Melanoma,,,,met,brain metastasis,naive,Biermann2022_Skin,Skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
scc.su014.pre_TTTGTCAAGTGTTGAA,scc.su014.pre_TTTGTCAAGTGTTGAA,su014_pre,su014,T_cell,Th17,3352,Not cycling,R neck,10x,2808.0,Squamous Cell Carcinoma,,,,,R neck tumor,,Yost2019_Skin,Skin
scc.su014.pre_TTTGTCACAAATACAG,scc.su014.pre_TTTGTCACAAATACAG,su014_pre,su014,T_cell,Th17,1730,Not cycling,R neck,10x,2808.0,Squamous Cell Carcinoma,,,,,R neck tumor,,Yost2019_Skin,Skin
scc.su014.pre_TTTGTCAGTCTAACGT,scc.su014.pre_TTTGTCAGTCTAACGT,su014_pre,su014,T_cell,CD8_eff_T_cell,1531,Not cycling,R neck,10x,2808.0,Squamous Cell Carcinoma,,,,,R neck tumor,,Yost2019_Skin,Skin
scc.su014.pre_TTTGTCAGTTCAGTAC,scc.su014.pre_TTTGTCAGTTCAGTAC,su014_pre,su014,T_cell,Treg,2619,Not cycling,R neck,10x,2808.0,Squamous Cell Carcinoma,,,,,R neck tumor,,Yost2019_Skin,Skin
