In [2]:
import pandas as pd

In [3]:
# Lookup table from a CancerSCEM `primary_site` label to the tissue category
# written to the `Tissue` column. Every key appears exactly once (a repeated
# key in a dict literal silently overwrites the earlier entry). Sites missing
# from this table come back as None when looked up with `.get`.
anatomical_mapping = {
    # Head and neck region
    "Head and neck": "head-and-neck",
    "Tongue": "head-and-neck",
    "Tongue/Tonsil": "head-and-neck",
    "Laryngeal tissue": "head-and-neck",
    "Hypopharynx": "head-and-neck",
    "Epiglottis": "head-and-neck",

    # Brain and neural tissue
    "Brain": "brain",
    "Cerebellum": "brain",
    "Spinal cord": "brain",
    "Nerve": "brain",
    "Retina": "brain",

    # Lung
    "Lung": "lung",

    # Liver and biliary system
    "Liver": "liverbiliary",
    "Gall bladder": "liverbiliary",

    # Kidney
    "Kidney": "kidney",

    # Prostate
    "Prostate": "prostate",

    # Bone and soft tissue
    "Bone": "sarcoma",
    "Hip": "sarcoma",

    # Breast
    "Breast": "breast",

    # Pancreas
    "Pancreas": "pancreas",

    # Neuroendocrine tissues
    "Thyroid": "neuroendocrine",
    "Adrenal gland": "neuroendocrine",

    # Colorectal system
    "Colon": "colorectal",
    "Colorectum": "colorectal",
    "Sigmoid": "colorectal",
    "Rectum": "colorectal",

    # Ovarian
    "Ovary": "ovarian",

    # Skin
    "Skin": "skin",
    "Glabrous acral skin": "skin",

    # Blood and bone marrow
    "Blood": "hematologic",
    "Bone marrow": "hematologic",

    # Items mapped to othermodels (no clear corresponding category)
    "Small intestine": "othermodels",
    "Left arm": "othermodels",
    "Endometrium": "othermodels",
    "Bladder": "othermodels",
    "Esophagus": "othermodels",
    "Testis": "othermodels",
    "Uterus": "othermodels",
    "Mesothelium": "othermodels",
    "Appendix": "othermodels",
    "Cervix": "othermodels",
}

# Translates a CancerSCEM `protocol` name into the label stored in the
# `Technology` column. Protocols that are not listed are expected to be passed
# through unchanged by the caller's fallback.
technology_mapping = {
    # Renamed: shorter spelling of the same platform.
    "10X Genomics": "10x",
    # Identical on both sides.
    "Drop-seq": "Drop-seq",
    # Renamed: only the formatting differs.
    "Smart-seq2": "SmartSeq2",
    # Identical on both sides.
    "Seq-Well": "Seq-Well",
    # Renamed: closest available label.
    "Microwell": "Microwell-seq",
}

# Translates a CancerSCEM `cancer_type` name into the label stored in the
# `Disease` column. Some values only change capitalisation, others are the
# usual abbreviation of the disease. Cancer types that are not listed are
# expected to be passed through unchanged by the caller's fallback.
cancer_mapping = {
    "Merkel Cell Carcinoma": "Merkel cell carcinoma",
    "Lung Adenocarcinoma": "Lung adenocarcinoma",
    "Non-small Cell Lung Cancer": "NSCLC",
    "Neuroblastoma": "Neuroblastoma",
    "Breast Cancer": "Breast cancer",
    "Colorectal Cancer": "Colorectal cancer",
    "Triple Negative Breast Cancer": "TNBC",
    "Acute Myeloid Leukemia": "AML",
    "Melanoma": "Melanoma",
    "Cutaneous Melanoma": "Primary melanoma",
    "Prostate Cancer": "Prostate cancer",
    "Clear Cell Renal Cell Carcinoma": "RCC",
    "Hepatocellular Cancer": "HCC",
    "Wilms Tumor": "Wilms tumor",
    "Glioblastoma": "GBM",
    "Medulloblastoma": "Medulloblastoma",
    "Pediatric Ependymoma": "Pediatric ependymoma",
    "Head and Neck Squamous Cell Carcinoma": "HNSCC",
    "Esophageal Squamous Cell Carcinoma": "ESCC",
    "Pancreatic Ductal Adenocarcinoma": "PDAC",
    "High-grade Serous Ovarian Carcinoma": "HGSOC",
    "Ovarian Carcinoma": "Ovarian cancer",
    "Cutaneous Squamous Cell Carcinoma": "Skin SCC",
    "Basal Cell Carcinoma": "BCC",
    "Neuroendocrine Tumor": "Small intestinal neuroendocrine tumor",
}

In [53]:
# Read both study catalogues from the working directory. For 3ca.csv the first
# column is used as the row index; scem.csv is read with a default index.
ccca = pd.read_csv("3ca.csv", index_col=0)
scem = pd.read_csv("scem.csv")

# Discard CancerSCEM rows that have no value in `year`.
scem = scem.loc[scem["year"].notna()]

In [54]:
# Bring both tables onto shared column names before they are stacked.
scem = scem.rename(
    columns={
        "url": "Article",
        "n_samples": "N_samples",
    }
)
# "N_sampels" is matched verbatim; presumably that is how the header is
# spelled in the 3CA input file.
ccca = ccca.rename(
    columns={
        "N_sampels": "N_samples",
    }
)

In [55]:
# Tag every row with the catalogue it came from so the origin is still known
# after the two tables are stacked.
scem = scem.assign(Source="CancerSCEM")
ccca = ccca.assign(Source="3ca")

In [56]:
# Translate each primary site into its tissue category. There is no fallback
# here: a site that is missing from anatomical_mapping ends up as None.
scem["Tissue"] = scem["primary_site"].map(
    lambda site: anatomical_mapping.get(site)
)

In [57]:
# Translate protocol names into technology labels; a protocol without an
# entry in technology_mapping keeps its original name.
scem["Technology"] = scem["protocol"].map(
    lambda protocol: technology_mapping.get(protocol, protocol)
)

In [58]:
# Split each 3CA title on the literal separator " et al. " (titles are
# presumably of the form "<first author> et al. <year>" - confirm against the
# input file).
# regex=False is deliberate: pandas treats a multi-character pattern as a
# regular expression by default, in which case the "." would match any
# character instead of a full stop. The split is done once and reused.
title_parts = ccca["Title"].str.split(" et al. ", regex=False)

# Text before the separator.
ccca["author_last_name"] = title_parts.str[0]
# Text after the separator, still a string at this point; a title without the
# separator yields a missing value here.
ccca["year"] = title_parts.str[1]


In [72]:
# Translate cancer type names into disease labels; a cancer type without an
# entry in cancer_mapping keeps its original name.
scem["Disease"] = scem["cancer_type"].map(
    lambda cancer_type: cancer_mapping.get(cancer_type, cancer_type)
)


In [87]:
# Give every 3CA row a sequential identifier in row order: CCCA-1, CCCA-2, ...
ccca["Project ID"] = [f"CCCA-{number}" for number in range(1, len(ccca) + 1)]


In [88]:
# Row labels follow the pattern "<author>_<year>_<tissue>" in both tables.
# The labels are not guaranteed to be unique: two rows sharing author, year
# and tissue receive the same label.
ccca.index = (
    ccca["author_last_name"]
    + "_" + ccca["year"].astype(str)
    + "_" + ccca["Tissue"]
)
# `year` in the CancerSCEM table is cast to int first so the label does not
# carry a fractional part.
scem.index = (
    scem["author_last_name"]
    + "_" + scem["year"].astype(int).astype(str)
    + "_" + scem["Tissue"]
)


In [99]:
# Stack the 3CA rows on top of the CancerSCEM rows, keep only the shared
# columns, and store year and sample count as integers.
combined = pd.concat([ccca, scem])[
    [
        "Tissue",
        "Article",
        "Disease",
        "Technology",
        "N_samples",
        "Source",
        "author_last_name",
        "year",
        "Project ID",
    ]
].astype({"year": int, "N_samples": int})

# Write the merged catalogue, including its row labels, to the working directory.
combined.to_csv("studies.csv")

In [98]:
# Number of studies per sample count, listed from the smallest count upwards.
combined.loc[:, ["N_samples"]].value_counts().sort_index()

N_samples
1             1
2            21
3            19
4            20
5             9
6            15
7            10
8            13
9             1
10           12
11            6
12            6
13            7
14            8
15           10
16            6
17            3
18            9
19            1
20            1
21            1
22            2
23            1
24            3
25            2
26            3
29            3
30            1
31            2
32            3
33            2
36            2
37            2
38            1
40            3
41            1
42            3
43            1
45            1
48            1
50            1
53            1
56            2
58            2
60            1
61            1
66            1
74            1
95            1
100           1
121           1
198           1
Name: count, dtype: int64

In [79]:
# Show the CancerSCEM table (with the Tissue, Technology and Disease columns
# added in earlier cells) for a visual check.
scem

Unnamed: 0.1,Unnamed: 0,Project ID,authors,author_last_name,Article,details,year,full_citation,cancer_type,cancer_type_abbr,...,instrument,GEO accession,other_metadata,Accession No.,N_samples,protocol,Source,Tissue,Technology,Disease
Paulson_2018_skin,0,MCC-001,"K.G. Paulson et al.,",Paulson,https://pubmed.ncbi.nlm.nih.gov/30250229/,Acquired cancer resistance to combination immu...,2018.0,"K.G. Paulson et al.,Acquired cancer resistance...",Merkel Cell Carcinoma,MCC,...,Illumina HiSeq 2500,GSE117988,The patient had a 22 month clinical response f...,GSE117988,2,10X Genomics,CancerSCEM,skin,10x,Merkel cell carcinoma
Paulson_2018_skin,1,MCC-002,"K.G. Paulson et al.,",Paulson,https://pubmed.ncbi.nlm.nih.gov/30250229/,Acquired cancer resistance to combination immu...,2018.0,"K.G. Paulson et al.,Acquired cancer resistance...",Merkel Cell Carcinoma,MCC,...,Illumina NovaSeq 6000,GSE118056,,GSE118056,2,10X Genomics,CancerSCEM,skin,10x,Merkel cell carcinoma
Laughney_2020_lung,2,LUAD-003,"A.M. Laughney et al.,",Laughney,https://pubmed.ncbi.nlm.nih.gov/32042191/,Regenerative lineages and immune-mediated prun...,2020.0,"A.M. Laughney et al.,Regenerative lineages and...",Lung Adenocarcinoma,LUAD,...,Illumina HiSeq 2500,GSE123904,No smoke,GSE123904,13,10X Genomics,CancerSCEM,lung,10x,Lung adenocarcinoma
Lambrechts_2018_lung,3,LUAD-004,"D. Lambrechts et al.,",Lambrechts,https://pubmed.ncbi.nlm.nih.gov/29988129/,Phenotype molding of stromal cells in the lung...,2018.0,"D. Lambrechts et al.,Phenotype molding of stro...",Lung Adenocarcinoma,LUAD,...,HiSeq4000,,Tumor edge; Former smokers,E-MTAB-6149,6,10X Genomics,CancerSCEM,lung,10x,Lung adenocarcinoma
Lambrechts_2018_lung,4,LUSC-005,"D. Lambrechts et al.,",Lambrechts,https://pubmed.ncbi.nlm.nih.gov/29988129/,Phenotype molding of stromal cells in the lung...,2018.0,"D. Lambrechts et al.,Phenotype molding of stro...",Lung Squamous Cell Carcinoma,LUSC,...,HiSeq4000,,Tumor core; Active smokers and some had mild c...,E-MTAB-6149,6,10X Genomics,CancerSCEM,lung,10x,Lung Squamous Cell Carcinoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Li_2023_head-and-neck,104,HSCC-130,C. Li et al.,Li,https://pubmed.ncbi.nlm.nih.gov/37503341/,Single-cell RNA sequencing reveals tumor immun...,2023.0,,Hypopharygeal Squamous Cell Carcinoma,HSCC,...,Illumina NovaSeq 6000,GSE227156,,GSE227156,10,GEXSCOPE,CancerSCEM,head-and-neck,GEXSCOPE,Hypopharygeal Squamous Cell Carcinoma
Oh_2023_pancreas,105,PDAC-132,"K. Oh et al.,",Oh,https://pubmed.ncbi.nlm.nih.gov/37633924/,Coordinated single-cell tumor microenvironment...,2023.0,"K. Oh et al.,Coordinated single-cell tumor mic...",Pancreatic Ductal Adenocarcinoma,PDAC,...,Illumina NextSeq 500,GSE231535,T3N1M0; Pathologic diagnosis: mod. Diff. PDAC;...,GSE231535,2,10X Genomics,CancerSCEM,pancreas,10x,PDAC
Yang_2023_colorectal,106,CRC-133,"M. Yang et al.,",Yang,https://pubmed.ncbi.nlm.nih.gov/37576892/,Single-cell analysis reveals cellular reprogra...,2023.0,"M. Yang et al.,Single-cell analysis reveals ce...",Colorectal Cancer,CRC,...,Illumina NovaSeq 6000,GSE232525,,GSE232525,2,10X Genomics,CancerSCEM,colorectal,10x,Colorectal cancer
Song_2023_breast,107,BRCA-134,"Q. Song et al.,",Song,https://pubmed.ncbi.nlm.nih.gov/37479733/,Single-cell sequencing reveals the landscape o...,2023.0,"Q. Song et al.,Single-cell sequencing reveals ...",Breast Cancer,BRCA,...,NextSeq 500,GSE234832,Invasive ductal carcinoma with micropapillary ...,GSE234832,3,10X Genomics,CancerSCEM,breast,10x,Breast cancer
