In [None]:
source(paste0(dirname(getwd()),'/map.r'))
source(paste0(HELP_DIR, "shortcuts.r"))

In [None]:
library(forcats)

In [None]:
# Directory that holds the metadata inputs and the cohort outputs of this
# notebook. NOTE(review): I_DIR is not defined here — presumably set by the
# sourced map.r; confirm.
META_DIR <- paste0(I_DIR, "metadata/")

### 0 - Read in data to define cohorts

- Metadata

In [None]:
# Load the sample metadata and keep only the columns used to define cohorts.
# NOTE(review): se() comes from the sourced shortcuts — presumably an alias
# for dplyr::select; confirm.
meta <-
  paste0(META_DIR, "metadata_update_feb21_2025.csv") %>%
  fread() %>%
  se(
    sampleId,
    primaryTumorLocation,
    primaryTumorType,
    primaryTumorExtraDetails
  )

- CUPPA output

In [None]:
# Load the processed CUPPA output and reduce it to one prediction/probability
# pair per row. The combined classifier result is preferred; the DNA-only
# result is used when the combined one is absent.
#
# A combined class counts as absent when it is NA *or* an empty string. The
# NA case matters because fread() can return NA for blank character fields,
# and it keeps the class fallback consistent with the NA-based fallback used
# for the probability on the next line.
#
# NOTE(review): mu()/se() come from the sourced shortcuts — presumably aliases
# for dplyr::mutate/select; confirm.
cuppa <-
  fread("/mnt/petasan_immunocomp/datasets/hartwig/cuppa/cuppa_update_processed.csv") %>%
  mu(
    cuppaPrediction = ifelse(
      is.na(pred_class_combined_1) | pred_class_combined_1 == "",
      pred_class_dna_1,
      pred_class_combined_1
    ),
    cuppaProb = ifelse(
      is.na(pred_prob_combined_1),
      pred_prob_dna_1,
      pred_prob_combined_1
    )
  ) %>%
  se(sampleId, cuppaPrediction, cuppaProb)

- Combine the metadata with the CUPPA predictions

In [None]:
# Attach the CUPPA prediction and probability to each metadata row, matching
# on sampleId. NOTE(review): lj() comes from the sourced shortcuts —
# presumably an alias for dplyr::left_join; confirm.
go <-
  meta %>%
  lj(cuppa, by = "sampleId")

### 1 - Define cohorts

- Helper functions

In [None]:
# Lookup from a CUPPA predicted class (names) to the cohort label used in this
# notebook (values). Predictions not listed here have no dedicated cohort.
cuppa_map <- c(
  "Lung: Non-small cell: LUAD"        = "Lung NSCLC (LUAD)",
  "Lung: Non-small cell: LUSC"        = "Lung NSCLC (LUSC)",
  "Lung: Small cell"                  = "Lung SCLC",
  "Skin: Melanoma"                    = "Skin Melanoma",
  "NET: Pancreas"                     = "Pancreas NET",
  "HPB: Pancreas"                     = "Pancreas PAAD",
  "Prostate"                          = "Prostate",
  "Breast: Triple negative"           = "Breast Triple Negative",
  "Bone/Soft tissue: Undiff. sarcoma" = "Soft tissue Undifferentiated"
)

In [None]:
# Map the receptor-status text of a breast sample to a breast cohort label.
#
# primaryTumorExtraDetails: a single string (or NA) with the ER/Her2 status.
# Returns one cohort label; NA and unrecognised values both give
# "Breast Unknown/Other".
breast_classifier <- function(primaryTumorExtraDetails) {
  receptor_labels <- c(
    "ER-negative/Her2-negative (triple negative)" = "Breast Triple Negative",
    "ER-positive/Her2-negative" = "Breast ER+/HER-",
    "ER-positive/Her2-positive" = "Breast ER+/HER+",
    "ER-negative/Her2-positive" = "Breast ER-/HER+"
  )

  # match() yields NA both for a missing input and for an unlisted status.
  hit <- match(primaryTumorExtraDetails, names(receptor_labels))
  if (is.na(hit)) {
    return("Breast Unknown/Other")
  }
  receptor_labels[[hit]]
}
# Assign a lung cohort from the recorded histology, using the CUPPA prediction
# only to refine "NSCLC, not otherwise specified" samples.
#
# primaryTumorType: a single histology string, or NA.
# cuppaPrediction:  a single CUPPA predicted class, or NA.
# cuppaProb:        the probability of that prediction, or NA.
# Returns one cohort label. A missing histology gives "Lung Unknown/Other"
# rather than an error from `if (NA == ...)`.
lung_classifier <- function(primaryTumorType, cuppaPrediction, cuppaProb) {
  # `==` on NA yields NA, which `if` rejects, so missing values are handled
  # before any comparison.
  if (is.na(primaryTumorType)) {
    return("Lung Unknown/Other")
  }

  if (primaryTumorType == "Non-small cell carcinoma (NSCLC), adenocarcinoma") {
    "Lung NSCLC (LUAD)"
  } else if (primaryTumorType == "Non-small cell carcinoma (NSCLC), squamous cell carcinoma") {
    "Lung NSCLC (LUSC)"
  } else if (primaryTumorType == "Non-small cell carcinoma (NSCLC), large cell neuroendocrine carcinoma") {
    "Lung NSCLC (LCLC)"
  } else if (primaryTumorType == "Small cell carcinoma (SCLC)") {
    "Lung SCLC"
  } else if (grepl("Neuroendocrine tumor", primaryTumorType)) {
    "Lung NET"
  } else if (primaryTumorType == "Non-small cell carcinoma (NSCLC), not otherwise specified" &&
               !is.na(cuppaProb) && cuppaProb > .9) {
    # Unspecified NSCLC with a high-confidence CUPPA call: take the subtype
    # from the prediction when it names LUAD or LUSC.
    if (grepl("LUAD", cuppaPrediction)) {
      "Lung NSCLC (LUAD)"
    } else if (grepl("LUSC", cuppaPrediction)) {
      "Lung NSCLC (LUSC)"
    } else {
      "Lung NSCLC (Unknown)"
    }
  } else {
    "Lung Unknown/Other"
  }
}
# Assign a skin cohort by case-insensitive substring match on the histology.
#
# primaryTumorType: a single histology string (NA falls through to
# "Skin Other" because grepl() returns FALSE for NA).
# Returns the label of the first matching pattern, else "Skin Other".
skin_classifier <- function(primaryTumorType) {
  tumor_type <- tolower(primaryTumorType)

  # Patterns are tested in this order; the first hit wins.
  pattern_labels <- c(
    "melanoma" = "Skin Melanoma",
    "basal cell carcinoma" = "Skin Basal Cell",
    "squamous cell carcinoma" = "Skin Squamous Cell Carcinoma",
    "merkel cell carcinoma" = "Skin Merkel Cell Carcinoma"
  )
  for (pattern in names(pattern_labels)) {
    if (grepl(pattern, tumor_type)) {
      return(pattern_labels[[pattern]])
    }
  }
  "Skin Other"
}
# Assign a soft-tissue sarcoma cohort from the histology (case-insensitive).
#
# primaryTumorType: a single histology string, or NA.
# Returns one cohort label. Substring patterns are tried first, in order, then
# exact (lower-cased) names. NA and unrecognised values give
# "Soft Tissue Sarcoma Unknown/Other".
soft_tissue_classifier <- function(primaryTumorType) {
  fallback <- "Soft Tissue Sarcoma Unknown/Other"

  # Without this guard an NA input passes every grepl() test (FALSE) and then
  # errors at the first `==` comparison inside `if`.
  if (is.na(primaryTumorType)) {
    return(fallback)
  }
  tumor_type <- tolower(primaryTumorType)

  # "astrointestinal" matches the lower-cased text wherever it occurs in the
  # histology string.
  substring_labels <- c(
    "leiomyosarcoma" = "Soft tissue Leiomyosarcoma",
    "liposarcoma" = "Soft tissue Liposarcoma",
    "astrointestinal" = "Soft tissue GIST",
    "undifferentiated" = "Soft tissue Undifferentiated"
  )
  for (pattern in names(substring_labels)) {
    if (grepl(pattern, tumor_type)) {
      return(substring_labels[[pattern]])
    }
  }

  exact_labels <- c(
    "myxofibrosarcoma" = "Soft tissue Myxofibrosarcoma",
    "angiosarcoma" = "Soft tissue Angiosarcoma",
    "synovial sarcoma" = "Soft tissue Synovial sarcoma",
    "solitary fibrous tumor" = "Soft tissue Solitary fibrous tumor"
  )
  hit <- match(tumor_type, names(exact_labels))
  if (is.na(hit)) {
    fallback
  } else {
    exact_labels[[hit]]
  }
}
# Assign a pancreas cohort from the histology.
#
# primaryTumorType: a single histology string, or NA.
# Returns "Pancreas NET" for the exact NET label, "Pancreas PAAD" when the
# text contains "Adenocarcinoma" or "Unknown", otherwise "Pancreas Other".
# A missing histology gives "Pancreas Other" instead of an error, matching how
# the grepl-based sibling classifiers treat NA.
pancreas_classifier <- function(primaryTumorType) {
  # `NA == "..."` is NA and would make `if` fail.
  if (is.na(primaryTumorType)) {
    return("Pancreas Other")
  }

  if (primaryTumorType == "Neuroendocrine tumor (NET)") {
    "Pancreas NET"
  } else if (grepl("Adenocarcinoma", primaryTumorType) ||
               grepl("Unknown", primaryTumorType)) {
    "Pancreas PAAD"
  } else {
    "Pancreas Other"
  }
}
# Assign a bladder cohort from the histology (case-insensitive).
#
# primaryTumorType: a single histology string, or NA.
# Returns "Bladder Urothelial" when the text contains "urothelial carcinoma"
# or is exactly "unknown" (ignoring case), otherwise "Bladder Other".
# A missing histology gives "Bladder Other"; previously `FALSE | NA` produced
# an NA condition and `if` raised an error.
bladder_classifier <- function(primaryTumorType) {
  if (is.na(primaryTumorType)) {
    return("Bladder Other")
  }
  tumor_type <- tolower(primaryTumorType)

  if (grepl("urothelial carcinoma", tumor_type) || tumor_type == "unknown") {
    "Bladder Urothelial"
  } else {
    "Bladder Other"
  }
}
# Assign a kidney cohort by substring match on the histology.
#
# primaryTumorType: a single histology string (NA matches nothing).
# Returns "Kidney (ccRCC)", "Kidney (pRCC)" or "Kidney Unknown/Other".
kidney_classifier <- function(primaryTumorType) {
  if (grepl("Clear cell renal cell carcinoma", primaryTumorType)) {
    return("Kidney (ccRCC)")
  }
  if (grepl("Papillary renal cell carcinoma", primaryTumorType)) {
    return("Kidney (pRCC)")
  }
  "Kidney Unknown/Other"
}
# Assign a colorectal cohort from the histology.
#
# primaryTumorType: a single histology string (NA matches nothing).
# Exact names are looked up first; otherwise any text containing "NEC" gives
# "Colorectal NEC", and everything else gives "Colorectal Other".
colorectum_classifier <- function(primaryTumorType) {
  exact_labels <- c(
    "Unknown" = "Colorectal Adenocarcinoma",
    "Adenocarcinoma" = "Colorectal Adenocarcinoma",
    "Mucinous adenocarcinoma" = "Colorectal Mucinous Adenocarcinoma"
  )
  hit <- match(primaryTumorType, names(exact_labels))
  if (!is.na(hit)) {
    return(exact_labels[[hit]])
  }
  if (grepl("NEC", primaryTumorType)) {
    return("Colorectal NEC")
  }
  "Colorectal Other"
}
# Assign a brain/CNS cohort from the histology.
#
# primaryTumorType: a single histology string, or NA.
# Returns "Brain/CNS GBM" or "Brain/CNS Astrocytoma" for the exact labels,
# otherwise "Brain/CNS Unknown/Other". A missing histology also gives
# "Brain/CNS Unknown/Other" instead of an error from `if (NA == ...)`.
brain_classifier <- function(primaryTumorType) {
  exact_labels <- c(
    "Glioblastoma multiforme (GBM)" = "Brain/CNS GBM",
    "Astrocytoma" = "Brain/CNS Astrocytoma"
  )

  # match() returns NA for both a missing and an unlisted histology, so no
  # `==` comparison is ever made against NA.
  hit <- match(primaryTumorType, names(exact_labels))
  if (is.na(hit)) {
    "Brain/CNS Unknown/Other"
  } else {
    exact_labels[[hit]]
  }
}
# Assign a cohort to a cancer-of-unknown-primary sample from its CUPPA call.
#
# cuppaPrediction: a single CUPPA predicted class, or NA.
# cuppaProb:       the probability of that prediction, or NA.
# Returns "CUPPA NA" without a probability, a "CUPPA medium confidence: ..."
# label when the probability is not above 0.9, and otherwise the cuppa_map
# cohort for the prediction (or "CUPPA high confidence: ..." if unmapped).
cup_classifier <- function(cuppaPrediction, cuppaProb) {
  if (is.na(cuppaProb)) {
    return("CUPPA NA")
  }
  if (!(cuppaProb > .9)) {
    return(paste0("CUPPA medium confidence: ", cuppaPrediction))
  }

  # High-confidence call: reuse an existing cohort label when one is mapped.
  hit <- match(cuppaPrediction, names(cuppa_map))
  if (is.na(hit)) {
    return(paste0("CUPPA high confidence: ", cuppaPrediction))
  }
  cuppa_map[[hit]]
}

In [None]:
# Renames applied to primary tumor locations that are used directly as cohort
# labels (names = location as recorded, values = cohort label).
cohort_map <- c(
  "Esophagus/gastroesophageal junction" = "Gastroesophageal"
)

# Return the cohort label for a location that has no dedicated classifier:
# the renamed label when the location is listed in cohort_map, otherwise the
# location itself, unchanged.
cohort_mapper <- function(primaryTumorLocation) {
  hit <- match(primaryTumorLocation, names(cohort_map))
  if (is.na(hit)) {
    return(primaryTumorLocation)
  }
  cohort_map[[hit]]
}

In [None]:
# Assign the final cohort label for one sample.
#
# All arguments are single values taken from one row:
# primaryTumorLocation:     recorded primary tumor location, NA or "NULL".
# primaryTumorType:         recorded histology.
# primaryTumorExtraDetails: extra detail text (used for breast only).
# cuppaPrediction:          CUPPA predicted class.
# cuppaProb:                probability of that prediction.
#
# Returns one cohort label. When the location is NA or the literal string
# "NULL", the histology itself is returned (which may be NA). Locations with a
# dedicated classifier are delegated to it; every other location goes through
# cohort_mapper().
cohort_classifier <- function(primaryTumorLocation, primaryTumorType, primaryTumorExtraDetails, cuppaPrediction, cuppaProb) {
  # Scalar `||` short-circuits, so the `==` is never evaluated on NA.
  if (is.na(primaryTumorLocation) || primaryTumorLocation == "NULL") {
    return(primaryTumorType)
  }

  if (primaryTumorLocation == "Breast") {
    breast_classifier(primaryTumorExtraDetails)
  } else if (primaryTumorLocation == "Lung") {
    lung_classifier(primaryTumorType, cuppaPrediction, cuppaProb)
  } else if (primaryTumorLocation == "Soft tissue") {
    soft_tissue_classifier(primaryTumorType)
  } else if (primaryTumorLocation == "Skin") {
    skin_classifier(primaryTumorType)
  } else if (primaryTumorLocation == "Bladder") {
    bladder_classifier(primaryTumorType)
  } else if (primaryTumorLocation == "Pancreas") {
    pancreas_classifier(primaryTumorType)
  } else if (primaryTumorLocation == "Kidney") {
    kidney_classifier(primaryTumorType)
  } else if (primaryTumorLocation %in% c("Colon", "Rectum")) {
    colorectum_classifier(primaryTumorType)
  } else if (primaryTumorLocation == "Brain/central nervous system") {
    brain_classifier(primaryTumorType)
  } else if (primaryTumorLocation == "Cancer of unknown primary (CUP)") {
    # The "NULL" location never reaches this branch: it is returned by the
    # guard at the top, so only the CUP label is tested here.
    cup_classifier(cuppaPrediction, cuppaProb)
  } else {
    cohort_mapper(primaryTumorLocation)
  }
}

### 1 - Run the cohort classification

In [None]:
# Classify every sample, then drop duplicate rows and rows without a cohort.
# cohort_classifier() works on single values, so it is applied per row.
# NOTE(review): rw()/mu()/ug() come from the sourced shortcuts — presumably
# aliases for dplyr::rowwise/mutate/ungroup; confirm.
cohorts_ready <-
  go %>%
  rw() %>%
  mu(
    cohort = cohort_classifier(
      primaryTumorLocation,
      primaryTumorType,
      primaryTumorExtraDetails,
      cuppaPrediction,
      cuppaProb
    )
  ) %>%
  ug() %>%
  unique() %>%
  drop_na(cohort)

In [None]:
# Write a per-location / per-cohort sample count table to "summary.csv" in the
# working directory. `ct` is the count for the (location, cohort) pair and
# `tot` the sum of `ct` within the location.
# NOTE(review): gb()/su()/mu()/ar() come from the sourced shortcuts —
# presumably aliases for dplyr::group_by/summarise/mutate/arrange; confirm.
cohorts_ready %>%
  gb(primaryTumorLocation, cohort) %>%
  su(ct = n()) %>%
  gb(primaryTumorLocation) %>%
  mu(tot = sum(ct)) %>%
  ar(desc(tot), desc(ct)) %>%
  fwrite("summary.csv")

In [None]:
# Set the notebook plot size (repr options) for the figure below.
options(
  repr.plot.width = 12,
  repr.plot.height = 6
)

In [None]:
# Bar chart of sample counts per cohort, split into two facets: cohorts from
# the six locations listed below versus all remaining locations.
# `same` is the facet label; `colors` is the fill group (the location itself
# for the listed locations, a single "Location Defined" group otherwise).
# NOTE(review): mu()/gb()/su()/ar() and go_theme come from the sourced
# scripts — presumably dplyr aliases and a ggplot theme; confirm.
cohorts_ready %>%
  mu(
    same = ifelse(
      primaryTumorLocation %in% c("Breast", "Lung", "Soft tissue", "Skin", "Bladder", "Pancreas"),
      "Location + Granular Definition",
      "Primary Tumor Location Based"
    )
  ) %>%
  mu(
    colors = ifelse(
      same == "Primary Tumor Location Based",
      "Location Defined",
      primaryTumorLocation
    )
  ) %>%
  gb(primaryTumorLocation, cohort, same, colors) %>%
  su(ct = n()) %>%
  ar(desc(ct)) %>%
  ggplot(aes(y = fct_reorder(cohort, ct), x = ct, fill = colors)) +
  geom_bar(stat = "identity", color = "black") +
  facet_wrap(~same, scales = "free") +
  go_theme +
  labs(title = "Cohort Definition Counts", x = "Samples", y = "") +
  theme(legend.position = c(0.3, 0.4))

### 2 - Send prepared cohorts file

In [None]:
# Save the classified samples as cohorts/cohorts_ready.csv under META_DIR.
cohorts_ready %>%
  fwrite(paste0(META_DIR, "cohorts/cohorts_ready.csv"))

In [None]:
# Display the full path of the cohorts file written above.
paste0(
  META_DIR,
  "cohorts/cohorts_ready.csv"
)

In [None]:
# Display the cohorts file path again (this cell repeats the previous one).
paste0(
  META_DIR,
  "cohorts/cohorts_ready.csv"
)