In [None]:
source(paste0(dirname(getwd()),'/map.r'))
source(paste0(HELP_DIR, "shortcuts.r"))

### 0 - Prepared biomarker data

In [None]:
# Load the base biomarker table from SHARE_DIR and derive two extra columns.
# fi / mu (and the other two-letter verbs used below) come from shortcuts.r;
# presumably they alias dplyr filter / mutate - confirm against that file.
# The fi() step is given two conditions: bestOverallResponse and purity not NA.
# nrBor and nrDcb are abs(x - 1) of bestOverallResponse and
# durableClinicalBenefit, so a value of 1 becomes 0 and a value of 0 becomes 1.
# NOTE(review): assumes both source columns are 0/1 flags - confirm.
ready <- 
fread(paste0(SHARE_DIR, "biomarkers_base.csv")) %>% 
 fi(!is.na(bestOverallResponse), !is.na(purity)) %>% 
 mu(nrBor = abs(bestOverallResponse-1), nrDcb = abs(durableClinicalBenefit-1)) 

### 1 - Updating features

In [None]:
# Names of the candidate feature columns: columns matched by contains("_"),
# with lilac_hla_supertype removed.
# (`se` is an alias from shortcuts.r, presumably dplyr::select - confirm.)
base_features <-
  ready %>%
  se(contains("_")) %>%
  se(-lilac_hla_supertype) %>%
  names()

In [None]:
# Size threshold used throughout: groups are kept only where their count is > K.
K <- 100

In [None]:
# Split the base features in two:
#   bin_features     - columns whose values are all in {0, 1, NA}
#   non_bin_features - every other base feature column
bin_features <-
  ready %>%
  se(all_of(base_features)) %>%
  select(where(function(column) all(column %in% c(0, 1, NA))))
non_bin_features <-
  ready %>%
  se(all_of(base_features)) %>%
  select(!where(function(column) all(column %in% c(0, 1, NA))))

In [None]:
# Convert every logical column of `df` to integer (TRUE -> 1L, FALSE -> 0L,
# NA stays NA); all other columns are returned untouched.
#
# df: a data frame (or data-frame-like object supporting `df[] <-`).
# Returns `df` with the same columns, names and container class.
integerer <- function(df) {
  logical_to_integer <- function(column) {
    if (!is.logical(column)) {
      return(column)
    }
    as.integer(column)
  }
  # Assigning into df[] keeps the original container and its attributes.
  df[] <- lapply(df, logical_to_integer)
  df
}

- Group continuous features into binary indicators (above median, above the 75th percentile, below the 25th percentile)

In [None]:
# Indicator per non-binary feature: 1 where the value is strictly above the
# column median, 0 otherwise, NA where the value is NA. Columns get a "_med"
# suffix.
features_med <-
  non_bin_features %>%
  mu(across(everything(), function(x) x > quantile(x, 0.5, na.rm = TRUE))) %>%
  integerer()
names(features_med) <- paste0(names(features_med), "_med")

In [None]:
# Indicator per non-binary feature: 1 where the value is strictly above the
# column's 75th percentile, 0 otherwise, NA where the value is NA. Columns get
# a "_high" suffix.
features_high <-
  non_bin_features %>%
  mu(across(everything(), function(x) x > quantile(x, 0.75, na.rm = TRUE))) %>%
  integerer()
names(features_high) <- paste0(names(features_high), "_high")

In [None]:
# Indicator per non-binary feature: 1 where the value is strictly below the
# column's 25th percentile, 0 otherwise, NA where the value is NA. Columns get
# a "_low" suffix.
features_low <-
  non_bin_features %>%
  mu(across(everything(), function(x) x < quantile(x, 0.25, na.rm = TRUE))) %>%
  integerer()
names(features_low) <- paste0(names(features_low), "_low")

- Scale continuous features

In [None]:
# Scaled copies of purity and the base features, keeping only columns that are
# not entirely 0/1/NA. scale() is called with its defaults and the result is
# converted to a tibble; columns get a "_cont" suffix.
features_cont <-
  ready %>%
  se(purity, all_of(base_features)) %>%
  select(!where(function(column) all(column %in% c(0, 1, NA)))) %>%
  scale() %>%
  as_tibble()
names(features_cont) <- paste0(names(features_cont), "_cont")

In [None]:
# Assemble the full feature table column-wise: scaled continuous features, the
# three quantile indicators (_low, _med, _high) and the original binary features.
features_prep <- cbind(features_cont, features_low, features_med, features_high, bin_features )

In [None]:
# Swap the raw base feature columns in `ready` for the prepared ones:
# drop base_features, then bind features_prep alongside the remaining columns.
ready <- cbind(ready %>% se(-all_of(base_features)), features_prep)

### Just look at response in cohorts

In [None]:
# Quick look (result is printed, not stored): per concatenatedTreatmentMechanism,
# the row count and the sums of nrBor and nrDcb, keeping groups with tot > 50.
# NOTE(review): later cells group on treatmentsMechanism instead - confirm the
# different column here is intentional.
# NOTE(review): sum() is called without na.rm, so an NA in nrBor/nrDcb makes the
# group's total NA.
ready %>%
 gb(concatenatedTreatmentMechanism) %>%
 su(tot = n(), 
    non_response_bor = sum(nrBor), 
    non_response_dcb = sum(nrDcb)) %>% 
 fi(tot > 50)

### 01 - Look at non-responder counts

In [None]:
# Non-responder counts per treatment mechanism and 0/1 feature.
#
# Takes the binary features plus the _med / _low / _high indicators, reshapes
# them to long form (one row per patient x feature, value in `val`) and
# summarises within each (treatmentsMechanism, feature) group:
#   total_patient / total_non_responders / pct_non_responder - group-level counts
#   tot_feature                      - sum of val
#   non_responders_given_feature     - sum of nrDcb * val
#   pct_non_responder_given_feature  - mean nrDcb over rows where val == 1
#   non_responders_no_feature        - total non-responders minus those with feature
#   pct_non_responder_no_feature     - mean nrDcb over rows where val == 0
#
# Fix: the binary feature names come from `bin_features` (the object built
# earlier in this notebook); the previous `binary_features` is not defined here.
# NOTE(review): sums and the unconditional mean use no na.rm, so an NA in nrDcb
# or val propagates into those columns.
cts <-
  ready %>%
  se(
    treatmentsMechanism,
    nrDcb,
    all_of(c(
      names(bin_features),
      names(features_med),
      names(features_low),
      names(features_high)
    ))
  ) %>%
  ga(feature, val, -nrDcb, -treatmentsMechanism) %>%
  gb(treatmentsMechanism, feature) %>%
  su(
    total_patient = n(),
    total_non_responders = sum(nrDcb),
    pct_non_responder = mean(nrDcb),
    tot_feature = sum(val),
    non_responders_given_feature = sum(nrDcb * val),
    pct_non_responder_given_feature = mean(ifelse(val == 1, nrDcb, NA), na.rm = TRUE),
    non_responders_no_feature = total_non_responders - non_responders_given_feature,
    pct_non_responder_no_feature = mean(ifelse(val == 0, nrDcb, NA), na.rm = TRUE)
  ) %>%
  ug() %>%
  ar(desc(pct_non_responder), desc(total_non_responders))

### 1 - Define cohorts

- Exact

In [None]:
# Exact-match cohorts: the treatmentsMechanism values whose row count in `ready`
# is greater than K.
top_mechanisms <- 
ready %>% 
 gb(treatmentsMechanism) %>% 
 su(ct = n()) %>% 
 fi(ct > K) %>% 
 pu(treatmentsMechanism)

In [None]:
# Build a named list with one entry per top mechanism; each entry holds the rows
# of `ready` whose treatmentsMechanism equals that mechanism exactly.
mechanism_exact <- list()
for( i in top_mechanisms){
 mechanism_exact[[i]] <- ready %>% fi(treatmentsMechanism == i)   
}

- Contains

In [None]:
# Pull the list of mechanism names from the drug table.
# get_query_data is defined outside this file; presumably it runs the quoted
# command against the production database and returns the result as a table -
# confirm.
# Steps: count rows per (patientId, mechanism); then per mechanism count how many
# of those per-patient counts exceed K; order by that count descending and keep
# the mechanism column.
# NOTE(review): no filter is applied here, so every mechanism is returned and the
# `ct > K` count only influences ordering - confirm that is the intent.
raw_mechanisms <- 
get_query_data("execute_sql_on_prod 'select * from hmfpatients.drug'") %>% 
 gb(patientId, mechanism) %>% 
 su(ct = n(), .groups = "drop") %>% 
 gb(mechanism) %>% 
 su(ct = sum(ct > K), .groups = "drop") %>% 
 ar(desc(ct)) %>% 
 pu(mechanism)

In [None]:
# Reduce each mechanism name to its first space-separated word.
raw_mechanisms <- unlist(
  lapply(raw_mechanisms, function(mechanism) {
    words <- strsplit(mechanism, " ")[[1]]
    words[1]
  })
)

In [None]:
# "Contains" cohorts: for every distinct mechanism keyword, count the rows of
# `ready` whose treatmentsMechanism matches it, then keep the keywords matching
# more than K rows, most frequent first.
#
# The count table is built in one vectorised pass instead of growing a data
# frame with rbind() inside a loop; it also always has `trt` and `ct` columns,
# so the pipeline below still works when there are no keywords.
# grepl() treats the keyword as a regular expression (as before). An NA keyword
# matches nothing and gets a count of 0.
# NOTE: this reassigns `cts`, replacing the non-responder table created earlier.
cts <- data.frame(
  trt = as.character(unique(raw_mechanisms)),
  stringsAsFactors = FALSE
)
cts$ct <- vapply(
  cts$trt,
  function(keyword) {
    sum(grepl(keyword, ready$treatmentsMechanism), na.rm = TRUE)
  },
  integer(1),
  USE.NAMES = FALSE
)
top_mechanisms_contain <- cts %>% ar(desc(ct)) %>% fi(ct > K) %>% pu(trt)

In [None]:
# Build a named list with one entry per keyword in top_mechanisms_contain; each
# entry holds the rows of `ready` whose treatmentsMechanism matches the keyword.
# grepl() interprets the keyword as a regular expression, so this is a
# "contains" match rather than an equality test.
mechanism_contain <- list()
for( i in top_mechanisms_contain){
 mechanism_contain[[i]] <- ready %>% fi( grepl(i, treatmentsMechanism))   
}

- Tissue specific

In [None]:
# For each named cohort table, list the values of its `cohort` column that occur
# in more than K rows (K is read from the calling environment).
#
# cohort_list: named list of data frames, each with a `cohort` column.
# Returns a list keyed by the same names; entries whose name is NA are skipped.
tissue_cts <- function(cohort_list) {
  top <- list()
  for (cohort_name in names(cohort_list)) {
    if (is.na(cohort_name)) {
      next
    }
    rows_per_cohort <-
      cohort_list[[cohort_name]] %>%
      gb(cohort) %>%
      su(ct = n())
    top[[cohort_name]] <-
      rows_per_cohort %>%
      fi(ct > K) %>%
      pu(cohort)
  }
  top
}

In [None]:
# Split each cohort table into per-`cohort` sub-tables.
#
# cohort_list: named list of data frames, each with a `cohort` column.
# top_list:    list keyed by the same names, giving the `cohort` values to keep
#              for each table (as produced by tissue_cts()).
# Returns a nested list, storage[[name]][[cohort_value]], holding the rows of
# cohort_list[[name]] where cohort equals that value. Names with no selected
# values (missing or empty entry in top_list) are omitted.
storer <- function(cohort_list, top_list) {
  storage <- list()
  for (i in names(cohort_list)) {
    selected <- top_list[[i]]
    # Test the length of the selection itself; the previous form measured the
    # length of the comparison `selected > 0`, which only worked by accident.
    if (length(selected) > 0) {
      for (j in selected) {
        storage[[i]][[j]] <- cohort_list[[i]] %>% fi(cohort == j)
      }
    }
  }
  storage
}

In [None]:
# Per mechanism cohort, the `cohort` values with more than K rows, for both the
# exact-match and the contains-match cohort lists.
top_exact <- tissue_cts(mechanism_exact)
top_contains <- tissue_cts(mechanism_contain)

In [None]:
# Split each mechanism cohort into the per-`cohort` sub-tables selected above.
tissue_exact <- storer(mechanism_exact, top_exact)
tissue_contains <- storer(mechanism_contain, top_contains)

In [None]:
#top_contains

### 2 - Send them!

In [None]:
# Bundle everything to be saved: the four cohort collections plus the names of
# the prepared feature columns.
go <- list(
  exact = mechanism_exact,
  contains = mechanism_contain,
  tissue_exact = tissue_exact,
  tissue_contains = tissue_contains,
  features = names(features_prep)
)

In [None]:
# Inspection only: names of the per-cohort sub-tables stored under the
# 'Anti-PD-1' entry of tissue_exact.
names(go$"tissue_exact"$'Anti-PD-1')

In [None]:
# Write the bundle to the current working directory (relative path).
saveRDS(go, "biomarker_cohorts2.Rds")

### 3 - Compare

In [None]:
# Load "biomarker_cohorts.Rds" (a different file from the one written above)
# from the working directory, for comparison.
old <- readRDS("biomarker_cohorts.Rds")