In [None]:
source(paste0(dirname(getwd()),'/map.r'))
source(paste0(HELP_DIR, "shortcuts.r"))

### 0 - Prepared biomarker data

In [None]:
# Load the base biomarker table, keep only rows with a non-missing `purity`,
# and derive two extra columns as abs(x - 1) of the response columns
# (this flips 0 <-> 1 if those columns are 0/1 coded -- TODO confirm coding).
# NOTE(review): `fi` / `mu` come from shortcuts.r; presumably aliases for
# dplyr::filter / dplyr::mutate -- verify there.
ready <- fread(paste0(SHARE_DIR, "biomarkers_base.csv")) %>%
  fi(!is.na(purity)) %>%
  mu(
    nrBor = abs(bestOverallResponse - 1),
    nrDcb = abs(durableClinicalBenefit - 1)
  )

In [None]:
#ready %>% se(contains("fusion"))

### 1 - Updating features

In [None]:
# Names of every column in `ready` whose name contains one of the
# feature-family prefixes listed below (selector order is kept as-is because
# it determines the order of the resulting names).
# NOTE(review): `se` comes from shortcuts.r; presumably dplyr::select.
base_features <- ready %>%
  se(
    contains("cider_"),
    contains("clin_"),
    contains("cn_"),
    contains("driver_"),
    contains("fusion_"),
    contains("gie_"),
    contains("lilac_"),
    contains("neo_"),
    contains("chord_"),
    contains("purity_"),
    contains("rna_"),
    contains("sv_"),
    contains("teal_"),
    contains("viral_")
  ) %>%
  names()

In [None]:
# Count features per family: take the text before the first "_" of each
# feature name and tabulate it.
table(unlist(lapply(strsplit(base_features, "_"), `[`, 1)))

In [None]:
# Split the base features in two: columns whose values are all in {0, 1, NA}
# (treated as already binary) and every other column.
bin_features <- ready %>%
  se(all_of(base_features)) %>%
  select(where(function(col) all(col %in% c(0, 1, NA))))
non_bin_features <- ready %>%
  se(all_of(base_features)) %>%
  select(!where(function(col) all(col %in% c(0, 1, NA))))

In [None]:
# Convert every logical column of `df` to integer (TRUE -> 1L, FALSE -> 0L,
# NA stays NA). Columns of any other type are returned untouched.
# Assigning through `df[]` keeps the container's class and attributes.
integerer <- function(df) {
  logical_to_int <- function(column) {
    if (is.logical(column)) {
      return(as.integer(column))
    }
    column
  }
  df[] <- lapply(df, logical_to_int)
  df
}

In [None]:
# Turn every column of `df` into a 0/1 flag relative to one of its own
# quantiles.
#
# df        : data frame whose columns can all be passed to quantile().
# threshold : percentile (0-100) used as the per-column cut-off.
# direction : "gt" flags values strictly above the cut-off; any other value
#             flags values strictly below it.
#
# Returns the flagged columns as integers, each renamed to
# "<original>_gt<threshold>" or "<original>_lt<threshold>". Missing values
# are ignored when computing the quantile.
# NOTE(review): `mu` comes from shortcuts.r; presumably dplyr::mutate.
binarify <- function(df, threshold = 50, direction = "gt") {
  suffix_value <- as.character(threshold)

  if (direction != "gt") {
    below <- df %>%
      mu(across(everything(), ~ (. < quantile(., threshold/100, na.rm = TRUE)))) %>%
      rename_with(function(nm) paste0(nm, "_lt", suffix_value))
    return(integerer(below))
  }

  above <- df %>%
    mu(across(everything(), ~ (. > quantile(., threshold/100, na.rm = TRUE)))) %>%
    rename_with(function(nm) paste0(nm, "_gt", suffix_value))
  integerer(above)
}

### Bin the continuous features into categorical

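- Binarise each continuous feature at its 50th percentile (greater-than and less-than), then extract the `smooth` features — those whose greater-than-median flag is set in roughly half of the samples (48–52%) — which are further refined at the 25th and 75th percentiles.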

In [None]:
# Median split of every non-binary feature, in both directions.
gt50 <- binarify(non_bin_features, threshold = 50, direction = "gt")
lt50 <- binarify(non_bin_features, threshold = 50, direction = "lt")

In [None]:
# Features whose "_gt50" flag has a mean strictly between 0.48 and 0.52
# (NAs ignored), reported under their original name, i.e. the text before
# the "_gt50" suffix.
smooth_features <- local({
  share_above <- apply(gt50, 2, mean, na.rm = TRUE)
  # which() drops NA comparisons, e.g. for columns that are entirely NA.
  balanced <- share_above[which(.48 < share_above & share_above < .52)]
  unlist(lapply(names(balanced), function(flag_name) {
    strsplit(flag_name, "_gt50")[[1]][1]
  }))
})

In [None]:
# Quartile splits (25th / 75th percentile, both directions), restricted to
# the smooth features that are present in `non_bin_features`.
gt25 <- non_bin_features %>%
  se(any_of(smooth_features)) %>%
  binarify(threshold = 25, direction = "gt")
gt75 <- non_bin_features %>%
  se(any_of(smooth_features)) %>%
  binarify(threshold = 75, direction = "gt")
lt25 <- non_bin_features %>%
  se(any_of(smooth_features)) %>%
  binarify(threshold = 25, direction = "lt")
lt75 <- non_bin_features %>%
  se(any_of(smooth_features)) %>%
  binarify(threshold = 75, direction = "lt")

- Scale continuous features

In [None]:
# Continuous copies of `purity` and the base features: drop columns whose
# values are all in {0, 1, NA}, centre and scale the rest with scale()'s
# defaults, and tag each resulting column with a "_cont" suffix.
features_cont <- ready %>%
  se(purity, all_of(base_features)) %>%
  select(!where(function(col) all(col %in% c(0, 1, NA)))) %>%
  scale() %>%
  as_tibble()
names(features_cont) <- paste0(names(features_cont), "_cont")

### Combine features and save

In [None]:
# Column-bind every derived feature set: original binary features, scaled
# continuous features, then the percentile flags.
features_ready <- cbind(
  bin_features,
  features_cont,
  gt50,
  lt50,
  gt25,
  gt75,
  lt25,
  lt75
)

In [None]:
# Replace the raw base-feature columns in `ready` with the processed set.
ready <- cbind(
  se(ready, -all_of(base_features)),
  features_ready
)

In [None]:
# Write the final table to the shared directory.
fwrite(
  x = ready,
  file = paste0(SHARE_DIR, "biomarkers_ready.csv")
)

### Visualise Cohorts