In [1]:
suppressMessages(source("exp/nb_04.R"))

“package ‘survival’ was built under R version 4.1.1”


### Load and prepare dataframes

In [None]:
# Read the pre-filtered dataframes from disk.
dfs <- readRDS(file = "../data/dfs_filtered.rds")

In [None]:
# Apply impute_and_clean to every dataframe in the list.
dfs <- map(.x = dfs, .f = impute_and_clean)

In [None]:
#' Keep only the feature columns whose standard deviation exceeds a threshold.
#'
#' Columns 1 and 2 are skipped when computing standard deviations; every
#' column from position 3 onwards is treated as a candidate feature. The
#' outcome columns are then re-selected by name.
#'
#' @param df Data frame with `survival_time` and `event` columns and the
#'   candidate features from the third column onwards.
#' @param thresh Numeric scalar; a feature is kept when its sd is strictly
#'   greater than `thresh`.
#' @return `df` restricted to `survival_time`, `event` and the retained
#'   features (in their original order).
sd_filter <- function(df, thresh){
    # Positional selection that stays empty (instead of counting backwards
    # like 3:ncol(df)) when df only holds the two outcome columns.
    feature_idx <- seq_len(ncol(df))[-(1:2)]
    # vapply guarantees a numeric vector even when there are no features.
    feature_sd <- vapply(df[feature_idx], sd, numeric(1))
    # A feature with an undefined sd (NA) is dropped; indexing with the raw
    # comparison would yield an NA column name and fail.
    keep <- names(feature_sd)[!is.na(feature_sd) & feature_sd > thresh]
    df[, c("survival_time", "event", keep), drop = FALSE]
}

In [None]:
# Default filter: sd_filter with the threshold fixed at 0.2.
σ <- purrr::partial(sd_filter, thresh = 0.2)

In [None]:
# Filter every dataframe with the default threshold first ...
dfs <- map(.x = dfs, .f = σ)
# ... then re-filter the 3rd and 7th dataframes with higher thresholds.
dfs[[3]] <- sd_filter(df = dfs[[3]], thresh = 0.6)
dfs[[7]] <- sd_filter(df = dfs[[7]], thresh = 2.0)

In [None]:
#map(dfs, function(df) dim(df)[2])

In [None]:
# Persist the sd-filtered dataframes so the cells below can start from disk.
saveRDS(object = dfs, file = "../data/dfs_filtered_for_feature_selection.rds")

### Load the filtered dataframes

In [2]:
# Reload the sd-filtered dataframes.
dfs <- readRDS(file = "../data/dfs_filtered_for_feature_selection.rds")

### Learning algorithms

In [3]:
#export
# Identifiers of the survival learners. They are used as learner ids and as
# the names of the `learners` list.
learning_algs <- c(
    "Cox_PH_model",
    "Ridge",
    "Elastic_Net",
    "Lasso",
    "Gradient_Boosting_tree_based",
    "Gradient_Boosting_linear_model_based",
    "Random_Survival_Forests",
    "Maximally_selected_rank_statistics_Random_Forests",
    "Survival_Tree"
)

#export
# One mlr learner per entry of `learning_algs`, in the same order.
learners <- list(
    makeLearner("surv.coxph",           id = learning_algs[[1]]),
    # NOTE(review): Ridge does not set `s`, unlike the two glmnet learners
    # below which use s = "lambda.min" -- confirm this is intended.
    makeLearner("surv.cvglmnet",        id = learning_algs[[2]], alpha = 0,   nfolds = 20),
    makeLearner("surv.cvglmnet",        id = learning_algs[[3]], alpha = 0.5, nfolds = 20, s = "lambda.min"),
    makeLearner("surv.cvglmnet",        id = learning_algs[[4]], alpha = 1,   nfolds = 20, s = "lambda.min"),
    # mboost base-learners: "btree" is the tree base-learner and "bols" the
    # linear one. They were previously crossed with respect to the ids, so
    # results stored under these two ids before this fix belong to the other
    # base-learner.
    makeLearner("surv.gamboost",        id = learning_algs[[5]], baselearner = "btree"),
    makeLearner("surv.gamboost",        id = learning_algs[[6]], baselearner = "bols"),
    makeLearner("surv.randomForestSRC", id = learning_algs[[7]]),
    # NOTE(review): the id refers to maximally selected rank statistics but
    # no split rule is set here -- confirm whether splitrule = "maxstat"
    # was intended.
    makeLearner("surv.ranger",          id = learning_algs[[8]]),
    makeLearner("surv.rpart",           id = learning_algs[[9]])
)
names(learners) <- learning_algs

### Metalearner with feature selections

In [4]:
#export
#' Cross-validate a survival learner wrapped with a feature-selection filter.
#'
#' Builds a survival task on `df` (targets `survival_time` and `event`),
#' wraps `learner` with `makeFilterWrapper()` for the requested filter and
#' resamples the wrapped learner with cross-validation.
#'
#' @param df Data frame with `survival_time`, `event` and feature columns.
#' @param learner An mlr learner; also passed as `perf.learner` for the
#'   `"univariate_model_score"` selector.
#' @param feature_selector Single string, one of "univariate_model_score",
#'   "mrmr", "randomForestSRC_importance", "randomForestSRC_var_select_md",
#'   "randomForestSRC_var_select_vh", "party_cforest_importance".
#' @param n_features Number of features kept by the filter (`fw.abs`).
#' @param cv_iters Number of cross-validation iterations.
#' @return The result of `resample()` (called with `models = TRUE`).
metalearner <- function(df, learner, feature_selector, n_features = 20, cv_iters = 5){
    feature_selectors <- c("univariate_model_score", "mrmr", "randomForestSRC_importance",
                           "randomForestSRC_var_select_md", "randomForestSRC_var_select_vh",
                           "party_cforest_importance")

    # Validate before doing any work, and list the options with separators
    # so the message is readable.
    is_valid <- is.character(feature_selector) &&
        length(feature_selector) == 1L &&
        feature_selector %in% feature_selectors
    if (!is_valid){
        stop("feature_selector must be one of: ",
             paste(feature_selectors, collapse = ", "),
             call. = FALSE)
    }

    task  <- makeSurvTask(data = df, target = c("survival_time", "event"))
    inner <- makeResampleDesc("CV", iters = cv_iters)

    # Only the selected branch is evaluated by switch().
    lrn <- switch(
        feature_selector,
        univariate_model_score = makeFilterWrapper(
            learner = learner, fw.method = "univariate.model.score",
            fw.abs = n_features, perf.learner = learner),
        mrmr = makeFilterWrapper(
            learner = learner, fw.method = "mrmr", fw.abs = n_features),
        randomForestSRC_importance = makeFilterWrapper(
            learner = learner, fw.method = "randomForestSRC_importance",
            fw.abs = n_features),
        randomForestSRC_var_select_md = makeFilterWrapper(
            learner = learner, fw.method = "randomForestSRC_var.select",
            fw.abs = n_features,
            more.args = list("randomForestSRC_var.select" = list(method = "md"))),
        randomForestSRC_var_select_vh = makeFilterWrapper(
            learner = learner, fw.method = "randomForestSRC_var.select",
            fw.abs = n_features,
            more.args = list("randomForestSRC_var.select" = list(method = "vh"))),
        party_cforest_importance = makeFilterWrapper(
            learner = learner, fw.method = "party_cforest.importance",
            fw.abs = n_features)
    )

    resample(learner = lrn, task = task, resampling = inner,
             models = TRUE, show.info = FALSE)
}

In [5]:
# Feature-selection methods iterated over by the run loop.
feature_selectors <- c(
    "univariate_model_score",
    "mrmr",
    "randomForestSRC_importance",
    "randomForestSRC_var_select_md",
    "randomForestSRC_var_select_vh",
    "party_cforest_importance"
)

In [6]:
# Directory prefix (with trailing slash) for the saved resampling results.
data_path <- "../data/metalearners/metalearners_from_feature_selections/"

In [11]:
# Build the result-file path for one example combination:
# first dataframe, first learner, second feature selector.
d <- 1
l <- 1
f <- 2
filename <- paste0(
    data_path, "ml_",
    names(dfs)[[d]], "_",
    names(learners)[[l]], "_",
    feature_selectors[[f]],
    ".rds"
)

## RUN

In [None]:
# Run metalearner() for every (dataframe, learner, feature selector)
# combination and store each result as an .rds file under `data_path`.
# Combinations whose file already exists are skipped, so the loop can be
# interrupted and resumed.
TIME = Sys.time()

# Feature selections except the first one (univariate_model_score).
selector_idx <- seq_along(feature_selectors)[-1]

# Make sure the output directory exists before any model is fitted.
dir.create(data_path, recursive = TRUE, showWarnings = FALSE)

# The bar tracks the number of processed combinations; max() keeps
# txtProgressBar valid when there is nothing to do.
n_jobs <- length(dfs) * length(learners) * length(selector_idx)
pb     <- txtProgressBar(min = 0, max = max(n_jobs, 1), style = 3)
n_done <- 0

for (d in seq_along(dfs)){                         # dataframes
    print(names(dfs)[[d]])
    for (l in seq_along(learners)){                # learners
        for (f in selector_idx){                   # feature selections
            filename <- paste0(data_path, "ml_",
                               names(dfs)[[d]], "_",
                               names(learners)[[l]], "_",
                               feature_selectors[[f]],
                               ".rds")

            if (!file.exists(filename)){
                df               <- dfs[[d]]
                learner          <- learners[[l]]
                feature_selector <- feature_selectors[[f]]

                res <- metalearner(df, learner, feature_selector)

                saveRDS(res, filename)
            }

            n_done <- n_done + 1
            setTxtProgressBar(pb, n_done)
        }
    }
}
close(pb)

[1] "METABRIC"

Loading required package: mboost

Loading required package: stabs


Attaching package: ‘stabs’


The following object is masked from ‘package:randomForestSRC’:

    subsample


The following object is masked from ‘package:mlr’:

    subsample



Attaching package: ‘mboost’


The following object is masked from ‘package:tidyr’:

    extract


The following object is masked from ‘package:ggplot2’:

    %+%




Variable importance for survival forests; this feature is _experimental_


Variable importance for survival forests; this feature is _experimental_


Variable importance for survival forests; this feature is _experimental_


Variable importance for survival forests; this feature is _experimental_


Variable importance for survival forests; this feature is _experimental_


## Tmp (scratch experiments, commented out)

In [None]:
#learners_ums = learners[-c(2,3,4,5,6)]
#task = makeSurvTask(data = df, target = c("survival_time", "event"))
#model= learners_ums[[1]]
#inner= makeResampleDesc("CV", iters=5)
#lrn  = makeFilterWrapper(learner = model, fw.method="univariate.model.score", fw.abs=10, perf.learner=model)
#res  = resample(learner = lrn, task = task, resampling=inner, models=TRUE)

### MRMR

In [None]:
#k=1
#task = makeSurvTask(data = df, target = c("survival_time", "event"))
#model= learners[[k]]
#inner= makeResampleDesc("CV", iters=5)
#lrn  = makeFilterWrapper(learner = model, fw.method="mrmr", fw.abs = 20)
#res  = resample(learner = lrn, task = task, resampling=inner, models=TRUE)

### randomForestSRC_importance

In [None]:
#k=9
#task = makeSurvTask(data = df, target = c("survival_time", "event"))
#model= learners[[k]]
#inner= makeResampleDesc("CV", iters=5)
#lrn  = makeFilterWrapper(learner = model, fw.method="randomForestSRC_importance", fw.abs = 20)
#res  = resample(learner = lrn, task = task, resampling=inner, models=TRUE)

In [None]:
#for (k in 1:length(learners)){
#task = makeSurvTask(data = df, target = c("survival_time", "event"))
#model= learners[[k]]
#inner= makeResampleDesc("CV", iters=5)
#lrn  = makeFilterWrapper(learner = model, fw.method="randomForestSRC_var.select", fw.abs = 20, 
#                         more.args = list("randomForestSRC_var.select"=list(method="md")))
#res  = resample(learner = lrn, task = task, resampling=inner, models=TRUE)
#    }

In [None]:
#for (k in 1:length(learners)){
#task = makeSurvTask(data = df, target = c("survival_time", "event"))
#model= learners[[k]]
#inner= makeResampleDesc("CV", iters=5)
#lrn  = makeFilterWrapper(learner = model, fw.method="randomForestSRC_var.select", fw.abs = 20, 
#                         more.args = list("randomForestSRC_var.select"=list(method="vh")))
#res  = resample(learner = lrn, task = task, resampling=inner, models=TRUE)
#    }

In [None]:
#for (k in c(1,2,5,7,8,9)){
#task = makeSurvTask(data = df, target = c("survival_time", "event"))
#model= learners[[k]]
#inner= makeResampleDesc("CV", iters=5)
#lrn  = makeFilterWrapper(learner = model, fw.method="party_cforest.importance", fw.abs = 20)
#res  = resample(learner = lrn, task = task, resampling=inner, models=TRUE, show.info  = FALSE)
#    }