## Loading Packages & Initialization

In [None]:
# Remove every (non-hidden) object from the current workspace so the notebook
# starts from a clean state. Anything defined before this cell is lost.
rm(list=ls())

# data.table provides data.table(), fread(), fwrite(), copy() used throughout.
library(data.table)
library(tidyverse)
# rJava + RNetLogo provide the NLStart / NLLoadModel / NLQuit bridge to NetLogo.
library(rJava)
library(RNetLogo)
# NOTE(review): lhs is not called directly in this notebook; presumably it is
# needed by the sourced helper script -- confirm.
library(lhs)

# warn = 0 is R's default: warnings are collected and shown after the
# top-level call returns.
options(warn = 0)

In [None]:
# Root folder that holds the helper script, the NetLogo model and the data.
folder.path <- "/Users/ecemnaz.yildiz/Documents/Personal/Thesis/"

# Load the project helper functions used by the rest of the notebook.
source(paste0(folder.path, "ThesisSetupCode.r"))

# 1 = run NetLogo headless (no GUI), 0 = open the NetLogo GUI.
Is_Headless <- 1
nl.model <- "info_cascade_update_TDP_JPF_2020" # "Segregation_Dummy"

# Folder that contains the NetLogo jar files.
nl.path <- "/Users/ecemnaz.yildiz/Documents/NetLogo 6.0.4/Java"

model.path <- paste0(folder.path, nl.model, ".nlogo")

# Fail early with a readable message instead of a Java-side error after the
# JVM has already been started.
if (!file.exists(model.path)) {
  stop("NetLogo model file not found: ", model.path, call. = FALSE)
}

if (Is_Headless == 0) {
  # NOTE(review): the GUI instance is started without nl.obj, while the
  # headless instance is registered under the name nl.model -- confirm that
  # the helper functions address the right instance in GUI mode.
  NLStart(nl.path, gui = TRUE, nl.jarname = "netlogo-6.0.4.jar")
  NLLoadModel(model.path)
} else {
  # The headless instance is registered under the model name so later calls
  # can refer to it through nl.obj = nl.model.
  NLStart(nl.path, gui = FALSE, nl.jarname = "netlogo-6.0.4.jar", nl.obj = nl.model)
  NLLoadModel(model.path, nl.obj = nl.model)
}

In [None]:
model.type <- "info_cascade_update" ## ifelse(nl.model == "Segregation", "basic", "dummy")

# Identification of the training set file (size, seed and creation date).
training_set_size <- 500 # 75 #500
training_set_seed <- 9
training_set_date <- "2022-02-13"

# Identification of the test set file (size, seed and creation date).
test_set_size <- 215 # 30 #215
test_set_seed <- 8
test_set_date <- "2022-02-12"

# Folder that holds the input data sets.
data.path <- paste0(folder.path, "Data_", training_set_size, "_Seed", training_set_seed, "/")

# Folder where the outputs of this run are recorded. The clock is read once so
# the date part and the time part of the folder name always belong to the same
# instant (two separate reads can disagree around midnight).
run_started_at <- Sys.time()
output.folder <- paste0(
  "outputs_V3_RFE_mtrymultip2_", model.type, "_",
  format(run_started_at, "%Y-%m-%d"), "_", format(run_started_at, "%H.%M"), "_",
  training_set_size, "_", training_set_seed
)
dir.create(file.path(folder.path, output.folder), showWarnings = FALSE)

# dir.create() only reports failure through its (ignored) return value, so
# verify the folder exists before anything tries to write into it.
if (!dir.exists(file.path(folder.path, output.folder))) {
  stop("Could not create output folder: ", file.path(folder.path, output.folder), call. = FALSE)
}

outputs.path <- paste0(folder.path, output.folder, "/")

# Read Me file that keeps information about the output folder.
ReadMe <- paste0(outputs.path, "ReadMe_", model.type, ".txt")

In [None]:
# Echo the data folder path so it can be checked visually in the notebook.
data.path

## Model Parameters & Functions

### Set model parameters

In [None]:
#### Model Parameters ####

# Number of replications for each instance.
nofrep <- 30

# Feature names, ordered according to their definition order in run_model.
feature_names <- c(
  "max_links",
  "evidence",
  "sc-bel-prop",
  "prop-likelihood",
  "n_init_believers",
  "prior-mean",
  "prior-sd",
  "expertise_influence"
)

# Lower and upper bound of every feature, in the same order as feature_names.
feature_ranges <- data.table(
  feature   = feature_names,
  min_range = c(2, 0, 0, 0, 0, 0, 0, 0),
  max_range = c(500, 100, 5, 1, 100, 1, 1, 1)
)

# Name of the model output.
output_name <- c("cl-prop-same")

# Number of input parameters of the agent-based model.
nofparams <- length(feature_names)

# Random forest settings.
ntree <- 300
# mtry = 2
# Multiplier applied to the default mtry: 1 keeps the default, 2 allows at
# most twice the default.
mtry.multiplier <- 2
nperm <- 5

# Importance value below which a feature becomes an elimination candidate.
feature_importance_threshold <- 10

### Set user parameters

In [None]:
#### User parameters ####

# Error measure reported on the test set ("RMSE"; alternatives: MAPE, BIAS).
error_type <- "RMSE"

# Uncertainty measure used for sample selection ("coefvar"; alternative: 'range').
selection_metric <- "coefvar"
sample.type <- paste0("AdFe_", selection_metric)

# Elimination strategy: "NRFE" or "RFE".
elimination.type <- "NRFE"

# Number of iterations and the meta-replication indices.
iteration_budget <- 11
metarep <- 1:10

# Number of instances.
unlabeled_ins <- 30
test_ins <- 215 # 30 #215 ##c(100,400)
train_ins_oneshot <- 500 # 75 #500
train_ins_Ad <- 500 # 75 #500 ##50

# Number of instances selected in each step.
selected_ins <- 5

# Elimination parameters.
p <- 0.5 # elimination proportion
# h = 1
oob_allowance <- 0.5 # 0.1 #0.01

seed.focus <- 9 ## c(1,2,3,4,5,6,7,8,9,20)

## !!!
unlabeled.type <- "refresh and ElimInducedSampling"

# Strategy: first iteration in which feature elimination may be attempted.
elimination_start_iter <- 5 # 8 #6 #7 #2 #3 #4 #5

log_entry()

## Test Set

In [None]:
#### Test Sets ####
test_set = data.table()
for( t in test_set_size){
    test_set.name= paste0(data.path,"test_set_",model.type,"_",t,"_seed",test_set_seed,"_",test_set_date,".csv")
    test_set_Sub <- fread(test_set.name)  
    
    test_set = rbind(test_set, data.table(size = t, test_set_Sub))
    
    #assign(paste0("test_set_",t),test_set)
}

## Adaptive Training Set

In [None]:
# Initial training data for the adaptive procedure, identified by model type,
# seed, size and creation date of the training set.
adaptive_initial_data <- upload_training_set(
  model.type,
  training_set_seed,
  training_set_size,
  training_set_date
)

### Adaptive & Feature Elimination Train & Test Metamodel

In [None]:
# Names of the sub-folders (inside the output folder) used by this experiment:
# sample-level CSV files, saved models, and the "PL_" folder.
sample.type <- paste0("AdFe_", selection_metric)
sample.folder <- paste0(sample.type, "/")
models.folder <- paste0("models_", sample.type, "/")
PL.folder <- paste0("PL_", sample.type, "/")

# Create the three sub-folders; an already existing folder is not an error.
invisible(lapply(
  c(sample.folder, models.folder, PL.folder),
  function(sub_dir) {
    dir.create(file.path(folder.path, output.folder, sub_dir), showWarnings = FALSE)
  }
))

# Adaptive sampling with feature elimination.
# For every seed and every meta-replication the loop repeatedly
#   1. fits a random forest on the current training set and saves it,
#   2. records OOB error, test accuracy and variable importance,
#   3. selects new candidate points and labels them with run_ABM(),
#   4. from elimination_start_iter on, tentatively removes h features through
#      feature_elimination() and keeps the reduced set if the OOB error of the
#      refitted forest stays below obb_err * (1 + oob_allowance),
#   5. appends the labelled candidates to the training set.
# Per-iteration results are appended to CSV files under outputs.path.
for (i in seed.focus) {
  print(paste0("seed : ",i,"  Adaptive Sampling with Feature Selection section start time : ",Sys.time()))
  for (r in metarep) {
    print(paste0("seed : ", i,"   rep : ", r, "  Adaptive Sampling with Feature Selection section start time : ", Sys.time()))
    set.seed(i + r)

    # Every replication starts again from the initial data of the current seed.
    training_set_Ad <- copy(adaptive_initial_data[seed == i, .SD, .SDcols = -c("seed")])
    train_candidates_table <- data.table()

    columns_left <- feature_names # reset at the beginning of each replication
    total_numof_eliminated_vars <- 0 # reset at the beginning of each replication
    eliminated_columns <- c()

    iteration_history <- data.table(
      "seed" = integer(), "rep" = integer(), "iter_no" = integer(),
      "IsFeatureEliminated" = logical(), "IsDataSelected" = logical(),
      "NumOfEliminated" = integer(), "RankedUpd" = logical()
    )
    iter <- 1
    while (iter <= iteration_budget) {
      print(iter)
      run_log_entry()

      trainx <- training_set_Ad[, .SD, .SDcols = columns_left]
      trainy <- training_set_Ad$output

      run_step_log_entry("Model Training Start.")

      # CHECK. NOTE(review): this reseeds with seed.focus (not i + r) before
      # every fit, so all replications train from the same RNG state, and it
      # passes the whole seed.focus object -- confirm this is intended before
      # configuring more than one seed.
      set.seed(seed.focus)

      # Train the model and save it.
      model_Sub <- randomForest(
        x = trainx, y = trainy, importance = TRUE,
        ntree = ntree, nperm = nperm,
        mtry = mtry_default(columns_left) * mtry.multiplier
      )
      model_Sub.name <- paste0("model_",sample.type,"_", iter, "_seed_", i, "_rep_",r)
      model_Sub.path <- paste0(outputs.path,models.folder, paste0(model_Sub.name,"_size_",train_ins_Ad, ".rds"))
      saveRDS(model_Sub, model_Sub.path)

      # New history row for this iteration; flags start at 0 and are set below.
      iteration_history <- rbind(iteration_history, data.table(i, r, iter, 0, 0, 0, 0), use.names = FALSE)

      # Update the feature ranking: always for "RFE"; for "NRFE" only while no
      # feature has been eliminated yet. Scalar conditions use || and &&.
      if (elimination.type == "RFE" ||
          (elimination.type == "NRFE" && length(columns_left) == length(feature_names))) {
        ranked_features <- get_variable_importance(model_Sub)
        iteration_history[iter]$RankedUpd <- 1
      }

      # Write errors.
      obb_err <- obb_error_func(model_Sub)
      fwrite(
        data.table(iter, obb_error = obb_err, seed = i, rep = r),
        paste0(outputs.path,sample.folder,model.type,"_","obb_error_",sample.type,".csv"),
        append = TRUE
      )

      write_test_accuracy(i, r, iter, model_Sub, test_set, error_type)
      write_importance.rf(i, r, iter, model_Sub, sample.type) # last one = sample_type

      # The steps below are unnecessary once the budget is reached.
      if (iter != iteration_budget) {

        run_step_log_entry("Sample Selection Start.")

        ### SAMPLE SELECTION ###
        # Candidates are selected before elimination, but they are only added
        # to the training set after the eliminated features are decided.
        unlabeled_set <- refresh_sample_pool(i + r + iter, columns_left)
        train_candidates <- sample_selection(selected_ins, unlabeled_set, model_Sub, selection_metric)

        run_step_log_entry("ABM Run Start.")

        # Run the ABM to find the outputs of the train candidates.
        print(paste0("ABM train_candidate run start time : ",Sys.time()))
        train_candidates <- run_ABM(nofrep, selected_ins, train_candidates)

        run_step_log_entry("ABM Run End.")

        print(paste0("ABM train_candidate run end time : ",Sys.time()))

        fwrite(
          data.table(train_candidates, "iter" = iter, "seed" = i, "rep" = r),
          paste0(outputs.path,sample.folder,model.type,"_train_candidates_table_",sample.type,".csv"),
          append = TRUE
        )
        ### SAMPLE SELECTION ENDS ###

        ### FEATURE ELIMINATION ###
        # (In the first experiments the comparison did not include equality.)
        if (elimination_start_iter <= iter && length(columns_left) >= 2) {
          check_elim <- TRUE
          apply_elim <- FALSE

          ### FEATURE ELIMINATION PART I ###
          # Decide how many features will be eliminated: start with a share p
          # of the remaining columns and shrink it (p^2, p^3, ...) after every
          # rejected attempt.
          elim_check_iter <- 1
          h <- floor(length(columns_left) * (p^elim_check_iter))
          while (check_elim) {

            set.seed(seed.focus) # CHECK (same reseed as for the main model)

            run_step_log_entry("Feature Selection Start.")

            # Assume as if feature(s) will be eliminated.
            feature_elimination_result <- feature_elimination(h, columns_left, ranked_features)
            planned_columns_left <- feature_elimination_result[[1]]

            run_step_log_entry("New Random Forest Model Generation.")

            model_Sub_afterElim <- randomForest(
              x = training_set_Ad[, .SD, .SDcols = planned_columns_left],
              y = training_set_Ad$output,
              importance = TRUE, nperm = nperm,
              ntree = ntree,
              mtry = mtry_default(planned_columns_left) * mtry.multiplier
            )
            model_Sub_afterElim.name <- paste0("model_afterElim_",sample.type,"_", iter, "_seed_", i, "_rep_",r,"_h_",h)
            model_Sub_afterElim.path <- paste0(outputs.path,models.folder, paste0(model_Sub_afterElim.name,"_size_",train_ins_Ad, ".rds"))
            saveRDS(model_Sub_afterElim, model_Sub_afterElim.path)

            run_step_log_entry("New Random Forest Model OOB Calculation.")

            new_oob <- obb_error_func(model_Sub_afterElim)

            fwrite(
              data.table(iter, new_oob_error = new_oob, oob_error = obb_err, seed = i, rep = r),
              paste0(outputs.path,sample.folder,model.type,"_","new_oob_error_",sample.type,".csv"),
              append = TRUE
            )

            if (new_oob < (obb_err + obb_err * oob_allowance)) {

              run_step_log_entry("New Random Forest Model Selected.")

              check_elim <- FALSE
              apply_elim <- TRUE

            } else {

              run_step_log_entry("New Random Forest Model is not Selected.")

              elim_check_iter <- elim_check_iter + 1
              h_upd <- floor(length(columns_left) * (p^elim_check_iter))

              # Stop retrying once shrinking no longer changes h.
              # NOTE(review): h can reach 0 before that happens (e.g. 8
              # columns with p = 0.5 give 4, 2, 1, 0) -- confirm that
              # feature_elimination() handles h = 0 as intended.
              if (h_upd == h) {
                check_elim <- FALSE
              }

              h <- h_upd
            }
          }

          ### FEATURE ELIMINATION PART II ###
          # Really eliminate.
          if (apply_elim) {

            run_step_log_entry("Feature Elimination Applied.")

            # Update iteration_history.
            iteration_history[iter]$IsFeatureEliminated <- 1
            iteration_history[iter]$NumOfEliminated <- length(columns_left) - length(planned_columns_left)

            columns_left <- planned_columns_left
            eliminated_columns <- feature_elimination_result[[4]]

            run_step_log_entry("Eliminated Columns Recorded.")
          }
        }
        ### FEATURE ELIMINATION ENDS ###

        # Add labeled candidates to the train data.
        training_set_Ad <- rbind(training_set_Ad, train_candidates[, -c("idx")], use.names = TRUE)
        # Update iteration_history.
        iteration_history[iter]$IsDataSelected <- 1

        run_step_log_entry("Labeled Data Added to Training Set.")
      }

      fwrite(
        iteration_history[iter],
        paste0(outputs.path,sample.folder,model.type,"_iteration_history_",sample.type,".csv"),
        append = TRUE
      )

      run_step_log_entry(paste0("Iteration history Updated. Iteration ", iter, " Ends."))

      iter <- iter + 1
    }

    run_step_log_entry("Final Train Data File Recorded.")

    fwrite(
      data.table(training_set_Ad, "seed" = i, "rep" = r),
      paste0(outputs.path,sample.folder,model.type,"_FinalTrainData_",sample.type,".csv"),
      append = TRUE
    )

    run_step_log_entry("Eliminated Columns File Recorded.")

    # data.table() drops NULL arguments, so a replication without any
    # elimination would append rows that lack the elim_cols column and make
    # the CSV ragged. Write an explicit NA in that case.
    elim_cols_to_write <- if (length(eliminated_columns) > 0) eliminated_columns else NA_character_
    fwrite(
      data.table("seed" = i, "rep" = r, "elim_cols" = elim_cols_to_write),
      paste0(outputs.path,sample.folder,model.type,"_EliminatedColumns_",sample.type,".csv"),
      append = TRUE
    )

    run_step_log_entry(paste0("seed : ",i,"   rep : ", r,"  Adaptive Sampling with Feature Elimination section end time : ",Sys.time()))
  }

  run_step_log_entry(paste0("seed : ",i,"  Adaptive Sampling with Feature Elimination section end time : ",Sys.time()))
}

### Adaptive & Feature Elimination Train & Test Metamodel - simplified

In [None]:
# Sub-folders of the output folder used by the simplified experiment:
# sample-level CSV files, saved models, and the "PL_" folder.
sample.type <- paste0("AdFe_", selection_metric)
sample.folder <- paste0(sample.type, "/")
models.folder <- paste0("models_", sample.type, "/")
PL.folder <- paste0("PL_", sample.type, "/")

# Create them in the same order as they are defined; existing folders are
# silently accepted.
invisible(lapply(
  c(sample.folder, models.folder, PL.folder),
  function(sub_dir) {
    dir.create(file.path(folder.path, output.folder, sub_dir), showWarnings = FALSE)
  }
))

# Adaptive sampling with threshold-based feature elimination (simplified).
# For every seed and every meta-replication the loop repeatedly
#   1. fits a random forest on the current training set and saves it,
#   2. records OOB error, test accuracy and variable importance,
#   3. selects new candidate points and labels them with run_ABM(),
#   4. from elimination_start_iter on, keeps only the features whose
#      importance(model, type = 1, scale = FALSE) value reaches
#      feature_importance_threshold,
#   5. appends the labelled candidates to the training set.
# Per-iteration results are appended to CSV files under outputs.path.
for (i in seed.focus) {
  for (r in metarep) {
    set.seed(i + r)

    # Every replication starts again from the initial data of the current seed.
    training_set_Ad <- copy(adaptive_initial_data[seed == i, .SD, .SDcols = -c("seed")])
    train_candidates_table <- data.table()

    columns_left <- feature_names # reset at the beginning of each replication
    total_numof_eliminated_vars <- 0 # reset at the beginning of each replication
    eliminated_columns <- c()

    iteration_history <- data.table(
      "seed" = integer(), "rep" = integer(), "iter_no" = integer(),
      "IsFeatureEliminated" = logical(), "IsDataSelected" = logical(),
      "NumOfEliminated" = integer(), "RankedUpd" = logical()
    )
    iter <- 1
    while (iter <= iteration_budget) {
      run_log_entry()

      trainx <- training_set_Ad[, .SD, .SDcols = columns_left]
      trainy <- training_set_Ad$output

      run_step_log_entry("Model Training Start.")

      # CHECK. NOTE(review): the seed is hard-coded to 9 and is reset before
      # every fit, independent of i and r -- confirm this is intended.
      set.seed(9)

      # Train the model and save it.
      model_Sub <- randomForest(
        x = trainx, y = trainy, importance = TRUE,
        ntree = ntree, nperm = nperm,
        mtry = mtry_default(columns_left) * mtry.multiplier
      )
      model_Sub.name <- paste0("model_",sample.type,"_", iter, "_seed_", i, "_rep_",r)
      model_Sub.path <- paste0(outputs.path,models.folder, paste0(model_Sub.name,"_size_",train_ins_Ad, ".rds"))
      saveRDS(model_Sub, model_Sub.path)

      # New history row for this iteration; flags start at 0 and are set below.
      iteration_history <- rbind(iteration_history, data.table(i, r, iter, 0, 0, 0, 0), use.names = FALSE)

      # Update the feature ranking: always for "RFE"; for "NRFE" only while no
      # feature has been eliminated yet. Scalar conditions use || and &&.
      if (elimination.type == "RFE" ||
          (elimination.type == "NRFE" && length(columns_left) == length(feature_names))) {
        ranked_features <- get_variable_importance(model_Sub)
        iteration_history[iter]$RankedUpd <- 1
      }

      # Write errors.
      obb_err <- obb_error_func(model_Sub)
      fwrite(
        data.table(iter, obb_error = obb_err, seed = i, rep = r),
        paste0(outputs.path,sample.folder,model.type,"_","obb_error_",sample.type,".csv"),
        append = TRUE
      )

      write_test_accuracy(i, r, iter, model_Sub, test_set, error_type)
      write_importance.rf(i, r, iter, model_Sub, sample.type) # last one = sample_type

      # The steps below are unnecessary once the budget is reached.
      if (iter != iteration_budget) {

        run_step_log_entry("Sample Selection Start.")

        ### SAMPLE SELECTION ###
        # Candidates are selected before elimination, but they are only added
        # to the training set after the eliminated features are decided.
        unlabeled_set <- refresh_sample_pool(i + r + iter, columns_left)
        train_candidates <- sample_selection(selected_ins, unlabeled_set, model_Sub, selection_metric)

        run_step_log_entry("ABM Run Start.")

        # Run the ABM to find the outputs of the train candidates.
        train_candidates <- run_ABM(nofrep, selected_ins, train_candidates)

        run_step_log_entry("ABM Run End.")

        fwrite(
          data.table(train_candidates, "iter" = iter, "seed" = i, "rep" = r),
          paste0(outputs.path,sample.folder,model.type,"_train_candidates_table_",sample.type,".csv"),
          append = TRUE
        )
        ### SAMPLE SELECTION ENDS ###

        ### FEATURE ELIMINATION ###
        # (In the first experiments the comparison did not include equality.)
        if (elimination_start_iter <= iter && length(columns_left) >= 2) {
          check_elim <- TRUE
          apply_elim <- FALSE

          ### THRESHOLD ON FEATURE IMPORTANCE ###
          # One-column matrix with one row per feature of the current model.
          feature_importance <- importance(model_Sub, type = 1, scale = FALSE)
          importance_values <- feature_importance[, 1]
          importance_labels <- rownames(feature_importance)

          run_step_log_entry("Elimination Procedure Start.")
          # Record the importance of each feature in the current model.
          run_step_log_entry(paste0(feature_importance))

          # NOTE(review): the importances are unscaled (scale = FALSE), so the
          # meaning of the threshold depends on the scale of the output --
          # confirm the threshold value matches that scale.
          keep_mask <- importance_values >= feature_importance_threshold

          # Log the decision for every feature, in model order.
          for (j in seq_along(importance_values)) {
            run_step_log_entry(paste0("For loop step number: ", j))

            if (keep_mask[j]) {
              run_step_log_entry(paste0("Keep:", importance_labels[j]))
              run_step_log_entry(paste0("Importance: ", importance_values[j]))
            } else {
              run_step_log_entry(paste0("Eliminate: ", importance_labels[j]))
            }
          }

          # Plain character vectors (not rbind-grown matrices), so they can be
          # used directly as column selections and written as a single column.
          planned_columns_left <- importance_labels[keep_mask]
          candidate_eliminated_columns <- importance_labels[!keep_mask]

          # Approve only when something is eliminated AND something is left:
          # an empty feature set cannot be used to train the next model.
          features_total <- nrow(feature_importance)
          features_kept <- length(planned_columns_left)
          if (features_kept > 0 && features_kept < features_total) {
            run_step_log_entry("Approve Elimination.")
            run_step_log_entry(paste0("All Features: ", features_total))
            run_step_log_entry(paste0("Features Left: ", features_kept))
            apply_elim <- TRUE
          } else {
            run_step_log_entry("Reject Elimination.")
            # iter <- iteration_budget
            apply_elim <- FALSE
          }

          ### FEATURE ELIMINATION PART II ###
          # Really eliminate.
          if (apply_elim) {

            run_step_log_entry("Feature Elimination Applied.")

            # Update iteration_history.
            iteration_history[iter]$IsFeatureEliminated <- 1
            iteration_history[iter]$NumOfEliminated <- length(columns_left) - length(planned_columns_left)

            columns_left <- planned_columns_left
            # Accumulate over iterations: earlier eliminations must be kept.
            eliminated_columns <- c(eliminated_columns, candidate_eliminated_columns)

            run_step_log_entry("Eliminated Columns Recorded.")
          }
        }
        ### FEATURE ELIMINATION ENDS ###

        # Add labeled candidates to the train data.
        training_set_Ad <- rbind(training_set_Ad, train_candidates[, -c("idx")], use.names = TRUE)
        # Update iteration_history.
        iteration_history[iter]$IsDataSelected <- 1

        run_step_log_entry("Labeled Data Added to Training Set.")
      }

      fwrite(
        iteration_history[iter],
        paste0(outputs.path,sample.folder,model.type,"_iteration_history_",sample.type,".csv"),
        append = TRUE
      )

      run_step_log_entry(paste0("Iteration history Updated. Iteration ", iter, " Ends."))

      iter <- iter + 1
    }

    run_step_log_entry("Final Train Data File Recorded.")

    fwrite(
      data.table(training_set_Ad, "seed" = i, "rep" = r),
      paste0(outputs.path,sample.folder,model.type,"_FinalTrainData_",sample.type,".csv"),
      append = TRUE
    )

    run_step_log_entry("Eliminated Columns File Recorded.")

    # data.table() drops NULL arguments, so a replication without any
    # elimination would append rows that lack the elim_cols column and make
    # the CSV ragged. Write an explicit NA in that case.
    elim_cols_to_write <- if (length(eliminated_columns) > 0) eliminated_columns else NA_character_
    fwrite(
      data.table("seed" = i, "rep" = r, "elim_cols" = elim_cols_to_write),
      paste0(outputs.path,sample.folder,model.type,"_EliminatedColumns_",sample.type,".csv"),
      append = TRUE
    )

    run_step_log_entry(paste0("seed : ",i,"   rep : ", r,"  Adaptive Sampling with Feature Elimination section end time : ",Sys.time()))
  }

  run_step_log_entry(paste0("seed : ",i,"  Adaptive Sampling with Feature Elimination section end time : ",Sys.time()))
}

## Quit NL

In [None]:
# Shut down the NetLogo instance started at the top of the notebook. The GUI
# instance is started without nl.obj, while the headless instance is
# registered under the name nl.model, so each mode has to quit the instance
# it actually created.
if (Is_Headless == 0) {
  NLQuit()
} else {
  NLQuit(nl.obj = nl.model)
}

In [None]:
# Record the R version and the attached/loaded package versions of this run.
sessionInfo()