In [40]:
using CSV, Statistics, DataFrames;

In [41]:
"""
    get_nodes_mean_std(lnr, train_X, train_y; only_leafs = false)

Return `(means, stds)`: the mean and standard deviation of `train_y` over the
training rows that `IAI.apply_nodes(lnr, train_X)` associates with each node.

# Arguments
- `lnr`: trained tree learner, forwarded to `IAI.apply_nodes`.
- `train_X`: training features used to populate the nodes.
- `train_y`: training targets; converted with `Vector` and indexed by the
  per-node row indices.

# Keywords
- `only_leafs`: when `true`, keep only the nodes listed by `get_leafs(lnr)`.
  The returned vectors then follow the order of that list instead of being
  indexed by node number.

# Returns
A tuple of two `Vector{Float64}`, one entry per retained node.
"""
function get_nodes_mean_std(lnr, train_X, train_y; only_leafs = false)
    nodes_elements = IAI.apply_nodes(lnr, train_X)
    if only_leafs
        nodes_elements = nodes_elements[get_leafs(lnr)]
    end

    # Materialize the targets once so the loop does not re-copy them per node.
    targets = Vector(train_y)

    nb_nodes = length(nodes_elements)
    means = zeros(nb_nodes)
    stds = zeros(nb_nodes)
    for (i, rows) in enumerate(nodes_elements)
        node_targets = targets[rows]
        means[i] = mean(node_targets)
        stds[i] = std(node_targets)
    end
    return means, stds
end;

"""
    get_leafs(lnr)

Return the indices of the leaf nodes of `lnr` in increasing order.

Every node index in `1:IAI.get_num_nodes(lnr)` is tested with
`IAI.is_leaf(lnr, i)`; the indices that pass are collected into a
`Vector{Int}` so the result can be used directly as an index vector.
"""
function get_leafs(lnr)
    num_nodes = IAI.get_num_nodes(lnr)
    # Typed comprehension: avoids the `Vector{Any}` an untyped `[]` would produce.
    return Int[i for i in 1:num_nodes if IAI.is_leaf(lnr, i)]
end;

"""
    get_leaf_pred_mean_std(lnr, new_X, train_X, train_y)

Build a `DataFrame` with, for every row of `new_X`, the tree prediction and the
training-set mean / standard deviation of the node that row is assigned to.

# Arguments
- `lnr`: trained tree learner, forwarded to `IAI.predict` and `IAI.apply`.
- `new_X`: rows to score; must contain every column named in `train_X` plus a
  `player_id_p1` column.
- `train_X`, `train_y`: training features and targets, used by
  `get_nodes_mean_std` to compute the per-node statistics.

# Returns
A `DataFrame` with columns `player_id`, `assigned_leaf`, `prediction`,
`leaf_mean` and `leaf_std`.
"""
function get_leaf_pred_mean_std(lnr, new_X, train_X, train_y)
    # Statistics are computed for all nodes, so they can be looked up by the
    # node index that `IAI.apply` reports for each row.
    nodes_means, nodes_stds = get_nodes_mean_std(lnr, train_X, train_y)

    # Restrict to the training columns once and reuse the subset for both calls.
    features = new_X[:, names(train_X)]
    predictions = IAI.predict(lnr, features)
    assigned_leafs = IAI.apply(lnr, features)

    return DataFrame(
        player_id = new_X[:, :player_id_p1],
        assigned_leaf = assigned_leafs,
        prediction = predictions,
        leaf_mean = nodes_means[assigned_leafs],
        leaf_std = nodes_stds[assigned_leafs],
    )
end;

"""
    run_leaf_pred_mean_std(tree_path, training_data_path, model_input_path, competition_path)

Load a tree and its training data, score the model-input file, and return the
competition table right-joined on `player_id` with the per-row prediction and
leaf statistics produced by `get_leaf_pred_mean_std`.

The last column of the training file is used as the target and all preceding
columns as features.
"""
function run_leaf_pred_mean_std(tree_path, training_data_path, model_input_path, competition_path)
    tree = IAI.read_json(tree_path)
    train_df = CSV.read(training_data_path)
    model_input = CSV.read(model_input_path)
    competition_df = CSV.read(competition_path)

    # Split the training table: last column is the target, the rest are features.
    features = train_df[:, 1:end-1]
    target = train_df[:, end]

    leaf_stats = get_leaf_pred_mean_std(tree, model_input, features, target)
    return join(competition_df, leaf_stats, on=:player_id, kind=:right)
end;

"""
    run_multiple_leaf_pred_mean_std(tree_path, training_data_path, paths)

Score several model-input files with one tree and write one optimization-input
CSV per file.

The tree and the training data are read once, and the per-node mean / standard
deviation of the training target are computed once; they are then reused for
every row of `paths`.

# Arguments
- `tree_path`: JSON file read with `IAI.read_json`.
- `training_data_path`: CSV whose last column is the target and whose other
  columns are the features.
- `paths`: table indexed as `paths[i, :col]` with columns `model_input_paths`,
  `competition_paths` and `output_paths`.

# Returns
`nothing`; results are written to the files named in `paths[:, :output_paths]`.
Progress messages are printed to standard output.
"""
function run_multiple_leaf_pred_mean_std(tree_path, training_data_path, paths)
    println("Reading Tree ...")
    lnr = IAI.read_json(tree_path)
    println("Reading Training Data ...")
    training_data = CSV.read(training_data_path)
    train_X = training_data[:, 1:end-1]
    train_y = training_data[:, end]
    println("Getting nodes mean and std ...")
    nodes_means, nodes_stds = get_nodes_mean_std(lnr, train_X, train_y)

    # The feature columns are identical for every file, so look them up once.
    feature_names = names(train_X)

    for i in 1:size(paths, 1)
        println(string("Reading files ",i," ..."))
        model_input_path = paths[i, :model_input_paths]
        competition_path = paths[i, :competition_paths]
        output_path = paths[i, :output_paths]

        new_X = CSV.read(model_input_path)
        competition = CSV.read(competition_path)

        println(string("Predicting for file ",i," ..."))
        # Subset once per file and reuse it for both the prediction and the
        # leaf assignment.
        features = new_X[:, feature_names]
        predictions = IAI.predict(lnr, features)
        println(string("Getting node mean and str for file ",i," ..."))
        assigned_leafs = IAI.apply(lnr, features)

        pred_mean_std = DataFrame(
            player_id = new_X[:, :player_id_p1],
            assigned_leaf = assigned_leafs,
            prediction = predictions,
            leaf_mean = nodes_means[assigned_leafs],
            leaf_std = nodes_stds[assigned_leafs],
        )

        optimization_input = join(competition, pred_mean_std, on=:player_id, kind=:right)

        println(string("Outputting file ",i," ..."))
        CSV.write(output_path, optimization_input)
    end
    return nothing
end;

In [42]:
# Build the table of input/output locations: one row per file under the cleaned
# competitions folder whose name contains "_fake". Rows are sorted at the end.
paths = DataFrame(keys = String[], competition_paths = String[], model_input_paths = String[], output_paths = String[])
for (_, _, filenames) in walkdir("../data/competitions_clean/")
    for filename in filenames
        occursin("_fake", filename) || continue
        # The key is the part of the file name before the first underscore.
        key = first(split(filename, "_"))
        push!(paths, [
            key,
            string("../data/competitions_clean/",key,"_fake.csv"),
            string("../data/model_inputs/",key,"_fake_dup.csv"),
            string("../data/optimization_inputs/",key,"_fake.csv"),
        ])
    end
end
sort!(paths);

In [43]:
# Serialized tree (JSON); run_multiple_leaf_pred_mean_std loads it with IAI.read_json.
tree_path = "../processed/OCTs/OCT.json";
# Training data CSV; its last column is used as the target, the others as features.
training_data_path = "../processed/OCTs/oct_train_data.csv";

In [44]:
# Score every model-input file listed in `paths` and write the joined result to
# the matching output path; progress is printed for each file.
run_multiple_leaf_pred_mean_std(tree_path, training_data_path, paths)

Reading Tree ...
Reading Training Data ...
Getting nodes mean and std ...
Reading files 1 ...
Predicting for file 1 ...
Getting node mean and str for file 1 ...
Outputting file 1 ...
Reading files 2 ...
Predicting for file 2 ...
Getting node mean and str for file 2 ...
Outputting file 2 ...
Reading files 3 ...
Predicting for file 3 ...
Getting node mean and str for file 3 ...
Outputting file 3 ...
Reading files 4 ...
Predicting for file 4 ...
Getting node mean and str for file 4 ...
Outputting file 4 ...
Reading files 5 ...
Predicting for file 5 ...
Getting node mean and str for file 5 ...
Outputting file 5 ...
Reading files 6 ...
Predicting for file 6 ...
Getting node mean and str for file 6 ...
Outputting file 6 ...
Reading files 7 ...
Predicting for file 7 ...
Getting node mean and str for file 7 ...
Outputting file 7 ...
Reading files 8 ...
Predicting for file 8 ...
Getting node mean and str for file 8 ...
Outputting file 8 ...
Reading files 9 ...
Predicting for file 9 ...
Getting 