In [29]:
using CSV, Statistics, DataFrames;

In [None]:
"""
    get_nodes_mean_std(lnr, train_X, train_y; only_leafs = false)

Return `(means, stds)`: for every node of the trained tree `lnr`, the mean and the
standard deviation of the `train_y` values selected by that node's entry in
`IAI.apply_nodes(lnr, train_X)`.

# Arguments
- `lnr`: trained tree learner, passed to `IAI.apply_nodes` (and to `get_leafs`).
- `train_X`: feature table passed to `IAI.apply_nodes`.
- `train_y`: target values; converted to a `Vector` and indexed with each node's entry.

# Keywords
- `only_leafs`: when `true`, only the nodes listed by `get_leafs(lnr)` are kept, so
  position `k` of the result refers to the `k`-th leaf and not to node index `k`.

# Returns
A tuple `(means, stds)` of two `Float64` vectors with one entry per retained node.
`Statistics.std` yields `NaN` for a node holding fewer than two samples.
"""
function get_nodes_mean_std(lnr, train_X, train_y; only_leafs = false)
    nodes_elements = IAI.apply_nodes(lnr, train_X)
    if only_leafs
        nodes_elements = nodes_elements[get_leafs(lnr)]
    end
    # Convert the targets once instead of twice per node inside the loop.
    targets = Vector(train_y)
    means = zeros(length(nodes_elements))
    stds = zeros(length(nodes_elements))
    for i in eachindex(means)
        node_targets = targets[nodes_elements[i]]
        means[i] = mean(node_targets)
        stds[i] = std(node_targets)
    end
    return means, stds
end;

"""
    get_leafs(lnr)

Return the indices of the nodes of `lnr` for which `IAI.is_leaf` is true, in
increasing order.

The node indices run over `1:IAI.get_num_nodes(lnr)`. The result is built by
filtering that range, so it has a concrete integer element type rather than being a
`Vector{Any}`, which keeps it cheap to use as an index vector.
"""
function get_leafs(lnr)
    num_nodes = IAI.get_num_nodes(lnr)
    return filter(i -> IAI.is_leaf(lnr, i), 1:num_nodes)
end;

"""
    get_leaf_pred_mean_std(lnr, new_X, train_X, train_y)

Score `new_X` with the trained tree `lnr` and attach, to every row, the training
mean and standard deviation of the node the row is assigned to.

# Arguments
- `lnr`: trained tree learner used by `IAI.predict` and `IAI.apply`.
- `new_X`: table to score; must contain every column of `train_X` plus a
  `:player_id_p1` column.
- `train_X`, `train_y`: training features and targets, forwarded to
  `get_nodes_mean_std` to compute the per-node statistics.

# Returns
A `DataFrame` with columns `player_id`, `assigned_leaf`, `prediction`, `leaf_mean`
and `leaf_std`, one row per row of `new_X`.
"""
function get_leaf_pred_mean_std(lnr, new_X, train_X, train_y)
    # Statistics are computed for all nodes (not only leaves) so that they can be
    # looked up directly by the node index returned by `IAI.apply`.
    nodes_means, nodes_stds = get_nodes_mean_std(lnr, train_X, train_y)
    # Select the training feature columns once; both IAI calls share the same table.
    features = new_X[:, names(train_X)]
    predictions = IAI.predict(lnr, features)
    assigned_leafs = IAI.apply(lnr, features)
    return DataFrame(
        player_id = new_X[:, :player_id_p1],
        assigned_leaf = assigned_leafs,
        prediction = predictions,
        leaf_mean = nodes_means[assigned_leafs],
        leaf_std = nodes_stds[assigned_leafs]
    )
end;

"""
    run_leaf_pred_mean_std(tree_path, training_data_path, model_input_path, competition_path)

Load a tree and three CSV files from disk, compute the leaf prediction/mean/std
table for the model input, and right-join it onto the competition table.

# Arguments
- `tree_path`: JSON file read with `IAI.read_json`.
- `training_data_path`: CSV whose last column is used as the target and whose other
  columns are used as features.
- `model_input_path`: CSV with the rows to score.
- `competition_path`: CSV joined with the scored rows on `:player_id`.

# Returns
The right join (every scored row is kept) of the competition table with the output
of `get_leaf_pred_mean_std`.
"""
function run_leaf_pred_mean_std(tree_path, training_data_path, model_input_path, competition_path)
    tree = IAI.read_json(tree_path)
    train = CSV.read(training_data_path)
    # Features are every column but the last; the last column is the target.
    features, target = train[:, 1:end-1], train[:, end]
    model_input = CSV.read(model_input_path)
    competition_df = CSV.read(competition_path)
    leaf_stats = get_leaf_pred_mean_std(tree, model_input, features, target)
    return join(competition_df, leaf_stats, on = :player_id, kind = :right)
end;

"""
    run_multiple_leaf_pred_mean_std(tree_path, training_data_path, paths)

Run the leaf prediction/mean/std pipeline for every row of `paths` and write each
result to disk.

# Arguments
- `tree_path`: JSON file read with `IAI.read_json`.
- `training_data_path`: CSV whose last column is used as the target and whose other
  columns are used as features.
- `paths`: table with one row per run and the columns `:model_input_paths`,
  `:competition_paths` and `:output_paths`.

# Returns
The vector of output paths that were written, in row order.
"""
function run_multiple_leaf_pred_mean_std(tree_path, training_data_path, paths)
    # The tree and the training data are shared by every run: load them once.
    lnr = IAI.read_json(tree_path)
    training_data = CSV.read(training_data_path)
    train_X = training_data[:, 1:end-1]
    train_y = training_data[:, end]
    written = String[]
    # Iterate over the rows of `paths`; the original body indexed `paths[i, ...]`
    # without ever defining `i`.
    for i in 1:size(paths, 1)
        model_input_path = paths[i, :model_input_paths]
        competition_path = paths[i, :competition_paths]
        output_path = paths[i, :output_paths]
        new_X = CSV.read(model_input_path)
        competition = CSV.read(competition_path)
        pred_mean_std = get_leaf_pred_mean_std(lnr, new_X, train_X, train_y)
        optimization_input = join(competition, pred_mean_std, on = :player_id, kind = :right)
        CSV.write(output_path, optimization_input)
        push!(written, output_path)
    end
    return written
end;


In [68]:
# Location of the tree JSON file (presumably the learner loaded via `IAI.read_json`
# by the run_* functions — confirm).
TREE_PATH = "../processed/OCTs/OCT.json"
# Location of the training-data CSV (presumably the `training_data_path` argument of
# the run_* functions — confirm).
TRAINING_DATA_PATH = "../processed/OCTs/oct_train_data.csv"

"../data/optimization_input.csv"

In [87]:
# Build one row per file whose name contains "_fake" found under
# ../data/competitions_clean/ (searched recursively). The key is the part of the file
# name before the first underscore; the three path columns are derived from it.
paths = DataFrame(keys = String[], competition_paths = String[], model_input_paths = String[], output_paths = String[])
for entry in walkdir("../data/competitions_clean/")
    # `walkdir` yields (root, dirs, files) tuples; only the file names are needed.
    for filename in entry[3]
        occursin("_fake", filename) || continue
        key = first(split(filename, "_"))
        row = [
            key,
            string("../data/competitions_clean/", key, "_fake.csv"),
            string("../data/model_inputs/", key, "_fake_dup.csv"),
            string("../data/optimization_inputs/", key, "_fake.csv"),
        ]
        push!(paths, row)
    end
end
sort!(paths)

Unnamed: 0_level_0,keys,competition_paths,model_input_paths
Unnamed: 0_level_1,String,String,String
1,2012-03-01,../data/competitions_clean/2012-03-01_fake.csv,../data/model_inputs/2012-03-01_fake_dup.csv
2,2013-02-07,../data/competitions_clean/2013-02-07_fake.csv,../data/model_inputs/2013-02-07_fake_dup.csv
3,2013-03-05,../data/competitions_clean/2013-03-05_fake.csv,../data/model_inputs/2013-03-05_fake_dup.csv
4,2013-10-30,../data/competitions_clean/2013-10-30_fake.csv,../data/model_inputs/2013-10-30_fake_dup.csv
5,2013-11-17,../data/competitions_clean/2013-11-17_fake.csv,../data/model_inputs/2013-11-17_fake_dup.csv
6,2014-11-17,../data/competitions_clean/2014-11-17_fake.csv,../data/model_inputs/2014-11-17_fake_dup.csv


In [None]:
# Use the path constants defined in this notebook (TREE_PATH, TRAINING_DATA_PATH);
# the lowercase names are not defined anywhere at top level.
run_multiple_leaf_pred_mean_std(TREE_PATH, TRAINING_DATA_PATH, paths)