In [1]:
using CSV, DataFrames, Statistics;

# Training OCT

## Read data

In [3]:
# Input locations, relative to the notebook's working directory.
COMPETITION_PATH = "../data/competition.csv"
MODEL_INPUT_PATH = "../data/model_input.csv"
TRAINING_DATA_PATH = "../data/all_games_7_players.csv"

# Load the historical games table. `DataFrame(CSV.File(path))` is used instead of the
# sink-less `CSV.read(path)`, which newer CSV.jl releases reject with an error.
historic_data = DataFrame(CSV.File(TRAINING_DATA_PATH))
# Features are every column except the last one; the target is the `fp_p1` column.
# NOTE(review): this assumes `fp_p1` is the last column of the CSV — confirm, otherwise
# the target would remain among the features (it ends in "_p1" and is selected below).
X = historic_data[:, 1:end-1]
y = historic_data[:, :fp_p1]

all_columns = names(X)

# Candidate feature columns. Uncomment the lines below to drop the columns of the
# corresponding players before the feature groups are selected.
exclude_players_cols = all_columns
# exclude_players_cols = all_columns[.!endswith.(string.(all_columns),["p6"])]
# exclude_players_cols = exclude_players_cols[.!endswith.(string.(exclude_players_cols),["p7"])]
# exclude_players_cols = exclude_players_cols[.!endswith.(string.(exclude_players_cols),["p13"])]
# exclude_players_cols = exclude_players_cols[.!endswith.(string.(exclude_players_cols),["p14"])]

# Each mask is computed from `exclude_players_cols` itself (not from `all_columns`), so
# mask and indexed vector always have the same length, even when some player columns
# have been excluded above. `string.` makes this work whether `names` returns Symbols
# or Strings.

# Columns whose name ends in "_p1".
p1_cols = exclude_players_cols[endswith.(string.(exclude_players_cols), "_p1")];

# Columns whose name contains "fp_seas_avg".
fp_avg_cols = exclude_players_cols[occursin.("fp_seas_avg", string.(exclude_players_cols))];

# Columns whose name contains "fp_l5".
fp_l_cols = exclude_players_cols[occursin.("fp_l5", string.(exclude_players_cols))];

# Union of the three groups, without duplicates, in first-seen order.
train_cols = unique(vcat(p1_cols, fp_avg_cols, fp_l_cols));

X = X[:,train_cols];

# Three-way split: first carve out the training set, then divide the remainder into
# validation and test sets. The second proportion is rescaled so that
# `validation_proportion` is expressed relative to the remaining rows.
# NOTE(review): `train_proportion` and `validation_proportion` are not defined in the
# cells shown here — they must already exist in the session.
(train_X, train_y), (test_valid_X, test_valid_y) = IAI.split_data(
    :regression, X, y;
    seed=1, train_proportion=train_proportion
);
(valid_X, valid_y), (test_X, test_y) = IAI.split_data(
    :regression, test_valid_X, test_valid_y;
    seed=1, train_proportion=validation_proportion/(1-train_proportion)
);

# Base learner whose remaining hyper-parameters are tuned by the grid search
default_lnr = IAI.OptimalTreeRegressor(; random_seed=1, criterion=:mse, minbucket=10);

# Hyper-parameter grid: tree depth and complexity parameter
grid = IAI.GridSearch(
    default_lnr;
    max_depth=7:13,
    cp=[0.0001, 0.00001]
);

print(grid)

# Fit every grid point on the training set, using the validation set for selection
IAI.fit!(grid, train_X, train_y, valid_X, valid_y);

# Learner retained by the grid search
lnr = IAI.get_learner(grid);

# Grid-search diagnostics: selected parameters, full results table, variable importance
best_params = IAI.get_best_params(grid)
println(best_params)

grid_results = IAI.get_grid_results(grid)
println(grid_results)

var_importance = IAI.variable_importance(lnr)
println(var_importance)

# Score (criterion = :mse) and mean absolute error on each of the three splits
train_accuracy = IAI.score(lnr, train_X, train_y, criterion=:mse);
train_MAE = mean(abs.(IAI.predict(lnr, train_X) - train_y));

valid_accuracy = IAI.score(lnr, valid_X, valid_y, criterion=:mse);
valid_MAE = mean(abs.(IAI.predict(lnr, valid_X) - valid_y));

test_accuracy = IAI.score(lnr, test_X, test_y, criterion=:mse);
test_MAE = mean(abs.(IAI.predict(lnr, test_X) - test_y));

# One line per metric, in train / valid / test order
for (label, value) in (
        ("Train R2 : ", train_accuracy),
        ("Train MAE : ", train_MAE),
        ("Valid R2 : ", valid_accuracy),
        ("Valid MAE : ", valid_MAE),
        ("Test R2 : ", test_accuracy),
        ("Test MAE : ", test_MAE),
    )
    println(label, value)
end


# Export the fitted tree, both as an HTML file and as JSON
IAI.write_html("../processed/OCTs/all_players_1.html", lnr);
IAI.write_json("../processed/OCTs/all_players_1.json", lnr);

# Disabled: dump the three splits (with the target re-attached) to CSV
# train_X[:, :fp_p1] = convert(Array,train_y)
# valid_X[:, :fp_p1] = convert(Array,valid_y)
# test_X[:, :fp_p1] = convert(Array,test_y)
# CSV.write("../processed/OCTs/train_5_10.csv", train_X)
# CSV.write("../processed/OCTs/valid_5_10.csv", valid_X)
# CSV.write("../processed/OCTs/test_5_10.csv", test_X)

# Reload the learner from the JSON file written above
lnr = IAI.read_json("../processed/OCTs/all_players_1.json")

"../data/all_games_7_players.csv"

Unnamed: 0_level_0,player_id_p1,fgm_seas_avg_p1,fgm_seas_avg_p2,fgm_seas_avg_p3,fgm_seas_avg_p4
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64
1,robinna01,6.06757,4.65672,7.8481,5.54878
2,bryanko01,9.7561,4.84416,4.39726,7.30864
3,flynnjo01,0.0,7.08861,2.42424,6.35714
4,wallara01,4.57576,7.98701,6.31646,2.97826
5,howardw01,7.08861,2.42424,6.35714,6.05063
6,flynnjo01,0.0,9.7,3.90123,0.0
7,wadedw01,10.8101,7.0,3.63889,5.47887
8,westbru01,5.31707,8.93243,5.08333,7.54545
9,jamesle01,9.74074,6.5942,5.46341,3.13115
10,paulch01,8.08974,3.34146,5.28049,0.591549


# Predictions on new data (with variance)

In [4]:
# Given a trained tree `lnr` and the training data it was fitted on, return the mean and
# the standard deviation of the training targets that fall into each node.
#
# Arguments:
#   lnr        - trained learner, passed to `IAI.apply_nodes` (and to `get_leafs`)
#   train_X    - training features, passed to `IAI.apply_nodes`
#   train_y    - training targets; converted with `Vector` and indexed by each node's
#                member indices
#   only_leafs - when true, only the nodes listed by `get_leafs(lnr)` are kept, so the
#                results are ordered like that list instead of being indexed by node id
#
# Returns the tuple `(means, stds)`, two Float64 vectors with one entry per retained node.
# A node with no member gets NaN for both; a node with a single member gets a NaN std.
function get_nodes_mean_std(lnr, train_X, train_y; only_leafs = false)
    # Each element of `nodes_elements` is used below as a set of row indices into train_y.
    nodes_elements = IAI.apply_nodes(lnr, train_X)
    if only_leafs
        nodes_elements = nodes_elements[get_leafs(lnr)]
    end

    # Materialize the targets once, rather than twice per node inside the loop.
    targets = Vector(train_y)

    means = zeros(length(nodes_elements))
    stds = zeros(length(nodes_elements))
    for (i, members) in enumerate(nodes_elements)
        node_targets = targets[members]
        means[i] = mean(node_targets)
        stds[i] = std(node_targets)
    end
    return means, stds
end;

In [5]:
# Return the indices of the leaf nodes of the trained tree `lnr`, in increasing order.
#
# Every index from 1 to `IAI.get_num_nodes(lnr)` is tested with `IAI.is_leaf`. The result
# is a concretely typed `Vector{Int}` (empty when there are no nodes) so that it can be
# used directly as an index vector.
function get_leafs(lnr)
    num_nodes = IAI.get_num_nodes(lnr)
    return Int[i for i in 1:num_nodes if IAI.is_leaf(lnr, i)]
end;

In [6]:
# Build a per-row prediction table for `new_X`.
#
# For each row of `new_X` the table holds the node index returned by `IAI.apply`, the
# value returned by `IAI.predict`, and the mean / standard deviation of the training
# targets (`train_X`, `train_y`) in that node, as computed by `get_nodes_mean_std`.
#
# Returns a DataFrame with columns `assigned_leaf`, `prediciton`, `leaf_mean`, `leaf_std`.
# NOTE(review): `prediciton` is misspelled, but it is the emitted column name; it is
# kept unchanged because consumers of the table may rely on it.
function get_leaf_pred_mean_std(lnr, new_X, train_X, train_y)
    # Statistics for every node of the tree, indexed by node id
    per_node_mean, per_node_std = get_nodes_mean_std(lnr, train_X, train_y)
    predicted = IAI.predict(lnr, new_X)
    leaf_ids = IAI.apply(lnr, new_X)
    return DataFrame(
        assigned_leaf = leaf_ids,
        prediciton = predicted,
        leaf_mean = per_node_mean[leaf_ids],
        leaf_std = per_node_std[leaf_ids]
    )
end;

In [9]:
# Prediction table for the test split; the per-node mean/std are computed from the
# training split (train_X, train_y).
pred_mean_std = get_leaf_pred_mean_std(lnr, test_X, train_X, train_y)

Unnamed: 0_level_0,assigned_leaf,prediciton,leaf_mean,leaf_std
Unnamed: 0_level_1,Int64,Float64,Float64,Float64
1,129,41.5229,41.5229,10.8626
2,7,33.7634,33.7634,11.255
3,37,39.6118,39.6118,11.1969
4,213,69.7067,69.7067,10.2884
5,220,61.0094,61.0094,12.2355
6,37,39.6118,39.6118,11.1969
7,107,45.576,45.576,11.5914
8,72,63.3667,63.3667,16.8405
9,65,45.1506,45.1506,11.5884
10,97,50.1294,50.1294,12.3594


In [58]:
# Save the prediction table (one row per test observation) as a CSV file.
CSV.write("../processed/results.csv", pred_mean_std)

"../processed/results.csv"