In [None]:

using Random
using CSV
using MLJ
using DataFrames
import DataFramesMeta as DFM
using DelimitedFiles
using MLJScikitLearnInterface
#df = DataFrame(CSV.File(ARGS[1]))
#df = DataFrame("../../features.csv")
using Logging

#logger = ConsoleLogger(stderr, Logging.Debug)
#logger = ConsoleLogger(stderr, Logging.Info)
#logger = ConsoleLogger(stderr, Logging.LogLevel(5000))
#global_logger(logger)


In [None]:
# Location of the feature table, relative to the notebook's working directory.
features_csv = "../../data/features.csv"
# With `header=true`, `readdlm` hands back the data matrix and the header row.
data, header = readdlm(features_csv, ',', header=true)
df = DataFrame(data, vec(header))
# Print every column name on its own line.
foreach(println, names(df))

In [None]:
# Drop the per-resource hand counts and the two point-related columns from `df`
# in place; the `let` keeps the name list out of the global namespace and the
# block still evaluates to the trimmed data frame.
let dropped = [
        :CountHandWood,
        :CountHandBrick,
        :CountHandPasture,
        :CountHandStone,
        :CountHandGrain,
        :HasMostPoints,
        :CountVictoryPoint,
    ]
    select!(df, Not(dropped))
end

In [None]:


# Give the target column a two-class categorical scientific type.
coerce!(df, :WonGame => Multiclass{2})
# NOTE(review): `@transform` receives only the column name and no new-column
# expression; presumably this just yields a copy of `df` — confirm it is needed.
df = DFM.@transform(df, :WonGame)
# Split the rows with a seeded RNG. The first output (kept as `df` and used for
# training below) corresponds to the 0.1 fraction; the rest goes to `df_test`.
# NOTE(review): this trains on 10% and evaluates on 90% — confirm it is intended.
df, df_test = partition(df, 0.1, rng=123)

# Separate the `WonGame` target (y) from the remaining columns (X) in each part.
y, X = unpack(df, ==(:WonGame));
y_test, X_test = unpack(df_test, ==(:WonGame));

In [None]:
"""
    load_tree_model()

Run MLJ's `@load` for `RandomForestClassifier` from the `BetaML` package
(with `verbosity=0`) and return the value that `@load` evaluates to.
"""
load_tree_model() = @load RandomForestClassifier pkg=BetaML verbosity=0

# Fetch the model type, then build an instance. The constructor goes through
# `invokelatest`; presumably because the model code is loaded at run time inside
# `load_tree_model` — confirm.
Tree = load_tree_model()
tree = Base.invokelatest(
    Tree;
    max_depth=6,
    min_gain=0.0,
    min_records=2,
    max_features=0,  # presumably 0 selects the package default — TODO confirm
    splitting_criterion=BetaML.Utils.gini,
)

In [None]:
# Bind the model to the training features and target.
mach = machine(tree, X, y)
# Train the machine; `fit!` is invoked through `invokelatest`, mirroring the
# constructor call (presumably for the same run-time loading reason — confirm).
Base.invokelatest(fit!, mach)


In [None]:
"""
    analyze_acc(mach, X, y)

Print the types of `mach` and `X`, then return the `accuracy` of the modal
predictions of `mach` on `X` measured against `y`.
"""
function analyze_acc(mach, X, y)
    println(typeof(mach), typeof(X))
    modal_predictions = mode.(predict(mach, X))
    return accuracy(modal_predictions, y)
end

# Accuracy on the rows used for fitting versus accuracy on the held-out rows.
acc = analyze_acc(mach, X, y)
test_acc = analyze_acc(mach, X_test, y_test)
println("acc / test_acc: ", acc, " / ", test_acc)

In [None]:
# Predictions of the fitted machine for the held-out feature table; later cells
# take `mode.` and `pdf` of these values.
p = predict(mach, X_test)

In [None]:
# Inspect the first prediction. The original cell ended in a dangling `.`
# (an unfinished field access), which does not parse.
p[1]

In [None]:
# Materialise the predictions with `collect` and look at the first element.
collect(p)[1]

In [None]:
# Evaluate `pdf` of the predictions at the label 0.0.
# NOTE(review): a later cell passes a vector of levels instead; confirm that a
# scalar label is accepted for the whole prediction array.
pdf(p, 0.0)

In [None]:
# Build a two-element categorical vector from the labels 0.0 and 1.0; its levels
# are what a later cell passes to `pdf`.
v = coerce([0.0, 1.0], Multiclass)
levels(v)

In [None]:
# Evaluate `pdf` of the predictions at every level of `v`. `get_confusion` later
# indexes `M` by row and reads its second column.
M = pdf(p, levels(v))
# Show the result side by side with the held-out labels.
hcat(M, y_test)

In [None]:
"""
    get_confusion(M, y_true, thresh = 0.5)

Tally confusion counts for a two-column score matrix.

Row `i` is treated as a positive prediction when `M[i, 2] > thresh` (strictly
greater) and as an actual positive when `y_true[i] == 1.0`. Returns a `Dict`
with the integer counts stored under `:tp`, `:fp`, `:fn` and `:tn`.
"""
function get_confusion(M, y_true, thresh = 0.5)
    counts = Dict(:fp => 0, :fn => 0, :tn => 0, :tp => 0)
    for row in 1:size(M, 1)
        predicted_positive = M[row, 2] > thresh
        actual_positive = y_true[row] == 1.0
        # Pick the cell that matches the (prediction, truth) combination.
        cell = if predicted_positive
            actual_positive ? :tp : :fp
        else
            actual_positive ? :fn : :tn
        end
        counts[cell] += 1
    end
    return counts
end

In [None]:
# Confusion counts for the held-out rows at a threshold of 0.5.
get_confusion(M, y_test, 0.5)

In [None]:
# Query the measure registry for "FScore".
# NOTE(review): the result is not assigned, and it is not the last expression of the cell.
measures("FScore")
m = MulticlassFScore()
# Apply the measure to the modal predictions and the held-out labels.
m(mode.(p), y_test)

In [None]:
# NOTE(review): the value of this slice is neither assigned nor used in this cell.
predict(mach, X_test)[1:3]
# Evaluate the machine with 6-fold cross-validation, using the F-score `m` and
# `BalancedAccuracy(adjusted=true)` as measures.
e = evaluate!(mach, resampling=CV(nfolds=6), measures=[m, BalancedAccuracy(adjusted=true)])

In [None]:
# Show the `measurement` field of the evaluation result.
e.measurement

In [None]:
#r = range(tree, :threshold, lower=0.1, upper=0.9)
# Wrap the forest in a `TunedModel` that evaluates an explicit list of candidate
# models (here just `tree`) with 6-fold cross-validation, scored by `m`.
tuned_tree = TunedModel(
    models=[tree],
    tuning=Explicit(),#RandomSearch(),
    resampling=CV(nfolds=6),
    range = nothing,#r,
    measure=m,
    n=30
)
mach2 = machine(tuned_tree, X, y) |> fit!
optimized_tree = report(mach2).best_model
# Show the selected model itself. The original cell read `.threshold`, a field
# that only the commented-out range experiment refers to; the forest is
# configured with `max_depth`, `min_gain`, `min_records`, `max_features` and
# `splitting_criterion`, so that field access fails.
optimized_tree

In [None]:
# Display the forest model that was configured earlier in the notebook.
tree