In [1]:
using EdgeFlip
include("../greedy_policy.jl")

Main.GreedyPolicy

In [2]:
# Sizes later passed to `EdgeFlip.GameEnv`. `maxflips` is 1.2 × `nflips`,
# rounded up to the next integer.
nref = 1
nflips = 8
maxflips = ceil(Int, 1.2 * nflips)

10

In [3]:
# Build the environment from the sizes chosen above; `fixed_reset` and the
# `maxflips` budget are forwarded as keyword arguments.
env = EdgeFlip.GameEnv(nref, nflips; fixed_reset = false, maxflips = maxflips)

GameEnv


## Deploy Greedy Algorithm

In [4]:
# Baseline for later comparison: the value reported by
# `GreedyPolicy.average_normalized_returns` on `env` for `num_trajectories`.
num_trajectories = 5_000
gd_avg = GreedyPolicy.average_normalized_returns(env, num_trajectories)

0.9075987856587855

## Linear policy using vertex template score

In [18]:
using Flux
using Distributions: Categorical



Main.PolicyGradient

In [20]:
include("../global_policy_gradient.jl")

# Observation handed to the policy: the vertex template score of `env`.
PolicyGradient.state(env::EdgeFlip.GameEnv) = EdgeFlip.vertex_template_score(env)

# Forward an action to the EdgeFlip environment; the value returned by
# `EdgeFlip.step!` is passed through unchanged.
PolicyGradient.step!(env::EdgeFlip.GameEnv, action) = EdgeFlip.step!(env, action)

# Episode-termination query, delegated to EdgeFlip.
PolicyGradient.is_terminated(env::EdgeFlip.GameEnv) = EdgeFlip.is_terminated(env)

# Reward query, delegated to EdgeFlip.
PolicyGradient.reward(env::EdgeFlip.GameEnv) = EdgeFlip.reward(env)

# Reset the EdgeFlip environment; the value returned by `EdgeFlip.reset!` is
# passed through unchanged.
PolicyGradient.reset!(env::EdgeFlip.GameEnv) = EdgeFlip.reset!(env)

# Score query, delegated to EdgeFlip.
PolicyGradient.score(env::EdgeFlip.GameEnv) = EdgeFlip.score(env)




In [7]:
"""
    LinearPolicy()
    LinearPolicy(model)

Policy whose output is produced by a single wrapped layer `model`.

`LinearPolicy()` wraps a freshly initialised `Dense(4, 1)` layer. The
one-argument form wraps an existing layer; it is the default positional
constructor, which `Flux.@functor` relies on to rebuild the struct.
"""
struct LinearPolicy{M}
    # Parametric rather than `::Any` so calls through `model` can be specialised
    # on the concrete layer type.
    model::M
end

# Outer (not inner) constructor: an inner constructor would suppress the default
# `LinearPolicy(model)` method and break functor reconstruction.
LinearPolicy() = LinearPolicy(Dense(4, 1))

# Apply the wrapped layer to `s` and flatten the result into a vector.
(p::LinearPolicy)(s) = vec(p.model(s))

Flux.@functor LinearPolicy

In [13]:
# Instantiate the linear policy with its default layer.
policy = LinearPolicy()

LinearPolicy(Dense(4, 1))

In [14]:
# Settings passed to `PolicyGradient.run_training_loop` below.
# Note: `num_trajectories` is rebound here; the greedy-baseline cell used 5000.
learning_rate = 0.1
batch_size = 32
num_epochs = 1_000
num_trajectories = 100

100

In [15]:
# Train `policy` on `env` with the settings chosen above. The call returns two
# values, bound here to `epoch_history` and `return_history`.
epoch_history, return_history = PolicyGradient.run_training_loop(
    env, policy, batch_size, num_epochs, learning_rate, num_trajectories;
    estimate_every = 100,
)

epoch: 100 	 loss: 7.5835 	 avg return: 0.86
epoch: 200 	 loss: 5.2152 	 avg return: 0.91
epoch: 300 	 loss: 1.3215 	 avg return: 0.88
epoch: 400 	 loss: 1.0646 	 avg return: 0.87
epoch: 500 	 loss: 3.5899 	 avg return: 0.92
epoch: 600 	 loss: 5.8473 	 avg return: 0.92
epoch: 700 	 loss: 3.0621 	 avg return: 0.89
epoch: 800 	 loss: 1.4156 	 avg return: 0.90
epoch: 900 	 loss: 0.7322 	 avg return: 0.90
epoch: 1000 	 loss: 4.2037 	 avg return: 0.91


(Any[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], Any[0.8592665945165945, 0.9095813075813075, 0.8828950216450216, 0.8653055555555557, 0.9151074203574204, 0.9189873737373737, 0.8891199078699078, 0.900023088023088, 0.8966511544011544, 0.9088748196248195])

In [22]:
# Evaluate the trained policy with `PolicyGradient.average_normalized_returns`
# using `num_test_trajectories`, for comparison with the greedy baseline.
num_test_trajectories = 1_000
nn_avg = PolicyGradient.average_normalized_returns(env, policy, num_test_trajectories)

0.9020641081141081

## Edge template policy

In [23]:
# Observation for the edge-template policy: a 2-tuple holding the vertex
# template score followed by the edge template of `env`. This replaces the
# earlier single-value `state` method for `EdgeFlip.GameEnv`.
function PolicyGradient.state(env::EdgeFlip.GameEnv)
    return (EdgeFlip.vertex_template_score(env), EdgeFlip.edge_template(env))
end

In [24]:
# Look up one entry of `ep` for every index stored in the template `et`.
# Template entries equal to 0 do not index into `ep`; they are filled with
# `boundary` instead. The result has the same shape as `et`.
function edge_state(ep, et, boundary)
    pick(idx) = idx == 0 ? boundary : ep[idx]
    return [pick(idx) for idx in et]
end

edge_state (generic function with 1 method)

In [None]:
"""
    EdgePolicy()
    EdgePolicy(vertex, edge, boundary)

Policy made of two layers plus a one-element `boundary` array.

`EdgePolicy()` builds `vertex = Dense(4, 1)`, `edge = Dense(5, 1)` and
`boundary = Flux.glorot_uniform(1)`. The three-argument form is the default
positional constructor and wraps existing components.
"""
struct EdgePolicy{V,E,B}
    # Parametric fields instead of untyped (`Any`) ones so the forward pass can
    # be specialised on the concrete layer types.
    vertex::V
    edge::E
    boundary::B
end

# Outer (not inner) constructor so the default positional constructor stays
# available, e.g. for functor-style reconstruction.
# NOTE(review): no `Flux.@functor EdgePolicy` is visible in this part of the
# notebook; on Flux versions that need explicit registration the parameters
# will not be collected without it — confirm it is declared elsewhere.
EdgePolicy() = EdgePolicy(Dense(4, 1), Dense(5, 1), Flux.glorot_uniform(1))

# Forward pass of the edge-template policy.
#
# `state` is unpacked into its first two elements, `(vs, et)`; this matches the
# tuple returned by `PolicyGradient.state` for `EdgeFlip.GameEnv`.
# Steps:
#   1. `p.vertex` maps `vs` to `ep`.
#   2. `edge_state` gathers `ep` through the template `et`, substituting the
#      `p.boundary[1]` value wherever the template holds 0.
#   3. `p.edge` maps the gathered values to logits, returned as a flat vector.
# NOTE(review): presumably `vs` has 4 rows and `et` has 5 rows, matching the
# `Dense(4, 1)` / `Dense(5, 1)` layers built by `EdgePolicy()` — confirm.
function (p::EdgePolicy)(state)
    vs, et = state
    # The field is named `vertex`; the previous `p.vmodel` does not exist on
    # `EdgePolicy` and raised an error on every call.
    ep = p.vertex(vs)
    es = edge_state(ep, et, p.boundary[1])
    return vec(p.edge(es))
end