In [None]:
# Solve the grid-world MDP with value iteration

using POMDPs
using POMDPModels
using POMDPTools

using CSV
using DataFrames
using Random

using DiscreteValueIteration
using Plots

display("Begin MDP Value Iteration")

# Load the hand-drawn map from CSV and pull out the three columns used below.
df = DataFrame(CSV.File("handdrawn_MDPSubterranean.csv"))
df_Matrix = Matrix(df)
state_row = df_Matrix[:, 3]   # first GWPos coordinate  (CSV column 3)
state_col = df_Matrix[:, 2]   # second GWPos coordinate (CSV column 2)
reward_Vec = df_Matrix[:, 6]  # reward attached to that cell (CSV column 6)

# Build the reward lookup in a single pass over the rows.
# The key/value types are spelled out so the Dict does not inherit its value type from
# whatever the first CSV row happens to contain (an Int first reward followed by a
# fractional one would otherwise raise InexactError), and so an empty CSV produces an
# empty Dict instead of a BoundsError on index 1.
# Duplicate positions behave as before: the last row for a position wins.
reward_function = Dict{GWPos,Float64}(
    GWPos(row, col) => rew for (row, col, rew) in zip(state_row, state_col, reward_Vec)
)

# Terminal cell(s). Only the keys are consumed when the MDP is built
# (`Set(keys(terminate))`); the value 100 is not read by the code visible in this file.
terminate = Dict(GWPos(5, 3) => 100)


# Grid-world model: 10x10 board, per-cell rewards taken from `reward_function`,
# and episodes ending at the cells that appear as keys of `terminate`.
mdp = SimpleGridWorld(;
    size = (10, 10),
    rewards = reward_function,
    terminate_from = Set(keys(terminate)),
    tprob = 0.9,
    discount = 0.9,
)

# Configure the value-iteration solver (iteration cap, residual tolerance, progress
# output) and run it on the model to obtain a policy.
solver = ValueIterationSolver(;
    max_iterations = 100,
    belres = 1e-6,
    verbose = true,
)
policy = solve(solver, mdp)

# policy = RandomPolicy(mdp) # Generate a random Policy

# Draw the grid together with the solved policy.
render(mdp; policy = policy)

In [None]:
# Simulate the Policy at each step

using POMDPs
using POMDPModels
using POMDPTools
using ElectronDisplay
# Simulator that displays the environment while the simulation runs.
ds = DisplaySimulator()

# NOTE(review): presumably makes ElectronDisplay reuse one window for successive
# displays rather than opening a new one each time — confirm against its docs.
ElectronDisplay.CONFIG.single_window = true

# Run the value-iteration policy on the grid world through the display simulator.
POMDPs.simulate(ds, mdp, policy)

In [None]:
# Q Learning

using DeepQLearning
using POMDPs
using Flux
using POMDPModels
using POMDPSimulators
using POMDPTools

# load MDP model from POMDPModels or define your own!
#mdp = SimpleGridWorld();
display("Begin MDP Q Learning")

# Q network (see the Flux.jl documentation for `Chain` / `Dense`).
# Input width is 2 because the grid-world state is a 2-element vector; the output
# layer has one unit per action of `mdp`.
model = Chain(
    Dense(2, 32),
    Dense(32, length(actions(mdp))),
)

# Epsilon-greedy exploration driven by a linear decay schedule from 1.0 to 0.01.
exploration = EpsGreedyPolicy(
    mdp,
    LinearDecaySchedule(; start = 1.0, stop = 0.01, steps = 10000 / 2),
)

# Deep Q-learning solver configuration, then training on the same grid world.
solver_Q = DeepQLearningSolver(;
    qnetwork = model,
    exploration_policy = exploration,
    max_steps = 10000,
    learning_rate = 0.005,
    log_freq = 500,
    recurrence = false,
    double_q = true,
    dueling = true,
    prioritized_replay = true,
)
policy_Q = solve(solver_Q, mdp)

# render(mdp) # Could not get render to work but the rest works

In [None]:
# Collect the optimal value of every grid state under the value-iteration policy

# Optimal value of every grid cell under the value-iteration `policy`, flattened into a
# vector with the first state coordinate varying fastest (the same ordering the original
# nested index loops produced).
# The bounds are read from `mdp.size` rather than hard-coded as 10x10, so the table stays
# correct if the grid dimensions change, and each state is passed as a `GWPos`, the same
# state type used for the reward-lookup keys.
v = [
    value(policy, GWPos(x, y)) for y in 1:mdp.size[2] for x in 1:mdp.size[1]
]

# The original ended with `v = (r = v)`, which is a parenthesized assignment (not a
# NamedTuple): it binds `r` to the same array as `v`. That alias is kept here in case
# later code reads `r`.
r = v