# Exploration and Exploitation

In [1]:
using PGFPlots
using Interact
using Reactive
using Distributions
include("helpers.jl")
include("bandits.jl");

## Multi-Armed Bandit Problems

In [2]:
using Random
# Seed the default RNG before anything random happens in this cell.
Random.seed!(2)
arms = 3
# `Bandit` comes from one of the included files; presumably this builds a
# bandit with `arms` arms (payoffs possibly drawn at random) — confirm there.
b = Bandit(arms)
# Last expression of the cell, so its value is what the notebook displays.
banditTrial(b)

3: "map(foldp(input))" = 0 wins out of 0 tries (NaN percent) String 

6: "map(foldp(input-2))" = 0 wins out of 0 tries (NaN percent) String 

9: "map(foldp(input-3))" = 0 wins out of 0 tries (NaN percent) String 

11: "map(input-4)" =  String 

## Bayesian Model Estimation

In [3]:
# Seed the default RNG before anything random happens in this cell.
Random.seed!(3)
arms = 2
# `Bandit` comes from one of the included files; presumably this builds a
# bandit with `arms` arms (payoffs possibly drawn at random) — confirm there.
b = Bandit(arms)
# Last expression of the cell, so its value is what the notebook displays.
banditEstimation(b)

16: "map(foldp(input-5))" = 0 wins out of 0 tries (NaN percent) String 

17: "map(foldp(input-6))" = 0 wins out of 0 tries (NaN percent) String 

18: "map(foldp(input-5), foldp(input-6))" = Axis(PGFPlots.Plots.Plot[Linear(Real[0.0 0.010101 0.020202 0.030303 0.040404 0.0505051 0.0606061 0.0707071 0.0808081 0.0909091 0.10101 0.111111 0.121212 0.131313 0.141414 0.151515 0.161616 0.171717 0.181818 0.191919 0.20202 0.212121 0.222222 0.232323 0.242424 0.252525 0.262626 0.272727 0.282828 0.292929 0.30303 0.313131 0.323232 0.333333 0.343434 0.353535 0.363636 0.373737 0.383838 0.393939 0.40404 0.414141 0.424242 0.434343 0.444444 0.454545 0.464646 0.474747 0.484848 0.494949 0.505051 0.515152 0.525253 0.535354 0.545455 0.555556 0.565657 0.575758 0.585859 0.59596 0.606061 0.616162 0.626263 0.636364 0.646465 0.656566 0.666667 0.676768 0.686869 0.69697 0.707071 0.717172 0.727273 0.737374 0.747475 0.757576 0.767677 0.777778 0.787879 0.79798 0.808081 0.818182 0.828283 0.838384 0.848485 0.858586 0.868687 0.878788 0.888889 0.89899 0.909091 0.919192 0.929293 0.939394 0.949495 0.959596 0.969697 0.979798 0.989899 1.0; 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

20: "map(input-7)" =  String 

## Ad Hoc Exploration Strategies

In [4]:
"""
    EpsGreedy(eps)

Epsilon-greedy bandit policy: with probability `eps` a uniformly random arm is
selected, otherwise the arm with the highest current win probability.

The field is parametric rather than typed as the abstract `Real`, so the stored
value has a concrete type and methods taking an `EpsGreedy` can specialize on it.
"""
struct EpsGreedy{T<:Real} <: BanditPolicy
    eps::T
end
"""
    arm(b::EpsGreedy, s::BanditStatistics)

Return the index of the arm to pull next: a uniformly random arm when a fresh
`rand()` draw falls below `b.eps`, otherwise the `argmax` of
`winProbabilities(s)`.
"""
function arm(b::EpsGreedy, s::BanditStatistics)
    # The exploration coin is always flipped first; the uniform draw over the
    # arms only happens on the exploring branch.
    explore = rand() < b.eps
    explore && return rand(DiscreteUniform(1, numArms(s)))
    return argmax(winProbabilities(s))
end;

In [5]:
"""
    SoftMax(precision)

Softmax bandit policy: an arm is selected with probability proportional to
`exp(precision * winprobability)`.

The field is parametric rather than typed as the abstract `Real`, so the stored
value has a concrete type and methods taking a `SoftMax` can specialize on it.
"""
struct SoftMax{T<:Real} <: BanditPolicy
    precision::T
end
"""
    arm(b::SoftMax, s::BanditStatistics)

Sample the index of the arm to pull next from a categorical distribution whose
weights are proportional to `exp(b.precision * p)` for each entry `p` of
`winProbabilities(s)`.
"""
function arm(b::SoftMax, s::BanditStatistics)
    scores = b.precision .* winProbabilities(s)
    # Shifting every score by the maximum leaves the normalized weights
    # unchanged, but keeps `exp` from overflowing to Inf (and the normalized
    # weights from becoming NaN) when the precision is large.
    weights = exp.(scores .- maximum(scores))
    return rand(Categorical(weights ./ sum(weights)))
end;

In [6]:
"""
    IntervalExploration(alpha)

Interval-exploration bandit policy: the arm whose `alpha` quantile of
`Beta(wins + 1, tries - wins + 1)` is largest gets selected.

The field is parametric rather than typed as the abstract `Real`, so the stored
value has a concrete type and methods taking an `IntervalExploration` can
specialize on it.
"""
struct IntervalExploration{T<:Real} <: BanditPolicy
    alpha::T
end
"""
    arm(b::IntervalExploration, s::BanditStatistics)

Return the index of the arm whose `b.alpha` quantile of
`Beta(numWins + 1, numTries - numWins + 1)` is largest, using the per-arm
counts stored in `s.numWins` and `s.numTries`.
"""
function arm(b::IntervalExploration, s::BanditStatistics)
    # One quantile per arm; wins + 1 and losses + 1 are the Beta shape parameters.
    bounds = [
        quantile(Beta(s.numWins[i] + 1, s.numTries[i] - s.numWins[i] + 1), b.alpha)
        for i in eachindex(s.numWins)
    ]
    return argmax(bounds)
end;

In [7]:
# Simulation settings shared by all three policies.
steps = 50
iterations = 1000
bandit = Bandit(collect(0.1:0.2:1))
# bandit = Bandit(collect(1:-0.2:0.1))

# Values assigned to the policy parameters ahead of the interactive block; the
# loop variables of `@manipulate` below carry the same names.
epsgreedy = 0.1
softmax = 2
interval = 0.05

# Re-run the three simulations and redraw the comparison plot whenever one of
# the interactive parameters changes.
@manipulate for epsgreedy in 0:0.1:1, softmax in 0:2:40, interval in 0.5:0.05:1
    # Divide the simulated per-step totals element-wise by the pull index.
    perPull(policy) =
        simulateAverage(bandit, policy, steps=steps, iterations = iterations) ./ collect(1:steps)
    # All curves share the same line styling and differ only in data and label.
    curve(results, label) =
        Plots.Linear(results, legendentry=label, style="very thick", mark="none")

    # Simulations run in the same order as the curves are listed below.
    epsGreedyResults = perPull(EpsGreedy(epsgreedy))
    softMaxResults = perPull(SoftMax(softmax))
    intervalResults = perPull(IntervalExploration(interval))

    curves = [
        curve(epsGreedyResults, "eps greedy"),
        curve(softMaxResults, "softmax"),
        curve(intervalResults, "interval"),
    ]
    Axis(curves, style="legend pos=south east", ymin=0, ymax=1, xmin=0, xmax=steps,
         xlabel="Pulls", ylabel="Average success")
end