In [1]:
import gym
import torch
import torch.nn as nn
import math
import gym.envs.toy_text.frozen_lake as frozen_lake

In [2]:
default_4x4_map =[ "SFFF", "FHFH", "FFFH", "HFFG" ]
default_8x8_map = [ "SFFFFFFF", "FFFFFFFF", "FFFHFFFF", "FFFFFHFF", "FFFHFFFF", "FHHFFFHF", "FHFFHFHF", "FFFHFFFG" ]

def random_4x4_lake_map():
    return frozen_lake.generate_random_map(size=4, p=torch.rand(1).item())

# First environment

Lets first code an agent with random behaviour to grasp gym 'FrozenLake' environment's api.

In [3]:
# https://gym.openai.com/docs/
env = gym.make('CartPole-v1')
env.reset()
for _ in range(300):
    env.render()
    done = env.step(env.action_space.sample()) # take a random action
    if done : break
env.close()

In [4]:
#parameters
is_slippery = True
lake_map = default_4x4_map
render = True

In [5]:
env = gym.make('FrozenLake-v1', desc=lake_map, is_slippery=is_slippery)

for i_episode in range(5):
    observation = env.reset()
    if render : env.render()
    for t in range(100):
        action = env.action_space.sample() # random action
        observation, reward, done, info = env.step(action)
        if render : env.render()
        if done : break
    print("Episode {} finished after {} timesteps and got {} reward".format(i_episode+1, t+1, reward))        
env.close()

Episode 1 finished after 3 timesteps and got 0.0 reward
Episode 2 finished after 6 timesteps and got 0.0 reward
Episode 3 finished after 5 timesteps and got 0.0 reward
Episode 4 finished after 16 timesteps and got 1.0 reward
Episode 5 finished after 7 timesteps and got 0.0 reward


# Value update / Bellman update

Based on [the first lecture of the deep RL bootcamp](https://www.youtube.com/watch?v=qaMdN6LS9rA) I wanna find the cost of every tile of the map


### Compute qval grid

In [6]:
# my FrozenLake implementation of the bellman update
def fl_bellman_update(lake_map, is_slippery, discount, iterations) :
    shape = (len(lake_map), len(lake_map[0]), 4)
    lake_reward_map = torch.zeros(shape)
    exit_coord = []
    for i in range(shape[0]):
        for j in range(shape[1]):
            if lake_map[i][j] == 'H' :
                lake_reward_map[i, j, :] = -1
                exit_coord.append((i,j))
            if lake_map[i][j] == 'G' :
                lake_reward_map[i, j, :] = 1
                exit_coord.append((i,j))

    for k in range(iterations):
        prev_lake_reward_map = lake_reward_map.clone()
        for i in range(shape[0]):
            for j in range(shape[1]):
                if not any((i,j) == coord for coord in exit_coord):

                    left = prev_lake_reward_map[i, [j-1 if j-1>=0 else j], :].max()
                    down = prev_lake_reward_map[[i+1 if i+1<shape[0] else i], j, :].max()
                    right = prev_lake_reward_map[i, [j+1 if j+1<shape[1] else j], :].max()
                    up = prev_lake_reward_map[[i-1 if i-1>=0 else i], j, :].max()

                    if is_slippery :
                        lake_reward_map[i, j, 0] = discount * (left + down + up) / 3
                        lake_reward_map[i, j, 1] = discount * (down + left + right) / 3 
                        lake_reward_map[i, j, 2] = discount * (right + down + up) / 3
                        lake_reward_map[i, j, 3] = discount * (up + left + right) / 3

                    else :
                        lake_reward_map[i, j, 0] = discount * left
                        lake_reward_map[i, j, 1] = discount * down
                        lake_reward_map[i, j, 2] = discount * right
                        lake_reward_map[i, j, 3] = discount * up
                        
    return lake_reward_map

In [7]:
iterations = 100
is_slippery = True
discount = 0.95
lake_map = default_4x4_map

lake_reward_map = fl_bellman_update(lake_map, is_slippery, discount, iterations)
print(lake_reward_map)

tensor([[[ 0.1301,  0.1186,  0.1186,  0.1121],
         [-0.2458, -0.2525, -0.2640,  0.0938],
         [-0.0178, -0.0209, -0.0277,  0.0724],
         [-0.2739, -0.2739, -0.2771,  0.0625]],

        [[ 0.1507, -0.2072, -0.2137, -0.2278],
         [-1.0000, -1.0000, -1.0000, -1.0000],
         [-0.2223, -0.5619, -0.2223, -0.6104],
         [-1.0000, -1.0000, -1.0000, -1.0000]],

        [[-0.2072, -0.1693, -0.1834,  0.1951],
         [-0.1178,  0.2703, -0.1082, -0.1835],
         [ 0.2255, -0.0208, -0.1768, -0.3015],
         [-1.0000, -1.0000, -1.0000, -1.0000]],

        [[-1.0000, -1.0000, -1.0000, -1.0000],
         [-0.0940,  0.0307,  0.4330, -0.0208],
         [ 0.4188,  0.6641,  0.5983,  0.5252],
         [ 1.0000,  1.0000,  1.0000,  1.0000]]])


### Make simple agent folowing max qval

In [8]:
def fl_get_action(lake_reward_map, observation):
    row_len = lake_reward_map.size()[0]
    x = math.trunc(observation / row_len)
    y = observation % row_len
    
    return lake_reward_map[x, y, :].argmax().item()

In [9]:
env = gym.make('FrozenLake-v1', desc=lake_map, is_slippery=is_slippery)
render = True

for i_episode in range(5):
    observation = env.reset()
    if render : env.render()
    for t in range(100):
        action = fl_get_action(lake_reward_map, observation)
        observation, reward, done, info = env.step(action)
        if render : env.render()
        if done : break
    print("Episode {} finished after {} timesteps and got {} reward".format(i_episode+1, t+1, reward))        
env.close()

Episode 1 finished after 100 timesteps and got 0.0 reward
Episode 2 finished after 29 timesteps and got 1.0 reward
Episode 3 finished after 60 timesteps and got 1.0 reward
Episode 4 finished after 62 timesteps and got 1.0 reward
Episode 5 finished after 100 timesteps and got 0.0 reward


### Move agent inside a class

In [10]:
class Agent():
    def __init__(self, is_slippery:bool=True, lake_map:torch.tensor=default_4x4_map, discount:float=0.95, iterations:int=None) -> None:
        shape = (len(lake_map), len(lake_map[0]))
        area = shape[0]*shape[1]
        if iterations == None :
            iterations = area * 5
            
        self.lake_reward_map = fl_bellman_update(lake_map, is_slippery, discount, iterations)     

    def get_action(self, state: int):
        return fl_get_action(self.lake_reward_map, state)

In [11]:
render = True
lake_map = default_8x8_map
is_slippery = True

env = gym.make('FrozenLake-v1', desc=lake_map, is_slippery=is_slippery)
agent = Agent(is_slippery=is_slippery, lake_map=lake_map)

for i_episode in range(5):
    observation = env.reset()
    if render : env.render()
    for t in range(1000):
        action = agent.get_action(observation)
        observation, reward, done, info = env.step(action)
        if render : env.render()
        if done : break
    print("Episode {} finished after {} timesteps and got {} reward".format(i_episode+1, t+1, reward))        
env.close()

Episode 1 finished after 50 timesteps and got 1.0 reward
Episode 2 finished after 43 timesteps and got 1.0 reward
Episode 3 finished after 41 timesteps and got 1.0 reward
Episode 4 finished after 88 timesteps and got 1.0 reward
Episode 5 finished after 100 timesteps and got 0.0 reward


# Q-Learning

Now that we've implemented the iterative method to get the Q-value we're gonna implement the exploratory method because the iterative method has one huge downside : it evaluates each and every possible move without any optimisation whereas the q-learning algorithm give us the possibility to explore in a smarter, more efficient manner.

Here are some of my references: [toward data science](https://towardsdatascience.com/q-learning-algorithm-from-explanation-to-implementation-cdbeda2ea187), [playing Atari with DRL](https://arxiv.org/pdf/1312.5602.pdf), [second lecture of the RL bootcamp](https://discord.com/channels/@me/966660137215467560/967833369478053988)

In [12]:
# smaller learning_rate_decay_factor means faster decay
# epsylon: 0 => never explore, 1 => random exploration
def fl_qlearn(lake_map, is_slippery, discount, learning_time_steps, learning_rate_decay_factor:float=0, epsylon:float=0.001) :
    shape = (len(lake_map), len(lake_map[0]), 4)
    lake_reward_map = torch.zeros(shape)
    exit_coord = []
    start_coord = (0,0)
    for i in range(shape[0]):
        for j in range(shape[1]):
            if lake_map[i][j] == 'H' :
                lake_reward_map[i, j, :] = -1
                exit_coord.append((i,j))
            if lake_map[i][j] == 'G' :
                lake_reward_map[i, j, :] = 1
                exit_coord.append((i,j))
            if lake_map[i][j] == 'S' :
                start_coord = (i,j)
                
    runner_coord = start_coord
    for t in range(learning_time_steps):
        action = epsylon_greedy(lake_reward_map[runner_coord[0], runner_coord[1]], epsylon)
        new_runner_coord = move(runner_coord, action, shape, is_slippery)
        
        learning_rate = math.e ** (-t*learning_rate_decay_factor/learning_time_steps)
        lake_reward_map[runner_coord[0], runner_coord[1], action] = (
            lake_reward_map[runner_coord[0], runner_coord[1], action]
            + (learning_rate
               * (discount * lake_reward_map[new_runner_coord[0], new_runner_coord[1], :].max()
                  - lake_reward_map[runner_coord[0], runner_coord[1], action]
                 )
              )
        )
        
        if any(new_runner_coord == coord for coord in exit_coord) :
            runner_coord = start_coord
        else :
            runner_coord = new_runner_coord
            
    return lake_reward_map         
    
def move(coord, action, shape, is_slippery):
    action_before_noise = action
    if is_slippery :
        noise = math.trunc(torch.rand(1).item() * 3)
        if noise == 0 :
            action = (3 if action-1<0 else action-1)
        elif noise == 1 :
            action = (0 if action+1>3 else action+1)
        
    i, j = coord[0], coord[1]
    if action == 0 :
        j = (j-1 if j-1>=0 else j)
    elif action == 1 :
        i = i+1 if i+1<shape[0] else i
    elif action == 2 :
        j = j+1 if j+1<shape[1] else j
    else :
        i = i-1 if i-1>=0 else i
        
    return i,j

In [13]:
# exploration actor strategies

def rand_action():
    return math.trunc(torch.rand(1).item() * 4)
    
def optimal_strategy(actions_reward_map):
    return actions_reward_map.argmax().item()

def epsylon_greedy(actions_reward_map, epsylon):
    if actions_reward_map.max() <= 0 or torch.rand(1).item() < epsylon :
        return rand_action()
    else :
        return optimal_strategy(actions_reward_map)

# The highest the curently evaluated future reward, the more likely it is to folow the strategy
def heat_seeking(actions_reward_map, epsylon):
    heat_map = (math.sqrt(1/epsylon) ** actions_reward_map)
    heat = math.trunc(torch.rand(1).item() * heat_map.sum().item())
    cumulative_heat = 0
    for i,h in enumerate(heat_map) :
        cumulative_heat += h
        if heat <= cumulative_heat :
            return i
        
def ibrid_strategy(actions_reward_map, epsylon):
    # should be better tuned
    if actions_reward_map.max() <= 0 or torch.rand(1).item() < epsylon :
        return heat_seeking(actions_reward_map, epsylon)
    else :
        return optimal_strategy(actions_reward_map)

In [14]:
class Agent():
    def __init__(self, is_slippery:bool=True, lake_map:torch.tensor=default_4x4_map, discount:float=0.95, learning_time_steps:int=None, learning_rate:float=0.3) -> None:
        shape = (len(lake_map), len(lake_map[0]))
        area = shape[0]*shape[1]
        if learning_time_steps == None :
            learning_time_steps = area * 10000
            
        self.lake_reward_map = fl_qlearn(lake_map, is_slippery, discount, learning_time_steps, learning_rate) 
        print(self.lake_reward_map)

    def get_action(self, state: int):
        return fl_get_action(self.lake_reward_map, state)

In [15]:
render = True
lake_map = default_4x4_map
is_slippery = True

env = gym.make('FrozenLake-v1', desc=lake_map, is_slippery=is_slippery)
agent = Agent(is_slippery=is_slippery, lake_map=lake_map)

for i_episode in range(5):
    observation = env.reset()
    if render : env.render()
    for t in range(100):
        action = agent.get_action(observation)
        observation, reward, done, info = env.step(action)
        if render : env.render()
        if done : break
    print("Episode {} finished after {} timesteps and got {} reward".format(i_episode+1, t+1, reward))        
env.close()

tensor([[[-1.7516e-04, -4.7012e-09, -1.7402e-04,  1.2612e-44],
         [-3.6753e-02, -1.1112e-03, -2.3763e-01,  1.2612e-44],
         [-6.6676e-03, -2.5718e-04, -1.8007e-05,  1.2612e-44],
         [-1.3068e-03, -1.2220e-03, -9.4600e-01,  1.2612e-44]],

        [[-2.4530e-04, -1.5336e-01, -1.9697e-03, -3.0712e-02],
         [-1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00],
         [-1.9499e-03, -9.2210e-01, -9.4995e-01, -9.4904e-01],
         [-1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00]],

        [[-7.7792e-03, -8.8595e-01, -9.4248e-01,  3.1902e-02],
         [-9.4773e-01, -3.7809e-04, -9.3722e-01, -6.9804e-01],
         [-1.2293e-03,  6.7195e-01, -9.4298e-01, -1.4618e-03],
         [-1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00]],

        [[-1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00],
         [ 4.9487e-03,  0.0000e+00,  6.8145e-01,  0.0000e+00],
         [ 0.0000e+00,  7.7011e-01,  0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00,  1.0000e+00, 

Its not converging, something is wrong.

## Second try

I had some issues with my previous implementation of the q-learning algorithm. When looking at other peoples implementations I realized that mine could be significantly improved in terms of readability and  structure. So I set out to make an other version.

This code is pritty inspired by Juliette's.

In [16]:
#parameters
is_slippery = True
lake_map = default_4x4_map

env = gym.make("FrozenLake-v1", desc=lake_map, is_slippery=is_slippery)
env.reset()
#env.render()

discount = 0.9
rewards = 0
thousand_runs = 10
qtable = torch.zeros((env.observation_space.n, env.action_space.n))

for t in range(thousand_runs * 1000 + 1):
    state = env.reset()
    done = False
    
    if t % 1000 == 0 :
        print('Average reward after {} training runs : {}'.format(t, rewards/1000))
        #print(qtable)
        #print('\n------------------------------------------------------------------\n')
        rewards = 0

    while not done:
        action = epsylon_greedy(qtable[state], 0.005)
        #action = heat_seeking(qtable[state], 0.000000000001)
        learning_rate = math.e ** (-t/10000)

        new_state, reward, done, info = env.step(action)

        qtable[state, action] = qtable[state, action] + learning_rate * (reward + discount * torch.max(qtable[new_state]) - qtable[state, action])
        state = new_state
        rewards += reward

        #env.render()
        
rewards = 0
for t in range(1000):
    state = env.reset()
    done = False
    
    while not done:
        action = optimal_strategy(qtable[state])
        
        new_state, reward, done, info = env.step(action)
        
        state = new_state
        rewards += reward

        #env.render()
        
print('Average reward after 1000 runs with optimal strategy : {}'.format(rewards/1000))

Average reward after 0 training runs : 0.0
Average reward after 1000 training runs : 0.063
Average reward after 2000 training runs : 0.091
Average reward after 3000 training runs : 0.428
Average reward after 4000 training runs : 0.488
Average reward after 5000 training runs : 0.413
Average reward after 6000 training runs : 0.492
Average reward after 7000 training runs : 0.425
Average reward after 8000 training runs : 0.39
Average reward after 9000 training runs : 0.477
Average reward after 10000 training runs : 0.457
Average reward after 1000 runs with optimal strategy : 0.421


The heat-seeking training policy that I implemented seem less promissing than the classic epsylon-greedy strategy because it is harder to tune the exploration vs optimal policy dilema. Though I think that that type of heat-seeking algorithm could help explore in a smarter way than epsylon-greedy but I bellive the maximum improvement to be limited.


## Experience replay

In [this paper](https://arxiv.org/pdf/1312.5602.pdf) (section 4.0) they describe a technique called *experience replay*. I would like to try implementing it here.

In [17]:
#parameters
is_slippery = True
lake_map = default_4x4_map

env = gym.make("FrozenLake-v1", desc=lake_map, is_slippery=is_slippery)
env.reset()

discount = 0.9
epsylon = 0.001
rewards = 0
thousand_runs = 10
replay_memory = []
replay_memory_max_size = 2000
qtable = torch.zeros((env.observation_space.n, env.action_space.n))

for t in range(thousand_runs * 1000 + 1):
    state = env.reset()
    done = False
    
    if t % 1000 == 0 :
        print('Average reward after {} training runs : {}'.format(t, rewards/1000))
        #print(qtable)
        #print('\n------------------------------------------------------------------\n')
        rewards = 0

    while not done:
        action = epsylon_greedy(qtable[state], epsylon)
        learning_rate = math.e ** (-t/10000)

        new_state, reward, done, info = env.step(action)
        optimal_next_step = torch.max(qtable[new_state])
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + discount * optimal_next_step - qtable[state, action])

        # if the step teaches something to the model then add it to replay memory
        if abs(reward + discount * optimal_next_step) > 0.1 :
            replay_memory.append((state,action,reward,new_state))
        
        state = new_state
        rewards += reward
        
    replay_memory_size = len(replay_memory)
    
    # we keep the memory size at max popping the last added elements
    d = replay_memory_size - replay_memory_max_size
    while d > 0 :
        replay_memory.pop(0)
        d -= 1
            
    # we pick random replays and train the model
    minibatch_size = math.trunc(replay_memory_size/20)
    random_indices = map(math.trunc, torch.rand(minibatch_size).tolist() * replay_memory_max_size)
    replays = [v for i, v in enumerate(replay_memory) if i in random_indices]
    for _, (s,a,r,ns) in enumerate(replays) :
        qtable[s, a] = qtable[s, a] + learning_rate * (r + discount * torch.max(qtable[ns]) - qtable[s, a])

        
rewards = 0
for t in range(1000):
    state = env.reset()
    done = False
    
    while not done:
        action = optimal_strategy(qtable[state])
        
        new_state, reward, done, info = env.step(action)
        
        state = new_state
        rewards += reward
        
print('Average reward after 1000 runs with optimal strategy : {}'.format(rewards/1000))

Average reward after 0 training runs : 0.0
Average reward after 1000 training runs : 0.028
Average reward after 2000 training runs : 0.031
Average reward after 3000 training runs : 0.179
Average reward after 4000 training runs : 0.301
Average reward after 5000 training runs : 0.518
Average reward after 6000 training runs : 0.488
Average reward after 7000 training runs : 0.567
Average reward after 8000 training runs : 0.581
Average reward after 9000 training runs : 0.543
Average reward after 10000 training runs : 0.542
Average reward after 1000 runs with optimal strategy : 0.628


Afterwards I realised that I misunderstood experiance-replay. It isn't used to be more efficient in discovering the Q-Table but to train a DNN that will aproximate the function that finds the q-value. The huge benefit is that we will be able to generalize this DNN even for any action space.

# Deep Q-Learning

Now that I get how Q-learning works lets get into DQN. My main resource here is [this toward data science article](https://towardsdatascience.com/deep-q-networks-theory-and-implementation-37543f60dd67).

In [32]:
# model for 4x4 grids
class ModelV0(nn.Module):
    def __init__(self):
        super(ModelV0, self).__init__()
        self.cnn = nn.Sequential(
             nn.Linear(17, 128),
             nn.ReLU(),
             nn.Linear(128, 32),
             nn.ReLU(),
             nn.Linear(32, 4)
        )
        
    def forward(self, state, lake_tensor): 
        x = torch.cat((torch.as_tensor([state]), lake_tensor))
        x = self.cnn(x)  
        return x

In [41]:
def lake_map_to_tensor(lake_map):
    lake_map = ''.join(lake_map)
    mytensor = torch.zeros(len(lake_map), requires_grad=False)
    for i,v in enumerate(lake_map):
        if v == 'H' :
            mytensor[i] = -1
        if v == 'G' :
            mytensor[i] = 1
    return mytensor

In [42]:
#parameters
is_slippery = True
discount = 0.9
epsylon = 0.01
thousand_runs = 10
replay_memory = []
replay_memory_max_size = 2000

device = "cuda" if torch.cuda.is_available() else "cpu"
model = ModelV0().to(device)
loss_fn = nn.MSELoss()


### TRAINING ###

rewards = 0
for t in range(thousand_runs * 1000 + 1):
    lake_map = random_4x4_lake_map()
    env = gym.make("FrozenLake-v1", desc=lake_map, is_slippery=is_slippery)

    if t % 1000 == 0 :
        print('Average reward after {} training runs : {}'.format(t, rewards/1000))
        rewards = 0

    state = env.reset()
    map_tensor = lake_map_to_tensor(lake_map)
    done = False
    while not done:
        actions_weights = model(state, map_tensor)
        action = rand_action()
        if not (actions_weights.max() <= 0 or torch.rand(1).item() < epsylon) :
            action = actions_weights.argmax().item()   
        action_weight = actions_weights[action].item()
        new_state, reward, done, info = env.step(action)
        next_step_best_action_weight = reward + discount * model(new_state, map_tensor).max()
        
        if abs(discount * next_step_best_action_weight + reward) > 0.1 :
            replay_memory.append((action,actions_weights,next_step_best_action_weight))

        state = new_state
        rewards += reward

        
    replay_memory_size = len(replay_memory)

    # we keep the memory size at max popping the last added elements
    d = replay_memory_size - replay_memory_max_size
    while d > 0 :
        replay_memory.pop(0)
        d -= 1

    # we pick random replays and train the model
    learning_rate = math.e ** (-t/(thousand_runs * 1000))
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    minibatch_size = math.trunc(replay_memory_size/20)
    random_indices = map(math.trunc, torch.rand(minibatch_size).tolist() * replay_memory_max_size)
    replays = [v for i, v in enumerate(replay_memory) if i in random_indices]
    for _, (a,aw,nv) in enumerate(replays) :
        naw = aw.clone()
        naw[a] = nv
        loss = loss_fn(aw, naw)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

                                
### TESTING ###
        
rewards = 0
for t in range(1000):
    lake_map = random_4x4_lake_map()
    env = gym.make("FrozenLake-v1", desc=lake_map, is_slippery=is_slippery)
    
    map_tensor = lake_map_to_tensor(lake_map)
    state = env.reset()
    done = False
    while not done:
        action = model(state, lake_map)
        new_state, reward, done, info = env.step(action)
        state = new_state
        rewards += reward
        
                                
env.close()
print('Average reward after 1000 runs with optimal strategy : {}'.format(rewards/1000))

Average reward after 0 training runs : 0.0


KeyboardInterrupt: 

## Rework

We reviewed the code above with [ezalos](https://github.com/ezalos) and he made it significantly better by moving everithing into a trainer class.

In [5]:
from dqn_trainer import MyTrainer

Now we can use it to itterate our model while saving the results :

In [9]:
class Model00(nn.Module):
    def __init__(self):
        super(Model00, self).__init__()
        self.dense = nn.Sequential(
             nn.Linear(17, 128),
             nn.ReLU(),
             nn.Linear(128, 32),
             nn.ReLU(),
             nn.Linear(32, 4)
        )

    def forward(self, state, lake_tensor):
        x = torch.cat((torch.as_tensor([state]), lake_tensor))
        x = self.dense(x)
        return x
    
model = Model00()
param_dict = {
    "is_slippery" : True,
    "randomize_lake_map" : False,
    "discount" : 0.9,
    "epsylon" : 0.01,
    "hundred_runs" : 10,
    "replay_memory_max_size" : 2000,
    "replay_regularity" : 20,
    "model" : model,
    "loss_fn" : nn.MSELoss(),
    "minibatch_size" : 32,
    "output_folder" : model.__class__.__name__
}
MyTrainer(**param_dict).run()

Average reward after 100 training runs : 0.03
Average reward after 200 training runs : 0.04
Average reward after 300 training runs : 0.05
Average reward after 400 training runs : 0.02
Average reward after 500 training runs : 0.05
Average reward after 600 training runs : 0.0
Average reward after 700 training runs : 0.03
Average reward after 800 training runs : 0.0
Average reward after 900 training runs : 0.03
Average reward after 1000 training runs : 0.03
Average reward after 1000 runs with optimal strategy : 0.0
