In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Expose the parent of the current working directory to the import system.
sys.path.append('/'.join((os.getcwd(), '..')))
import src

# Experiment setup
* Here we basically just load the game and check that it works

In [3]:
from __future__ import print_function 
import matplotlib.pyplot as plt
import numpy as np
import gym
from tqdm import tqdm

%matplotlib inline
%env THEANO_FLAGS="floatX=float32"

env: THEANO_FLAGS="floatX=float32"


In [4]:
# Gym environment id; used for gym.make and for the session pool.
GAME = "FrontoPolarStocks-v0"

# Passed to EnvPool as its third positional argument -- presumably the number of
# parallel game sessions (confirm against agentnet's EnvPool signature).
N_AGENTS = 2
# Number of ticks requested per pool.update call and used as session_length
# when unrolling the agent for training.
SEQ_LENGTH = 13

In [5]:
# Build the environment once to read its action/observation layout and to
# sanity-check that a step can be taken.
env = gym.make(GAME)
env.reset()

# Index -> label lookup for the three per-stock choices.
# The label order is the author's guess -- TODO confirm against the environment.
action_names = np.array(["sell", "pass", "buy"])

# One discrete sub-action per entry of the multi-discrete action space,
# each embedded over 3 choices.
action_shape = (env.action_space.num_discrete_space,)
action_emb_shape = action_shape + (3,)
state_shape = env.observation_space.shape

# Take one step with action index 0 for every sub-action and show the observation.
state, _, _, _ = env.step([0] * action_shape[0])
print(state)

[2017-05-23 02:55:10,990] Making new env: FrontoPolarStocks-v0


[ 259.850006   21.959999   42.860001   29.23       16.99       24.1
   20.110001   37.25       24.459999   12.93       14.26       25.33
    9.66        9.48    ]


# Basic agent setup
Here we define a simple agent that maps game observations to action probabilities and state values using a shallow neural network.


In [6]:
import theano
from theano import tensor as T
import lasagne

# Debug-oriented Theano settings: verbose error reports (these produce the long
# debugprint/storage-map dumps on failure) and the 'fast_compile' optimizer mode.
theano.config.exception_verbosity = 'high'
theano.config.optimizer = 'fast_compile'

In [7]:
from agent.agent import build_agent

In [8]:
# build_agent comes from the project module agent.agent; it returns the agent
# together with two layers that are used below as the action head and the
# state-value head (their exact construction is not visible in this notebook).
agent, action_layer, V_layer = build_agent(action_emb_shape, state_shape)

In [9]:
# Since it's a single lasagne network, one can get its weights, output, etc.
# Here: every trainable parameter reachable from the two output layers.
weights = lasagne.layers.get_all_params((action_layer,V_layer),trainable=True)

# Create and manage a pool of game sessions to play with

* To make training more stable, we keep an entire batch of game sessions, each running independently of the others
* Why several parallel agents help training: http://arxiv.org/pdf/1602.01783v1.pdf
* Alternative approach: store more sessions: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

In [10]:
from agentnet.experiments.openai_gym.pool import EnvPool

# Pool of game sessions of GAME driven by `agent`. max_size=10000 is passed
# through to EnvPool -- presumably the cap on stored sessions (confirm in agentnet).
pool = EnvPool(agent, GAME, N_AGENTS, max_size=10000)

[2017-05-23 02:55:13,116] Making new env: FrontoPolarStocks-v0
[2017-05-23 02:55:13,393] Making new env: FrontoPolarStocks-v0


In [11]:
%%time
# Interact for 7 ticks; of the six values returned by pool.interact only the
# action log and the reward log are kept.
_,action_log,reward_log,_,_,_  = pool.interact(7)


# Translate the integer action indices into their labels for readability.
print(action_names[action_log])
print(reward_log)

[[['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['pass' 'buy' 'pass' 'pass' 'pass' 'pass' 'pass' 'buy' 'buy' 'pass' 'buy'
   'pass' 'pass' 'buy']]

 [['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'pass' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   

In [12]:
# Load the first sessions (this function calls interact and remembers the sessions).
pool.update(SEQ_LENGTH)

# a2c loss

Here we define the objective function for actor-critic (one-step) RL.

* We regularize policy with expected inverse action probabilities (discouraging very small probas) to make objective numerically stable


In [13]:
# Sample a batch of 100 stored sessions (with replacement) from the replay pool.
replay = pool.experience_replay.sample_session_batch(100, replace=True)

# Unroll the agent over the sampled sessions for SEQ_LENGTH ticks; only the last
# returned item -- the pair (policy_seq, V_seq) -- is kept.
_,_,_,_,(policy_seq,V_seq) = agent.get_sessions(
    replay,
    session_length=SEQ_LENGTH,
    experience_replay=True,
)

In [16]:
from agentnet.learning import a2c


# Train via actor-critic (see here - https://www.youtube.com/watch?v=KHZVXao4qXs)

# The author commented out the `assert action.dim == 2` line inside agentnet's
# a2c.get_elementwise_objective so that this call is accepted.
# NOTE(review): the traceback recorded for the training cell shows the graph
# built by this call failing at run time in agentnet's get_values_for_actions:
# the log-policy has shape (100, 13, 42) while the actions have shape
# (100, 13, 14), and the index arrays cannot be broadcast together. The
# objective appears to expect one integer action per (session, tick); the
# multi-discrete actions here need a per-stock treatment -- TODO fix together
# with build_agent.
# Despite its name, `elwise_mse_loss` holds the elementwise a2c objective.
elwise_mse_loss = a2c.get_elementwise_objective(
    policy_seq,
    V_seq[:,:,0],
    replay.actions[0],
    replay.rewards,
    replay.is_alive,
    gamma_or_gammas=0.99)

# Compute the mean over "alive" fragments: total objective divided by the sum of is_alive.
loss = elwise_mse_loss.sum() / replay.is_alive.sum()

In [17]:
# Regularizer: inverse action probabilities summed over the last axis, then
# averaged; it grows when any probability becomes very small.
reg = T.mean((1. / policy_seq).sum(axis=-1))
# NOTE: in-place update of `loss` -- re-running this cell adds the penalty again.
loss += 0.001 * reg

In [18]:
# Compute weight updates: RMSProp on `loss` with respect to all trainable weights.
updates = lasagne.updates.rmsprop(loss, weights, learning_rate=0.001)

In [19]:
# Compile the train function: it takes no explicit inputs, applies `updates`
# on every call and returns the value of `loss`.

train_step = theano.function([], loss, updates=updates)

# Demo run

In [20]:
# Evaluate the still-untrained agent, recording a video of the run under ./records.
untrained_reward = pool.evaluate(save_path="./records",record_video=True)

[2017-05-23 02:55:17,716] Making new env: FrontoPolarStocks-v0
[2017-05-23 02:55:17,949] Clearing 4 monitor files from previous run (because force=True was provided)
[2017-05-23 02:55:17,951] Starting new video recorder writing to /home/manatee/Desktop/inn.prac/Algorithms/src/records/openaigym.video.0.22897.video000000.mp4
[2017-05-23 02:55:23,470] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/manatee/Desktop/inn.prac/Algorithms/src/records')


Episode finished after 313 timesteps with reward=48001.23716385198


In [21]:
import glob
import os

from IPython.display import HTML

# The recorder puts a run-specific id into the file name and the monitor clears
# files from previous runs, so a hard-coded file name goes stale (the name that
# used to be hard-coded here did not match the file the evaluation run wrote).
# Pick the most recently written recording instead.
recorded_videos = sorted(glob.glob("records/openaigym.video.*.mp4"), key=os.path.getmtime)
if not recorded_videos:
    raise IOError("No recorded video found in ./records; "
                  "run pool.evaluate(save_path=\"./records\", record_video=True) first")
video_path = recorded_videos[-1]

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format(video_path))

# Training loop

In [22]:
# Starting epoch
epoch_counter = 1

# Full-game rewards keyed by epoch, seeded with the untrained agent's evaluation result.
rewards = {epoch_counter: untrained_reward}

In [23]:
# Pre-fill the pool: 1000 updates of SEQ_LENGTH ticks each, appended to the
# sessions already stored (append=True).
for i in tqdm(range(1000)):
    pool.update(SEQ_LENGTH, append=True) #, preprocess=bnn.add_vime_reward)

100%|██████████| 1000/1000 [00:06<00:00, 165.23it/s]


In [24]:
# The loop may take eons to finish; consider interrupting early.

# Exponential moving average of the training loss. It is kept under its own
# name so that the symbolic `loss` expression defined earlier is NOT rebound to
# a float: with `loss = 0` here, re-running the regularizer / updates cells
# afterwards would silently build the graph from a plain number.
loss_ma = 0
for i in tqdm(range(1000)):

    # Collect fresh experience: 10 pool updates of SEQ_LENGTH ticks each.
    # (`_` is used for the inner counters so the outer index is not shadowed.)
    for _ in range(10):
        pool.update(SEQ_LENGTH, append=True) #, preprocess=bnn.add_vime_reward)

    # Train: 10 gradient steps, tracking the smoothed loss.
    for _ in range(10):
        new_loss = train_step()
        loss_ma = loss_ma * 0.99 + new_loss * 0.01

#     for i in range(10):
#         bnn.train_step(*bnn.sample_from_pool())

    if epoch_counter%100==0:
        # Average reward per game tick in the current experience replay pool.
        # Weights are 1 + is_alive, so (assuming is_alive is 0/1) live ticks
        # count double and the weights can never sum to zero.
        pool_mean_reward = np.average(pool.experience_replay.rewards.get_value()[:,:-1],
                                      weights=1+pool.experience_replay.is_alive.get_value()[:,:-1])
        pool_size = pool.experience_replay.rewards.get_value().shape[0]
        # "vime ma" is printed as a -1 placeholder while the bnn/VIME code is disabled.
        print("iter=%i\treward/step=%.5f\tpool_size=%i\tvime ma=%.5f"%(epoch_counter,
                                                         pool_mean_reward,
                                                         pool_size,
                                                         -1))#bnn.vime_reward_ma))

    # Record current learning progress: every 500 epochs evaluate the agent
    # over n_games full games and store the resulting rewards.
    if epoch_counter%500 ==0:
        n_games = 10
        rewards[epoch_counter] = pool.evaluate(
            record_video=False,
            n_games=n_games,
            verbose=False)
        print("Current score(mean over %i) = %.3f"%(n_games,np.mean(rewards[epoch_counter])))

    epoch_counter  +=1

    
# Time to drink some coffee!

  0%|          | 0/1000 [00:00<?, ?it/s]


IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (100,1) (1,13) (100,13,14) 
Apply node that caused the error: AdvancedSubtensor(Elemwise{log,no_inplace}.0, InplaceDimShuffle{0,x}.0, InplaceDimShuffle{x,0}.0, AdvancedSubtensor1.0)
Toposort index: 326
Inputs types: [TensorType(float32, 3D), TensorType(int64, col), TensorType(int64, row), TensorType(int32, 3D)]
Inputs shapes: [(100, 13, 42), (100, 1), (1, 13), (100, 13, 14)]
Inputs strides: [(2184, 168, 4), (8, 8), (104, 8), (728, 56, 4)]
Inputs values: ['not shown', 'not shown', 'not shown', 'not shown']
Inputs type_num: [11, 7, 7, 5]
Outputs clients: [[Elemwise{neg,no_inplace}(AdvancedSubtensor.0)]]

Backtrace when the node is created(use Theano flag traceback.limit=N to make it longer):
  File "/home/manatee/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/manatee/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/manatee/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/manatee/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/manatee/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-bba4d0ea15d1>", line 13, in <module>
    gamma_or_gammas=0.99)
  File "/home/manatee/anaconda3/lib/python3.6/site-packages/agentnet/learning/a2c.py", line 140, in get_elementwise_objective
    action_logprobas = get_values_for_actions(logpolicy, actions) if logpolicy.ndim == 3 else logpolicy
  File "/home/manatee/anaconda3/lib/python3.6/site-packages/agentnet/learning/generic.py", line 265, in get_values_for_actions
    action_values_predicted = values_for_all_actions[batch_i, time_i, actions]

Debugprint of the apply node: 
AdvancedSubtensor [id A] <TensorType(float32, 3D)> ''   
 |Elemwise{log,no_inplace} [id B] <TensorType(float32, 3D)> ''   
 | |InplaceDimShuffle{1,0,2} [id C] <TensorType(float32, 3D)> ''   
 |   |Join [id D] <TensorType(float32, 3D)> ''   
 |     |TensorConstant{0} [id E] <TensorType(int8, scalar)>
 |     |InplaceDimShuffle{x,0,1} [id F] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id G] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id H] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id I] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id K] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id L] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id M] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id N] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | | |AdvancedSubtensor1 [id P] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |   |sessions.observations_history.0 [id Q] <TensorType(float32, 3D)>
 |     |   | |   | | | |   |RandomFunction{choice_helper}.1 [id R] <TensorType(int32, vector)> ''   
 |     |   | |   | | | |     |<RandomStateType> [id S] <RandomStateType>
 |     |   | |   | | | |     |TensorConstant{(1,) of 100} [id T] <TensorType(int64, vector)>
 |     |   | |   | | | |     |Subtensor{int64} [id U] <TensorType(int64, scalar)> ''   
 |     |   | |   | | | |     | |Shape [id V] <TensorType(int64, vector)> ''   
 |     |   | |   | | | |     | | |session.rewards_history [id W] <TensorType(float32, matrix)>
 |     |   | |   | | | |     | |Constant{0} [id X] <int64>
 |     |   | |   | | | |     |TensorConstant{1} [id Y] <TensorType(int8, scalar)>
 |     |   | |   | | | |     |TensorConstant{[]} [id Z] <TensorType(float32, vector)>
 |     |   | |   | | | |Constant{0} [id X] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |   |dense1.b [id BC] <TensorType(float32, vector)>
 |     |   | |   |Elemwise{abs_,no_inplace} [id BD] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id L] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id BG] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id BH] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id BI] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id BJ] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id BK] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id BL] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id BM] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id BN] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |Constant{1} [id BO] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |Elemwise{abs_,no_inplace} [id BP] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id BL] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id BQ] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id BR] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id BS] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id BT] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id BU] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id BV] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id BW] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id BX] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |Constant{2} [id BY] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |Elemwise{abs_,no_inplace} [id BZ] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id BV] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id CA] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id CB] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id CC] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id CD] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id CE] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id CF] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id CG] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id CH] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |Constant{3} [id CI] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |Elemwise{abs_,no_inplace} [id CJ] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id CF] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id CK] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id CL] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id CM] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id CN] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id CO] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id CP] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id CQ] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id CR] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |Constant{4} [id CS] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |Elemwise{abs_,no_inplace} [id CT] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id CP] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id CU] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id CV] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id CW] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id CX] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id CY] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id CZ] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id DA] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id DB] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |Constant{5} [id DC] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |Elemwise{abs_,no_inplace} [id DD] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id CZ] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id DE] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id DF] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id DG] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id DH] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id DI] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id DJ] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id DK] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id DL] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |Constant{6} [id DM] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |Elemwise{abs_,no_inplace} [id DN] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id DJ] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id DO] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id DP] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id DQ] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id DR] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id DS] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id DT] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id DU] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id DV] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |Constant{7} [id DW] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |Elemwise{abs_,no_inplace} [id DX] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id DT] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id DY] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id DZ] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id EA] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id EB] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id EC] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id ED] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id EE] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id EF] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |Constant{8} [id EG] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |Elemwise{abs_,no_inplace} [id EH] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id ED] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id EI] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id EJ] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id EK] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id EL] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id EM] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id EN] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id EO] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id EP] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |Constant{9} [id EQ] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |Elemwise{abs_,no_inplace} [id ER] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id EN] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id ES] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id ET] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id EU] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id EV] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id EW] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id EX] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id EY] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id EZ] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |Constant{10} [id FA] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |Elemwise{abs_,no_inplace} [id FB] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id EX] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id FC] <TensorType(float32, (True, False, False))> ''   
 |     | |SoftmaxWithBias [id FD] <TensorType(float32, matrix)> ''   
 |     |   |Dot22 [id FE] <TensorType(float32, matrix)> ''   
 |     |   | |Elemwise{mul,no_inplace} [id FF] <TensorType(float32, matrix)> ''   
 |     |   | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |     |   | | |Elemwise{add,no_inplace} [id FG] <TensorType(float32, matrix)> ''   
 |     |   | |   |Elemwise{add,no_inplace} [id FH] <TensorType(float32, matrix)> ''   
 |     |   | |   | |Dot22 [id FI] <TensorType(float32, matrix)> ''   
 |     |   | |   | | |Subtensor{int64} [id FJ] <TensorType(float32, matrix)> ''   
 |     |   | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |     |   | |   | | | |Constant{11} [id FK] <int64>
 |     |   | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |     |   | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |     |   | |   |Elemwise{abs_,no_inplace} [id FL] <TensorType(float32, matrix)> ''   
 |     |   | |     |Elemwise{add,no_inplace} [id FH] <TensorType(float32, matrix)> ''   
 |     |   | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |     |   |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |     |InplaceDimShuffle{x,0,1} [id FM] <TensorType(float32, (True, False, False))> ''   
 |       |SoftmaxWithBias [id FN] <TensorType(float32, matrix)> ''   
 |         |Dot22 [id FO] <TensorType(float32, matrix)> ''   
 |         | |Elemwise{mul,no_inplace} [id FP] <TensorType(float32, matrix)> ''   
 |         | | |TensorConstant{(1, 1) of 0.5} [id J] <TensorType(float32, (True, True))>
 |         | | |Elemwise{add,no_inplace} [id FQ] <TensorType(float32, matrix)> ''   
 |         | |   |Elemwise{add,no_inplace} [id FR] <TensorType(float32, matrix)> ''   
 |         | |   | |Dot22 [id FS] <TensorType(float32, matrix)> ''   
 |         | |   | | |Subtensor{int64} [id FT] <TensorType(float32, matrix)> ''   
 |         | |   | | | |InplaceDimShuffle{1,0,2} [id O] <TensorType(float32, 3D)> ''   
 |         | |   | | | |Constant{12} [id FU] <int64>
 |         | |   | | |dense1.W [id BA] <TensorType(float32, matrix)>
 |         | |   | |InplaceDimShuffle{x,0} [id BB] <TensorType(float32, row)> ''   
 |         | |   |Elemwise{abs_,no_inplace} [id FV] <TensorType(float32, matrix)> ''   
 |         | |     |Elemwise{add,no_inplace} [id FR] <TensorType(float32, matrix)> ''   
 |         | |q-evaluator layer.W [id BE] <TensorType(float32, matrix)>
 |         |q-evaluator layer.b [id BF] <TensorType(float32, vector)>
 |InplaceDimShuffle{0,x} [id FW] <TensorType(int64, col)> ''   
 | |ARange{dtype='int64'} [id FX] <TensorType(int64, vector)> ''   
 |   |TensorConstant{0} [id E] <TensorType(int8, scalar)>
 |   |Subtensor{int64} [id FY] <TensorType(int64, scalar)> ''   
 |   | |Shape [id FZ] <TensorType(int64, vector)> ''   
 |   | | |Elemwise{log,no_inplace} [id B] <TensorType(float32, 3D)> ''   
 |   | |Constant{0} [id X] <int64>
 |   |TensorConstant{1} [id Y] <TensorType(int8, scalar)>
 |InplaceDimShuffle{x,0} [id GA] <TensorType(int64, row)> ''   
 | |ARange{dtype='int64'} [id GB] <TensorType(int64, vector)> ''   
 |   |TensorConstant{0} [id E] <TensorType(int8, scalar)>
 |   |Subtensor{int64} [id GC] <TensorType(int64, scalar)> ''   
 |   | |Shape [id FZ] <TensorType(int64, vector)> ''   
 |   | |Constant{1} [id BO] <int64>
 |   |TensorConstant{1} [id Y] <TensorType(int8, scalar)>
 |AdvancedSubtensor1 [id GD] <TensorType(int32, 3D)> ''   
   |session.actions_history.0 [id GE] <TensorType(int32, 3D)>
   |RandomFunction{choice_helper}.1 [id R] <TensorType(int32, vector)> ''   

Storage map footprint:
 - sessions.observations_history.0, Shared Input, Shape: (2022, 13, 14), ElemSize: 4 Byte(s), TotalSize: 1472016 Byte(s)
 - session.actions_history.0, Shared Input, Shape: (2022, 13, 14), ElemSize: 4 Byte(s), TotalSize: 1472016 Byte(s)
 - InplaceDimShuffle{1,0,2}.0, Shape: (100, 13, 42), ElemSize: 4 Byte(s), TotalSize: 218400 Byte(s)
 - Elemwise{log,no_inplace}.0, Shape: (100, 13, 42), ElemSize: 4 Byte(s), TotalSize: 218400 Byte(s)
 - session.rewards_history, Shared Input, Shape: (2022, 13), ElemSize: 4 Byte(s), TotalSize: 105144 Byte(s)
 - AdvancedSubtensor1.0, Shape: (100, 13, 14), ElemSize: 4 Byte(s), TotalSize: 72800 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{add,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - Elemwise{mul,no_inplace}.0, Shape: (100, 100), ElemSize: 4 Byte(s), TotalSize: 40000 Byte(s)
 - session.is_alive, Shared Input, Shape: (2022, 13), ElemSize: 1 Byte(s), TotalSize: 26286 Byte(s)
 - q-evaluator layer.W, Shared Input, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - <TensorType(float32, matrix)>, Shared Input, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - SoftmaxWithBias.0, Shape: (100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - InplaceDimShuffle{x,0,1}.0, Shape: (1, 100, 42), ElemSize: 4 Byte(s), TotalSize: 16800 Byte(s)
 - dense1.W, Shared Input, Shape: (14, 100), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - <TensorType(float32, matrix)>, Shared Input, Shape: (14, 100), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - Subtensor{int64}.0, Shape: (100, 14), ElemSize: 4 Byte(s), TotalSize: 5600 Byte(s)
 - InplaceDimShuffle{0,x}.0, Shape: (100, 1), ElemSize: 8 Byte(s), TotalSize: 800 Byte(s)
 - dense1.b, Shared Input, Shape: (100,), ElemSize: 4 Byte(s), TotalSize: 400 Byte(s)
 - state values.W, Shared Input, Shape: (100, 1), ElemSize: 4 Byte(s), TotalSize: 400 Byte(s)
 - <TensorType(float32, vector)>, Shared Input, Shape: (100,), ElemSize: 4 Byte(s), TotalSize: 400 Byte(s)
 - <TensorType(float32, col)>, Shared Input, Shape: (100, 1), ElemSize: 4 Byte(s), TotalSize: 400 Byte(s)
 - RandomFunction{choice_helper}.1, Shape: (100,), ElemSize: 4 Byte(s), TotalSize: 400 Byte(s)
 - q-evaluator layer.b, Shared Input, Shape: (42,), ElemSize: 4 Byte(s), TotalSize: 168 Byte(s)
 - <TensorType(float32, vector)>, Shared Input, Shape: (42,), ElemSize: 4 Byte(s), TotalSize: 168 Byte(s)
 - InplaceDimShuffle{x,0}.0, Shape: (1, 13), ElemSize: 8 Byte(s), TotalSize: 104 Byte(s)
 - Constant{0}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - TensorConstant{(1,) of 100}, Shape: (1,), ElemSize: 8 Byte(s), TotalSize: 8 Byte(s)
 - Constant{12}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{11}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{10}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{9}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{8}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{7}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{6}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{5}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{4}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{3}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{2}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{1}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{-1}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - TensorConstant{1.0}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - state values.b, Shared Input, Shape: (1,), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - <TensorType(float32, (True,))>, Shared Input, Shape: (1,), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1,) of 0.9}, Shape: (1,), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1, 1) of 0.9}, Shape: (1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1, 1) of 0.5}, Shape: (1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1, 1, 1) of 1.0}, Shape: (1, 1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1, 1, 1) of 0.0}, Shape: (1, 1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1, 1) of 0.0}, Shape: (1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{0.00100000..0474974513}, Shape: (), ElemSize: 4 Byte(s), TotalSize: 4.0 Byte(s)
 - TensorConstant{(1, 1) of 0.99}, Shape: (1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1, 1) of 0.001}, Shape: (1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1, 1) of 0.1}, Shape: (1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1,) of 0.001}, Shape: (1,), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1,) of 0.1}, Shape: (1,), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1, 1) of 1e-06}, Shape: (1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{(1,) of 1e-06}, Shape: (1,), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{1}, Shape: (), ElemSize: 1 Byte(s), TotalSize: 1.0 Byte(s)
 - TensorConstant{(1, 1) of 1}, Shape: (1, 1), ElemSize: 1 Byte(s), TotalSize: 1 Byte(s)
 - TensorConstant{False}, Shape: (), ElemSize: 1 Byte(s), TotalSize: 1.0 Byte(s)
 - TensorConstant{0}, Shape: (), ElemSize: 1 Byte(s), TotalSize: 1.0 Byte(s)
 - Constant{0}, Shape: (), ElemSize: 1 Byte(s), TotalSize: 1.0 Byte(s)
 - TensorConstant{(1, 1) of 2}, Shape: (1, 1), ElemSize: 1 Byte(s), TotalSize: 1 Byte(s)
 - TensorConstant{(1,) of 2}, Shape: (1,), ElemSize: 1 Byte(s), TotalSize: 1 Byte(s)
 - TensorConstant{[]}, Shape: (0,), ElemSize: 4 Byte(s), TotalSize: 0 Byte(s)
 TotalSize: 4964501.0 Byte(s) 0.005 GB
 TotalSize inputs: 3122397.0 Byte(s) 0.003 GB



In [None]:
# Sort the reward log by its keys (presumably iteration numbers -- confirm
# against the training loop) and unzip it into two parallel tuples:
# the sorted keys and the value stored under each key.
iters, session_rewards = zip(
    *sorted(rewards.items(), key=lambda entry: entry[0])
)

In [None]:
# Learning curve: one point per logged entry, y = mean of that entry's rewards.
mean_rewards = [np.mean(entry_rewards) for entry_rewards in session_rewards]

plt.plot(iters, mean_rewards)
plt.title("Training progress")
plt.xlabel("Epoch counter")
plt.ylabel("Mean Income")
plt.show()

In [None]:
# Re-run the agent over the sessions stored in the pool's experience replay.
# Only the last element of the returned tuple is used here: it is unpacked
# into the policy output and the state-value output.
_, _, _, _, (pool_policy, pool_V) = agent.get_sessions(
    pool.experience_replay,
    session_length=SEQ_LENGTH,
    experience_replay=True,
)

# Flatten the stored observations to one row per recorded tick.
# The feature count is taken from the array's own last axis instead of a
# hard-coded width, so the reshape works for any observation size and the rows
# stay aligned one-to-one with `values` / `optimal_actid` below.
# NOTE(review): assumes observations are laid out as (..., n_features) -- confirm.
replay_observations = pool.experience_replay.observations[0].get_value()
n_features = replay_observations.shape[-1]

# Keep only the last two features; they become the x / y axes of the plots below.
states = replay_observations.reshape((-1, n_features)).T[-2:]

# Evaluate the symbolic outputs into flat numpy arrays.
values = pool_V.ravel().eval()
optimal_actid = pool_policy.argmax(-1).ravel().eval()

In [None]:
# Scatter the two retained observation features against each other and
# colour every point by the predicted state value.
feature_x, feature_y = states
plt.scatter(feature_x, feature_y, c=values, alpha=0.1)

plt.title("predicted state values")
plt.xlabel("previous")
plt.ylabel("current")
plt.show()

In [None]:
obs_x, obs_y = states

# Draw one scatter layer per action id so that each action gets its own
# colour and its own legend entry.
action_colors = ['red', 'blue', 'green']
for act_id, color in enumerate(action_colors):
    chosen = optimal_actid == act_id  # boolean mask: points where this action has the top score
    plt.scatter(
        obs_x[chosen],
        obs_y[chosen],
        c=color,
        alpha=0.1,
        label=action_names[act_id],
    )

plt.title("most likely action id")
plt.xlabel("previous")
plt.ylabel("current")
plt.legend(loc='best')
plt.show()

In [None]:
# Run the pool's evaluation, asking it to record video and save under ./records.
# NOTE(review): the previous comment here said "for MountainCar-v0 evaluation
# session is cropped to 200 ticks"; GAME in this notebook is a different
# environment, so confirm whether any such cap applies.
# NOTE(review): the name says "untrained" although this cell sits after the
# training cells -- confirm the intended meaning before relying on it.
untrained_reward = pool.evaluate(
    save_path="./records",
    record_video=True,
)