In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os
# Make the parent of the current working directory importable so that the
# `src` package imported below can be resolved.
sys.path.append(os.getcwd() + '/..')
# NOTE(review): `src` is never referenced by name in the visible cells; presumably it is
# imported for its side effects (e.g. registering the gym environment) — confirm.
import src

# Experiment setup
* Here we basically just load the game and check that it works

In [3]:
from __future__ import print_function 
import matplotlib.pyplot as plt
import numpy as np
import gym
from tqdm import tqdm


%matplotlib inline
%env THEANO_FLAGS="floatX=float32"

env: THEANO_FLAGS="floatX=float32"


In [4]:
# Id of the gym environment; passed to gym.make and to EnvPool further down.
GAME = "FrontoPolarStocks-v0"

# Passed as the third EnvPool argument (presumably the number of parallel sessions — confirm).
N_AGENTS = 1
# Number of ticks per pool.update call and the session_length used when unrolling replays.
SEQ_LENGTH = 10

In [5]:
# Instantiate the environment and reset it once so that it can be stepped.
env = gym.make(GAME)
env.reset()

# One discrete decision per action component; every decision has 3 options
# (labelled by action_names below).
action_shape = (env.action_space.num_discrete_space,)
action_emb_shape = action_shape + (3,)
state_shape = env.observation_space.shape

# Take a single step with action id 0 for every component and keep only the observation.
zero_action = [0] * action_shape[0]
state, _, _, _ = env.step(zero_action)

# Labels for action ids 0, 1, 2 (the original author was not sure about this order).
action_names = np.array(["sell", "pass", "buy"])

print(state)

[2017-05-21 23:18:40,518] Making new env: FrontoPolarStocks-v0


[ 118.809998   54.759999   42.060001   73.639999   43.759998   42.009998
   37.209999   73.410004   27.5        26.040001   51.43       47.880001
   13.9        20.879999]


# Basic agent setup
Here we define a simple agent that maps game observations (a vector of stock prices) to an action policy and a state-value estimate using a shallow neural network.


In [6]:
import theano
from theano import tensor as T
import lasagne

# Ask Theano for detailed error messages when a graph fails.
theano.config.exception_verbosity = 'high'
# 'fast_compile' favours short compilation time over the runtime speed of the compiled graph.
theano.config.optimizer = 'fast_compile'

In [7]:
from agent.agent import build_agent

In [8]:
# build_agent is project-local; it returns the agent together with its action layer and V layer.
agent, action_layer, V_layer = build_agent(action_emb_shape, state_shape)

In [9]:
# Sanity check: display the output shape of the agent's first action layer.
agent.action_layers[0].output_shape

(None, 14)

In [10]:
# Since it is a single lasagne network, one can get its weights, output, etc.
# Collect the trainable parameters of all layers feeding the action and V layers;
# these are the parameters optimised by the update rule defined later.
weights = lasagne.layers.get_all_params((action_layer,V_layer),trainable=True)

# Create and manage a pool of game sessions to play with

* To make training more stable, we keep a whole batch of game sessions, each running independently of the others
* Why several parallel agents help training: http://arxiv.org/pdf/1602.01783v1.pdf
* Alternative approach: store more sessions: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

In [11]:
from agentnet.experiments.openai_gym.pool import EnvPool

# Pool of sessions of GAME played by `agent`. max_size is forwarded to EnvPool
# (the training log further down shows pool_size saturating at 10000).
pool = EnvPool(agent, GAME, N_AGENTS, max_size=10000)

[2017-05-21 23:18:42,002] Making new env: FrontoPolarStocks-v0


In [12]:
%%time
#interact for 7 ticks
# Only the action and reward logs are kept; the other four returned values are discarded.
_,action_log,reward_log,_,_,_  = pool.interact(7)


# Index the label array with the integer action ids to print readable action names.
print(action_names[action_log])
print(reward_log)

[[['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy' 'buy'
   'buy' 'buy']
  ['pass' 'buy' 'pass' 'pass' 'pass' 'pass' 'pass' 'buy' 'buy' 'pass' 'buy'
   'pass' 'pass' 'buy']]]
[[ -3.37671498   2.06675799  -6.10783801   4.87881603  26.68723498
   19.29183601   0.        ]]
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 5.98 ms


In [13]:
#load first sessions (this function calls interact and remembers sessions)
# The remembered sessions are what pool.experience_replay is sampled from in the next section.
pool.update(SEQ_LENGTH)

# a2c loss

Here we define the objective function for actor-critic (one-step) RL.

* We regularize policy with expected inverse action probabilities (discouraging very small probas) to make objective numerically stable


In [14]:
# Sample 100 stored sessions (with replacement) from the replay pool and re-run the agent
# on them to obtain the policy and state-value sequences used by the loss below.
replay = pool.experience_replay.sample_session_batch(100, replace=True)

# Only the last returned element — the (policy, V) pair — is used; the rest is discarded.
_,_,_,_,(policy_seq,V_seq) = agent.get_sessions(
    replay,
    session_length=SEQ_LENGTH,
    experience_replay=True,
)

In [15]:
# Build the actor-critic objective with agentnet's a2c helper.
from agentnet.learning import a2c


#Train via actor-critic (see here - https://www.youtube.com/watch?v=KHZVXao4qXs)

### COMMENTED ASSERT ACTION.DIM==2 LINE IN THIS METHOD
# NOTE(review): per the line above, the installed agentnet was patched locally (an assert on
# the action dimensionality was commented out); this cell presumably fails on a stock
# install — confirm and document the patch.
elwise_mse_loss = a2c.get_elementwise_objective(
    policy_seq,
    V_seq[:,:,0],  # index 0 along the third axis of the V output
    replay.actions[0],
    replay.rewards,
    replay.is_alive,
    gamma_or_gammas=0.99)

#compute mean over "alive" fragments
loss = elwise_mse_loss.sum() / replay.is_alive.sum()

In [16]:

# Regulariser: inverse action probabilities summed over the last axis, then averaged.
# It grows quickly when any probability approaches zero, which discourages a collapsed policy.
reg = T.mean((1. / policy_seq).sum(axis=-1))

# Rebuild the total loss from its parts instead of using `loss += ...`: re-running this cell
# then no longer adds the penalty a second time, and the result does not depend on whatever
# value `loss` currently holds in the kernel.
loss = elwise_mse_loss.sum() / replay.is_alive.sum() + 0.001 * reg

In [17]:
# Compute weight updates: RMSProp steps that minimise `loss` with respect to `weights`.
updates = lasagne.updates.rmsprop(loss, weights, learning_rate=0.001)

In [18]:
#compile train function
# The function takes no explicit inputs; each call returns the loss and applies `updates`.

train_step = theano.function([],loss,updates=updates)

# Demo run

In [19]:
# Evaluate the untrained agent and record a video of the run into ./records.
# NOTE(review): the comment previously here referred to MountainCar-v0 and a 200-tick crop,
# left over from another notebook; this run's log reports a 400-timestep episode.
untrained_reward = pool.evaluate(save_path="./records",record_video=True)

[2017-05-21 23:18:44,920] Making new env: FrontoPolarStocks-v0
[2017-05-21 23:18:45,143] Clearing 4 monitor files from previous run (because force=True was provided)
[2017-05-21 23:18:45,145] Starting new video recorder writing to /home/manatee/Desktop/inn.prac/Algorithms/src/records/openaigym.video.0.4716.video000000.mp4
[2017-05-21 23:18:52,061] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/manatee/Desktop/inn.prac/Algorithms/src/records')


Episode finished after 400 timesteps with reward=-32063.866766078016


In [20]:
from glob import glob
from IPython.display import HTML

# The recorder's file name contains run-specific numbers (the evaluation log above shows
# "openaigym.video.0.4716..." while this cell used to hard-code "...0.15895..."), so the
# file is looked up instead of being hard-coded.
video_candidates = sorted(glob("records/openaigym.video.*.mp4"), key=os.path.getmtime)
if not video_candidates:
    raise IOError("No recorded video found in ./records; "
                  "run pool.evaluate(save_path='./records', record_video=True) first")
video_path = video_candidates[-1]  # most recently written recording

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format(video_path))

# Vime

In [21]:
from agent.bnn import BNN
# Project-local BNN for the VIME part. Every use of `bnn` further down
# (reward preprocessing, pre-training, training steps) is currently commented out.
bnn = BNN(state_shape, action_shape, action_emb_shape, replay)

(None, 14, 3)
(None, 14)
(None, 42)


# Training loop

In [22]:
#starting epoch
# Incremented once per iteration of the training loop below.
epoch_counter = 1

#full game rewards
# Maps epoch number -> evaluation result; seeded with the score of the untrained agent.
rewards = {epoch_counter: untrained_reward}

In [23]:
#pre-fill pool
# Each call plays SEQ_LENGTH more ticks and appends them to the replay pool (append=True);
# the VIME reward preprocessing hook is currently disabled.
for i in tqdm(range(1000)):
    pool.update(SEQ_LENGTH, append=True) #, preprocess=bnn.add_vime_reward)

100%|██████████| 1000/1000 [00:03<00:00, 323.31it/s]


In [24]:
#pre-train BNN (mitigate training lag on first iterations where BNN is stupid)

# for i in tqdm(range(1000)):
#     bnn.train_step(*bnn.sample_from_pool())

In [27]:
#the loop may take eons to finish.
#consider interrupting early.

# Gradient steps were stubbed out in this cell (`new_loss = 0# train_step()`), so the loop only
# collected experience and the agent never learned. The switch keeps that behaviour by default
# but makes it explicit; set it to True to actually train the agent.
TRAIN_AGENT = False

# Exponential moving average of the training loss. It has its own name on purpose: assigning
# a number to `loss` would overwrite the symbolic loss expression built in the a2c cells and
# silently break them on re-run.
loss_ma = 0
for iteration in tqdm(range(1000)):

    #play: add fresh sessions to the replay pool
    for play_idx in range(10):
        pool.update(SEQ_LENGTH, append=True) #, preprocess=bnn.add_vime_reward)

    #train
    for update_idx in range(10):
        new_loss = float(train_step()) if TRAIN_AGENT else 0
        loss_ma = loss_ma * 0.99 + new_loss * 0.01

#     for i in range(10):
#         bnn.train_step(*bnn.sample_from_pool())

    if epoch_counter%100==0:
        #average reward per game tick in current experience replay pool
        pool_mean_reward = np.average(pool.experience_replay.rewards.get_value()[:,:-1],
                                      weights=1+pool.experience_replay.is_alive.get_value()[:,:-1])
        pool_size = pool.experience_replay.rewards.get_value().shape[0]
        # "vime ma" is printed as the placeholder -1 while the BNN is disabled.
        print("iter=%i\treward/step=%.5f\tpool_size=%i\tvime ma=%.5f"%(epoch_counter,
                                                         pool_mean_reward,
                                                         pool_size,
                                                         -1))#bnn.vime_reward_ma))


    ##record current learning progress and show learning curves
    if epoch_counter%500 ==0:
        n_games = 10
        rewards[epoch_counter] = pool.evaluate(
            record_video=False,
            n_games=n_games,
            verbose=False)
        print("Current score(mean over %i) = %.3f"%(n_games,np.mean(rewards[epoch_counter])))


    epoch_counter  +=1


# Time to drink some coffee!

 10%|█         | 102/1000 [00:04<00:43, 20.61it/s]

iter=100	reward/step=20.55884	pool_size=2021	vime ma=-1.00000


 20%|██        | 202/1000 [00:10<00:47, 16.70it/s]

iter=200	reward/step=16.33468	pool_size=3021	vime ma=-1.00000


 30%|███       | 302/1000 [00:17<00:52, 13.19it/s]

iter=300	reward/step=13.60817	pool_size=4021	vime ma=-1.00000


 40%|████      | 402/1000 [00:25<00:48, 12.39it/s]

iter=400	reward/step=18.02586	pool_size=5021	vime ma=-1.00000


 50%|████▉     | 499/1000 [00:34<00:43, 11.56it/s][2017-05-21 23:27:35,295] Making new env: FrontoPolarStocks-v0
[2017-05-21 23:27:35,489] Clearing 4 monitor files from previous run (because force=True was provided)


iter=500	reward/step=15.44911	pool_size=6021	vime ma=-1.00000


[2017-05-21 23:27:36,624] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/manatee/Desktop/inn.prac/Algorithms/src/records')
 50%|█████     | 501/1000 [00:36<02:23,  3.47it/s]

Current score(mean over 10) = -3879.314


 60%|██████    | 600/1000 [00:45<00:38, 10.49it/s]

iter=600	reward/step=17.90628	pool_size=7021	vime ma=-1.00000


 70%|███████   | 701/1000 [00:56<00:32,  9.14it/s]

iter=700	reward/step=17.04039	pool_size=8021	vime ma=-1.00000


 80%|████████  | 801/1000 [01:08<00:23,  8.44it/s]

iter=800	reward/step=17.68815	pool_size=9021	vime ma=-1.00000


 90%|█████████ | 901/1000 [01:21<00:12,  7.71it/s]

iter=900	reward/step=20.20611	pool_size=10000	vime ma=-1.00000


100%|█████████▉| 999/1000 [01:34<00:00,  7.84it/s][2017-05-21 23:28:35,014] Making new env: FrontoPolarStocks-v0
[2017-05-21 23:28:35,213] Clearing 2 monitor files from previous run (because force=True was provided)


iter=1000	reward/step=21.76848	pool_size=10000	vime ma=-1.00000


[2017-05-21 23:28:36,090] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/manatee/Desktop/inn.prac/Algorithms/src/records')
100%|██████████| 1000/1000 [01:35<00:00,  2.21it/s]

Current score(mean over 10) = 3205.979





In [None]:
# Order the evaluation history by epoch, then split it into two parallel tuples:
# the epoch numbers and the rewards recorded at those epochs.
history = sorted(rewards.items(), key=lambda entry: entry[0])
iters, session_rewards = zip(*history)

In [None]:
# Reduce each evaluation entry to its mean and plot the means against the epoch numbers.
mean_rewards = [np.mean(entry) for entry in session_rewards]
plt.plot(iters, mean_rewards)
plt.title("Training progress")
plt.xlabel("Epoch counter")
plt.ylabel("Mean Income")
plt.show()

In [None]:
# Re-run the agent over every session stored in the replay pool.
_,_,_,_,(pool_policy,pool_V) = agent.get_sessions(
    pool.experience_replay,
    session_length=SEQ_LENGTH,
    experience_replay=True,)

# Flatten to one row per recorded tick while keeping the observation's own feature axis.
# The row width used to be hard-coded as 5, but this environment's observation vector has
# 14 entries, so rows mixed features of different ticks and no longer lined up with `values`.
# Assumes flat (vector) observations stored with the feature axis last — TODO confirm.
observations = pool.experience_replay.observations[0].get_value()
states = observations.reshape((-1, observations.shape[-1])).T[-2:]  # last two features, one column per tick
values = pool_V.ravel().eval()

# Greedy action per tick. If the policy holds one distribution per action component, argmax
# yields several ids per tick; keep the last component so the result has one entry per tick,
# like `states` and `values` (with a single component this equals the plain ravel).
# NOTE(review): assumes the last action component corresponds to the last observation
# feature plotted below — confirm against the environment.
optimal_actid = pool_policy.argmax(-1).eval().reshape((len(values), -1))[:, -1]

In [None]:
# Scatter of the two selected observation features, coloured by the predicted state value.
feature_a, feature_b = states[0], states[1]
plt.scatter(feature_a, feature_b, c=values, alpha=0.1)
plt.title("predicted state values")
plt.xlabel("previous")
plt.ylabel("current")
plt.show()

In [None]:
obs_x, obs_y = states

# Draw one scatter layer per action id so that each action gets its own colour and legend entry.
for act_id, colour in enumerate(['red','blue','green']):
    chosen = optimal_actid == act_id
    plt.scatter(obs_x[chosen], obs_y[chosen],
                c=colour,
                alpha=0.1,
                label=action_names[act_id])

plt.title("most likely action id")
plt.xlabel("previous")
plt.ylabel("current")
plt.legend(loc='best')
plt.show()

In [None]:
# Evaluate once more after the training loop and record a video of the run into ./records.
# The result gets its own name: reusing `untrained_reward` here would overwrite the baseline,
# and re-running the cell that seeds `rewards` would then log this score as the untrained one.
final_reward = pool.evaluate(save_path="./records",record_video=True)