## Setup

## CartPole

In [1]:
import gym
# Create the CartPole-v0 environment; the later cells all operate on this `env`.
env = gym.make('CartPole-v0')

#### Observations

In [2]:
# Display the observation space; the output below reports Box(4,).
env.observation_space

Box(4,)

In [3]:
# Reset the environment and display the returned initial observation
# (a 4-element array in the output below).
env.reset()

array([-0.02351904,  0.04558041, -0.01750592,  0.01561808])

#### Actions

In [4]:
# Display the action space; the output below reports Discrete(2).
env.action_space

Discrete(2)

In [5]:
# Draw one action from the action space (presumably at random) and display it.
env.action_space.sample()

0

#### Steps

In [6]:
# Advance the environment one step with a sampled action; the 4-tuple returned
# by step() is unpacked as (observation, reward, done, info).
observation, reward, done, info = env.step(env.action_space.sample())

### Running

In [7]:
# Roll out 20 episodes of at most 100 steps each, choosing every action by
# sampling the action space and printing each observation before acting.
for i_episode in range(20):
    # Each episode starts from a freshly reset environment.
    observation = env.reset()
    for t in range(100):
        env.render()
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if not done:
            # Episode still running: move on to the next timestep.
            continue
        # t is zero-based, so the episode lasted t+1 steps.
        print("Episode finished after {} timesteps".format(t+1))
        break

[-0.0389074  -0.00936086 -0.03871528  0.02507888]
[-0.03909462  0.1862943  -0.03821371 -0.27956339]
[-0.03536873 -0.00826228 -0.04380497  0.00082633]
[-0.03553398  0.18745963 -0.04378845 -0.30534945]
[-0.03178479  0.38317734 -0.04989544 -0.6115143 ]
[-0.02412124  0.57895985 -0.06212572 -0.91948575]
[-0.01254204  0.77486402 -0.08051544 -1.23102832]
[ 0.00295524  0.97092399 -0.105136   -1.5478115 ]
[ 0.02237372  1.16713926 -0.13609223 -1.87136232]
[ 0.0457165   1.36346101 -0.17351948 -2.20300981]
Episode finished after 10 timesteps
[ 0.02487422 -0.04051463  0.02235409  0.03932106]
[ 0.02406393 -0.23594988  0.02314051  0.33897225]
[ 0.01934493 -0.04116472  0.02991995  0.05367543]
[ 0.01852164 -0.23670262  0.03099346  0.35564617]
[ 0.01378758 -0.43225121  0.03810639  0.65793881]
[ 0.00514256 -0.62788227  0.05126516  0.96237293]
[-0.00741509 -0.82365424  0.07051262  1.27071044]
[-0.02388817 -1.01960194  0.09592683  1.58461513]
[-0.04428021 -0.82574278  0.12761913  1.32332254]
[-0.06079506 -

## Baseline!

In [8]:
import gym
import itertools
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers

import baselines.common.tf_util as U

from baselines import logger
from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule

  from ._conv import register_converters as _register_converters


#### Model Definition

In [9]:
def model(inpt, num_actions, scope, reuse=False):
    """Map an observation tensor to one value per action.

    The network is a single 64-unit tanh hidden layer followed by a linear
    output layer with ``num_actions`` units.

    Args:
        inpt: Input tensor holding the observation(s).
        num_actions: Number of units in the output layer (one per action).
        scope: Variable scope under which the layers are created.
        reuse: Passed to ``tf.variable_scope`` to control variable reuse.

    Returns:
        The output tensor of the final (linear) layer.
    """
    with tf.variable_scope(scope, reuse=reuse):
        hidden = layers.fully_connected(inpt, num_outputs=64, activation_fn=tf.nn.tanh)
        # No activation on the output layer: the values are left unbounded.
        action_values = layers.fully_connected(hidden, num_outputs=num_actions, activation_fn=None)
    return action_values

In [10]:
# Build the training graph with baselines' deepq.build_train; the four objects
# it returns are bound here as act, train, update_target and debug.
act, train, update_target, debug = deepq.build_train(
    # Called with a name; builds a U.BatchInput shaped like the env's observations.
    make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
    # The network-building function defined in the cell above.
    q_func=model,
    num_actions=env.action_space.n,
    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
)



#### Replay Buffer

In [11]:
# Buffer that the training loop fills with transitions and samples batches from.
# 50000 is presumably the maximum number of stored transitions — confirm in baselines.
replay_buffer = ReplayBuffer(50000)

Create the exploration schedule: the probability of taking a random action starts at 1 (every action is random) and is annealed over the first 10,000 timesteps down to 0.02 (98% of actions are selected according to the values predicted by the model).

In [12]:
# Exploration schedule from initial_p=1.0 to final_p=0.02 over 10000 timesteps;
# the training loop queries it with exploration.value(t) at every step.
exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

In [None]:
# Per-episode reward totals; the last entry accumulates the episode in progress.
episode_rewards = [0.0]
# Initial observation, used by the training loop to choose its first action.
obs = env.reset()

In [None]:
# Train the agent on `env`. itertools.count() never ends on its own, so this
# cell runs until the kernel is interrupted; once the solved criterion is met
# it stops training and only renders the environment.
with U.make_session(8):
    U.initialize()
    update_target()
    for t in itertools.count():
        # Take action and update exploration to the newest value.
        # obs[None] adds a leading axis; [0] takes the first entry of the result.
        action = act(obs[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            # Open a new running total (float, matching the [0.0] initialiser).
            episode_rewards.append(0.0)

        # Rewards of up to the last 100 finished episodes; the final entry is
        # the episode still in progress and is therefore excluded.
        recent_rewards = episode_rewards[-101:-1]
        # np.mean of an empty sequence emits a RuntimeWarning and returns nan,
        # so only average once at least one episode has finished.
        is_solved = t > 100 and len(recent_rewards) > 0 and np.mean(recent_rewards) >= 170
        if is_solved:
            # Show off the result
            env.render()
        else:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                # The final argument is all ones (presumably per-sample weights — confirm in baselines).
                train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
            # Update target network periodically.
            if t % 1000 == 0:
                update_target()

        # Report progress whenever an episode ends and the list length is a multiple of 10
        # (the list is then non-empty after excluding the in-progress entry).
        if done and len(episode_rewards) % 10 == 0:
            print("steps: ", t)
            print("episodes: ", len(episode_rewards))
            print("mean episode reward: ", round(np.mean(recent_rewards), 1))
            print("% time spent exploring: ", int(100 * exploration.value(t)))
            print("==============================")

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


steps:  189
episodes:  10
mean episode reward:  21.1
% time spent exploring:  98
steps:  381
episodes:  20
mean episode reward:  20.1
% time spent exploring:  96
steps:  568
episodes:  30
mean episode reward:  19.6
% time spent exploring:  94
steps:  769
episodes:  40
mean episode reward:  19.7
% time spent exploring:  92
steps:  948
episodes:  50
mean episode reward:  19.4
% time spent exploring:  90
steps:  1159
episodes:  60
mean episode reward:  19.7
% time spent exploring:  88
steps:  1352
episodes:  70
mean episode reward:  19.6
% time spent exploring:  86
steps:  1499
episodes:  80
mean episode reward:  19.0
% time spent exploring:  85
steps:  1672
episodes:  90
mean episode reward:  18.8
% time spent exploring:  83
steps:  1860
episodes:  100
mean episode reward:  18.8
% time spent exploring:  81
steps:  2033
episodes:  110
mean episode reward:  18.4
% time spent exploring:  80
steps:  2237
episodes:  120
mean episode reward:  18.6
% time spent exploring:  78
steps:  2407
episo