## Setup

## CartPole

In [None]:
import gym
# Create the CartPole-v0 environment; `env` is the object the later cells
# inspect, reset and step.
env = gym.make('CartPole-v0')

#### Observations

In [None]:
# Inspect the observation space. As the last expression of a notebook cell its
# value is echoed; its `.shape` is later used to size the network input.
env.observation_space

In [None]:
# Reset the environment. The return value is what later cells bind as the
# initial observation of an episode.
env.reset()

#### Actions

In [None]:
# Inspect the action space; its `.n` attribute is later passed to build_train
# as the number of actions.
env.action_space

In [None]:
# Sample one action from the action space (the value is echoed by the notebook).
env.action_space.sample()

#### Steps

In [None]:
# Apply one sampled action. step() returns a 4-tuple, unpacked here as
# (observation, reward, done, info); `done` is what the loops below test to
# detect the end of an episode.
observation, reward, done, info = env.step(env.action_space.sample())

### Running

In [None]:
# Roll out 20 episodes with sampled actions, rendering every step and printing
# the observation the agent sees before it acts. Each episode is cut off after
# at most 100 steps, or earlier as soon as the environment reports `done`.
for episode in range(20):
    observation = env.reset()
    # Count steps from 1 so the counter is directly the number of steps taken.
    for steps_taken in range(1, 101):
        env.render()
        print(observation)
        observation, reward, done, info = env.step(env.action_space.sample())
        if not done:
            continue
        print("Episode finished after {} timesteps".format(steps_taken))
        break

## Baseline!

In [None]:
import gym
import itertools
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers

import baselines.common.tf_util as U

from baselines import logger
from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule

#### Model Definition

In [None]:
def model(inpt, num_actions, scope, reuse=False):
    """Build the Q-function network: observations in, one value per action out.

    Args:
        inpt: Input tensor fed to the first fully connected layer
            (the observation batch, per the caller's usage).
        num_actions: Number of units in the output layer.
        scope: Name of the variable scope the layers are created in.
        reuse: Forwarded to ``tf.variable_scope`` as its ``reuse`` argument.

    Returns:
        The output of the final fully connected layer, which has
        ``num_actions`` units and no activation function.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Single hidden layer of 64 tanh units.
        hidden = layers.fully_connected(inpt, num_outputs=64, activation_fn=tf.nn.tanh)
        # Linear output layer: one unit per action.
        action_values = layers.fully_connected(hidden, num_outputs=num_actions, activation_fn=None)
    return action_values

In [None]:
# Build the deepq training functions for `env`. The four returned objects are
# used in this notebook as follows:
#   act           - called with a batch of one observation and `update_eps`,
#                   and indexed with [0] to get the chosen action
#   train         - called with a sampled batch of transitions plus an array of
#                   ones shaped like the rewards (presumably per-sample
#                   weights; confirm against the baselines API)
#   update_target - called with no arguments, once before training and then
#                   every 1000 steps
#   debug         - not used anywhere in the visible notebook
act, train, update_target, debug = deepq.build_train(
    # Factory for the observation input, shaped like the env's observations.
    make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
    # Network-building function defined in the "Model Definition" cell.
    q_func=model,
    num_actions=env.action_space.n,
    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
)

#### Replay Buffer

In [None]:
# Buffer that the training loop fills with (obs, action, reward, next_obs, done)
# transitions and samples batches of 32 from. The argument 50000 is presumably
# the buffer capacity; confirm against the ReplayBuffer API.
replay_buffer = ReplayBuffer(50000)

Create the exploration schedule: the probability of taking a random action decays linearly from 1.0 (every action is random) to 0.02 (98% of actions are selected according to the values predicted by the model) over the first 10,000 timesteps.

In [None]:
# Exploration schedule: `exploration.value(t)` is passed to act() as
# `update_eps` on every training step and reported as "% time spent exploring".
exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

In [None]:
# Per-episode reward totals; the last entry accumulates the reward of the
# episode currently in progress.
episode_rewards = [0.0]
# Initial observation consumed by the first iteration of the training loop.
obs = env.reset()

In [None]:
# Main training loop. On every step it acts with the current exploration rate,
# stores the transition, and then either trains on a sampled batch or, once the
# recent mean reward reaches the threshold, stops training and renders instead.
# NOTE: the loop has no exit condition (itertools.count() with no break), so
# the cell runs until it is interrupted.
with U.make_session(8):
    U.initialize()
    update_target()
    for t in itertools.count():
        # Take action and update exploration to the newest value
        action = act(obs[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            # Start a fresh accumulator for the next episode (float, matching
            # the initial entry of the list).
            episode_rewards.append(0.0)

        # Totals of up to the last 100 *finished* episodes; the final list
        # entry is the episode still in progress and is excluded.
        recent_rewards = episode_rewards[-101:-1]
        # Guard against an empty slice: while the first episode is still
        # running there are no finished episodes, and np.mean([]) would return
        # NaN and emit a RuntimeWarning.
        is_solved = (
            t > 100
            and len(recent_rewards) > 0
            and np.mean(recent_rewards) >= 150
        )
        if is_solved:
            # Show off the result
            env.render()
        else:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
            # Update target network periodically.
            if t % 1000 == 0:
                update_target()

        # Log progress whenever an episode ends and the episode list length is
        # a multiple of 10 (so recent_rewards is never empty here).
        if done and len(episode_rewards) % 10 == 0:
            print("steps: ", t)
            print("episodes: ", len(episode_rewards))
            print("mean episode reward: ", round(np.mean(recent_rewards), 1))
            print("% time spent exploring: ", int(100 * exploration.value(t)))
            print("==============================")

https://github.com/AdrianP-/gym_trading

In [None]:
from gym import envs
# Collect the id of every spec in gym's environment registry, then print the
# ids in sorted order, one per line.
envids = [spec.id for spec in envs.registry.all()]
for envid in sorted(envids):
    print(envid)