In [1]:
import numpy as np
import matplotlib.pyplot as plt
from dm_control import suite, viewer
from tqdm import tqdm
from ddpg import DDPGagent
from utils import *

In [2]:
random_state = np.random.RandomState(42)

PATH_MODEL = 'ddpg_actor.pt'
NUM_EPISODES = 50000
NUM_WARMUP = 10000
BATCH_SIZE = 128
DURATION = 5
ACTOR_LEARNING_RATE=1e-4
CRITIC_LEARNING_RATE=1e-3
GAMMA=0.99
TAU=1e-2

env = suite.load('ball_in_cup', 'catch', task_kwargs={'random': random_state})
# env = suite.load('cartpole', 'balance', task_kwargs={'random': random_state})
action_spec = env.action_spec()
obs_spec = env.observation_spec()
dim_action = action_spec.shape[0]
dim_obs = sum(tuple(map(lambda x: x.shape[0], obs_spec.values())))

agent = DDPGagent(
    dim_obs,
    dim_action,
    actor_learning_rate=ACTOR_LEARNING_RATE,
    critic_learning_rate=CRITIC_LEARNING_RATE,
    gamma=GAMMA,
    tau=TAU
)
noise = OUNoise(dim_action, action_spec.minimum, action_spec.maximum)

def denorm(a): #  use on model output before passing to env
    act_k = (action_spec.maximum - action_spec.minimum) / 2.
    act_b = (action_spec.maximum + action_spec.minimum) / 2.
    return a * act_k + act_b

def norm(a): # use on env output before passing to model
    act_k_inv = 2. / (action_spec.maximum - action_spec.minimum)
    act_b = (action_spec.maximum + action_spec.minimum) / 2.
    return act_k_inv * (a - act_b)

def parse(obs):
    """
    We lose information about the variables when we combine, for instance,
    velocity and position into 1 long array of observations. Could treating
    them separately lead to a better NN architecture? Velocity, position
    and numerical integration for instance take only ...
    """
    x = np.array([])
    for _, v in obs.items():
        x = np.append(x, v)
    return x

def off_policy():
    return np.random.uniform(action_spec.minimum, action_spec.maximum, size=action_spec.shape)

In [3]:
rewards = []
avg_rewards = []

for episode in tqdm(range(NUM_EPISODES)):
    time_step = env.reset()
    state = parse(time_step.observation)
    noise.reset()
    episode_reward = 0

    for step in range(DURATION):

        # initially, add only successful tasks to memory
        if episode < NUM_WARMUP:
            action = off_policy()
            time_step_2 = env.step(action)
            state_2 = parse(time_step_2.observation)
            if time_step_2.reward > 0:
                agent.memory.push(state, action, time_step_2.reward, state_2, -1)
            state = state_2

        else:
            action = agent.get_action(state)
            action = noise.get_action(action, step)
            time_step_2 = env.step(denorm(action))
            state_2 = parse(time_step_2.observation)
            agent.memory.push(state, action, time_step_2.reward, state_2, -1)
            state = state_2
        if len(agent.memory) > BATCH_SIZE:
            agent.update(BATCH_SIZE)
        episode_reward += time_step_2.reward
    if episode_reward > 0:
        print(f"episode: {episode}, "
          f"reward: {np.round(episode_reward, decimals=2)}, "
          f"average_reward: {np.mean(rewards[-10:])}")
    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-10:]))

agent.save(PATH_MODEL)

plt.plot(rewards)
plt.plot(avg_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

  2%|▏         | 1148/50000 [00:02<01:30, 539.53it/s]


episode: 114, reward: 5.0, average_reward: 0.0
episode: 120, reward: 5.0, average_reward: 0.5
episode: 128, reward: 4.0, average_reward: 0.5
episode: 143, reward: 5.0, average_reward: 0.0
episode: 151, reward: 5.0, average_reward: 0.5
episode: 255, reward: 1.0, average_reward: 0.0
episode: 262, reward: 1.0, average_reward: 0.1
episode: 265, reward: 5.0, average_reward: 0.2
episode: 329, reward: 4.0, average_reward: 0.0
episode: 350, reward: 3.0, average_reward: 0.0
episode: 405, reward: 1.0, average_reward: 0.0
episode: 442, reward: 4.0, average_reward: 0.0
episode: 476, reward: 1.0, average_reward: 0.0
episode: 544, reward: 2.0, average_reward: 0.0
episode: 596, reward: 3.0, average_reward: 0.0
episode: 612, reward: 5.0, average_reward: 0.0
episode: 669, reward: 2.0, average_reward: 0.0
episode: 727, reward: 1.0, average_reward: 0.0
episode: 731, reward: 2.0, average_reward: 0.1
episode: 806, reward: 5.0, average_reward: 0.0
episode: 836, reward: 5.0, average_reward: 0.0
episode: 851,

KeyboardInterrupt: 

In [None]:
agent.load(PATH_MODEL)
# Define a uniform random policy.
def random_policy(time_step):
    state = parse(time_step.observation)
    return denorm(agent.get_action(state))

# Launch the viewer application.
viewer.launch(env, policy=random_policy)