In [2]:
import numpy as np
import matplotlib.pyplot as plt
from dm_control import suite, viewer
from tqdm import tqdm
from simulation.dm_control.ddpg.ddpg_classes.ddpg import DDPGagent, OUNoise
from simulation.dm_control.ddpg.ddpg_classes.utils import MemoryRank

In [3]:
# Seeded generator handed to the environment task via task_kwargs below.
random_state = np.random.RandomState(42)

# --- Run configuration -------------------------------------------------------
RESUME_TRAINING = False
# Used both as the suite domain name and as the path for agent.load / agent.save.
PATH_MODEL = 'cartpole'
TASK = 'balance'
NUM_EPISODES = 100
BATCH_SIZE = 128
# Number of environment steps taken per training episode.
DURATION = 50
ACTOR_LEARNING_RATE = 1e-4
CRITIC_LEARNING_RATE = 1e-3
GAMMA = 0.99
TAU = 1e-2

# --- Environment and its specs -----------------------------------------------
env = suite.load(PATH_MODEL, TASK, task_kwargs={'random': random_state})
action_spec = env.action_spec()
obs_spec = env.observation_spec()
dim_action = action_spec.shape[0]
# Total number of scalar entries across all observation arrays.
dim_obs = sum(int(np.prod(spec.shape)) for spec in obs_spec.values())

# --- Agent -------------------------------------------------------------------
agent = DDPGagent(
    num_states=dim_obs,
    num_actions=dim_action,
    action_low=action_spec.minimum,
    action_high=action_spec.maximum,
    actor_learning_rate=ACTOR_LEARNING_RATE,
    critic_learning_rate=CRITIC_LEARNING_RATE,
    gamma=GAMMA,
    tau=TAU,
    memory=MemoryRank,
)
if RESUME_TRAINING:
    agent.load(PATH_MODEL)

def denorm(a):
    """Map a normalized action onto the range given by ``action_spec``.

    Applies ``a * half_range + midpoint``, so -1 maps to
    ``action_spec.minimum`` and +1 to ``action_spec.maximum``. Use on model
    output before passing it to the env.

    Args:
        a: Normalized action (scalar or array broadcastable against the
            action bounds).

    Returns:
        The action rescaled to the environment's bounds.
    """
    low, high = action_spec.minimum, action_spec.maximum
    half_range = (high - low) / 2.
    midpoint = (high + low) / 2.
    return a * half_range + midpoint

def norm(a):
    """Map an action from the ``action_spec`` range onto the normalized range.

    Applies ``(a - midpoint) * 2 / (maximum - minimum)``, so
    ``action_spec.minimum`` maps to -1 and ``action_spec.maximum`` to +1.
    Use on env output before passing it to the model.

    Args:
        a: Action expressed in the environment's bounds.

    Returns:
        The normalized action.
    """
    low, high = action_spec.minimum, action_spec.maximum
    inv_half_range = 2. / (high - low)
    midpoint = (high + low) / 2.
    return inv_half_range * (a - midpoint)

def parse(obs):
    """Flatten an observation mapping into a single 1-D array.

    Every value in ``obs`` is raveled and the pieces are concatenated in the
    mapping's iteration order.

    Design note: combining, for instance, velocity and position into one long
    array of observations loses the information about which entries belong
    to which variable. Could treating them separately lead to a better NN
    architecture?
    TODO: the original note on velocity, position and numerical integration
    was left unfinished.

    Args:
        obs: Mapping from observation name to an array-like or scalar value.

    Returns:
        A 1-D ``numpy.ndarray`` holding all observation entries. The dtype is
        at least float64; an empty mapping yields an empty float64 array.
    """
    # Seed with an empty float64 array so the result keeps the same dtype
    # promotion (and the same empty-input result) as starting from np.array([]).
    pieces = [np.empty(0)]
    pieces.extend(np.ravel(value) for value in obs.values())
    # A single concatenate avoids re-copying the accumulated array once per
    # entry, which is what repeated np.append calls do.
    return np.concatenate(pieces)

def off_policy():
    """Sample one action uniformly at random between the ``action_spec`` bounds.

    Draws from NumPy's global random state (``np.random``), not from the
    seeded ``random_state`` object.

    Returns:
        An array of shape ``action_spec.shape``.
    """
    low = action_spec.minimum
    high = action_spec.maximum
    return np.random.uniform(low, high, size=action_spec.shape)

In [None]:
# Per-episode bookkeeping used for the progress print-out and the plots.
rewards = []      # summed reward of each episode
avg_rewards = []  # mean of the (up to) 10 most recent episode rewards

for episode in tqdm(range(NUM_EPISODES)):
    time_step = env.reset()
    state = parse(time_step.observation)
    episode_reward = 0

    for step in range(DURATION):
        # The agent emits a normalized action; denorm rescales it to the env bounds.
        action = agent.get_action(state, t=step)
        time_step_2 = env.step(denorm(action))
        state_2 = parse(time_step_2.observation)
        # NOTE(review): the trailing -1 is forwarded to agent.push unchanged;
        # its meaning is defined by DDPGagent — confirm against that class.
        agent.push(state, action, time_step_2.reward, state_2, -1)
        state = state_2
        if len(agent.memory) > BATCH_SIZE:
            agent.update(BATCH_SIZE)
        episode_reward += time_step_2.reward
        # Stop at a terminal step: stepping again would restart the episode
        # and return a first time step whose reward is None.
        if time_step_2.last():
            break

    # Record the episode before reporting, so the average includes it and
    # np.mean is never evaluated on an empty list (nan on episode 0).
    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-10:]))
    print(f"episode: {episode}, "
          f"reward: {np.round(episode_reward, decimals=2)}, "
          f"average_reward: {avg_rewards[-1]}")

agent.save(PATH_MODEL)

# Learning curves: per-episode reward and its mean over the last 10 episodes.
plt.plot(rewards, label='Episode reward')
plt.plot(avg_rewards, label='Average reward (last 10 episodes)')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.legend()  # the two curves are otherwise indistinguishable
plt.show()

In [4]:
# Define a policy that queries the DDPG agent for every viewer step.
# Step counter forwarded to agent.get_action; it is 0 on the first call.
t = -1
def policy(time_step):
    """Return the agent's action for ``time_step``, scaled to the env bounds.

    Args:
        time_step: The environment time step supplied by the viewer; its
            ``observation`` is flattened with ``parse`` before being passed
            to the agent.

    Returns:
        The agent's action after ``denorm``.
    """
    global t
    # Restart the counter at the beginning of every episode so it matches the
    # training loop, where get_action receives t=step starting from 0.
    t = 0 if time_step.first() else t + 1
    state = parse(time_step.observation)
    action = agent.get_action(state, t)
    return denorm(action)

# Launch the viewer application.
viewer.launch(env, policy=policy)