## Control in a continuous action space with DDPG

### Heuristic policy

In [18]:
import gym 
import numpy as np
from helpers import NormalizedEnv, RandomAgent
from matplotlib import pyplot

In [19]:
# Environment and agent setup

ENV_ID = "Pendulum-v1"

env = gym.make(ENV_ID)
# Wrapped environment that accepts actions between -1 and 1.
norm_env = NormalizedEnv(env)

# Agent from the local `helpers` module, built on the normalized environment.
rand_ag = RandomAgent(norm_env)

In [20]:
# Run one episode and return its cumulative reward

def episode(agent, env=None):
    """Roll out a single episode with ``agent`` and return the summed reward.

    Parameters
    ----------
    agent : object
        Anything exposing ``compute_action(state)``; the returned action is
        passed unchanged to ``env.step``.
    env : optional
        Environment whose ``reset()`` returns ``(state, info)`` and whose
        ``step(action)`` returns
        ``(state, reward, terminated, truncated, info)``.
        Defaults to the notebook-level ``norm_env``.

    Returns
    -------
    The sum of the per-step rewards collected until the environment reports
    ``terminated`` or ``truncated``.
    """
    if env is None:
        env = norm_env  # keeps the original single-argument call working

    state, _ = env.reset()
    tot_reward = 0
    done = False

    while not done:
        action = agent.compute_action(state)
        # Overwrite `state` with the new observation so the agent acts on the
        # current state at the next step (previously it always saw the
        # observation returned by reset()).
        state, reward, terminated, truncated, _ = env.step(action)
        tot_reward += reward
        # Stop on either end-of-episode signal, not only on truncation.
        done = terminated or truncated

    # No reset here: every call starts with its own reset().
    return tot_reward

In [21]:
# Roll out 10 episodes with the random agent and report the rewards

num_ep = 10
arr_reward = [episode(rand_ag) for _ in range(num_ep)]

print("array of reward :", arr_reward)

# Mean of the per-episode cumulative rewards.
av_reward = sum(arr_reward) / len(arr_reward)
print("average cumulative reward :", av_reward)

array of reward : [-1163.5321072544002, -1167.216006332102, -974.2845367391204, -1387.3297895213423, -1347.7154263775685, -1685.8919005564503, -900.598325969867, -1182.7732059302364, -1538.2811571975667, -1642.764573246596]
average cumulative reward : -1299.038702912525


In [5]:
# Implementation of a heuristic policy for the pendulum

class HeuristicPendulumAgent:
    """Hand-written pendulum policy based on the last two observation entries.

    The observation is unpacked as ``(_, y, v)``. The action is a
    one-element array holding a torque of magnitude ``m * g * l * |y|``:

    * ``y < 0``  -> torque in the same direction as the angular velocity ``v``
    * ``y >= 0`` -> torque in the opposite direction to ``v``

    NOTE(review): the Gym Pendulum documentation lists the observation as
    ``[cos(theta), sin(theta), theta_dot]`` with ``theta = 0`` upright. If
    "lower half of the domain" is what the branch is meant to detect, the
    first component may be the right one to test -- confirm the intent.
    NOTE(review): the magnitude is in physical units and can exceed 1; verify
    how ``NormalizedEnv`` treats actions outside [-1, 1].
    """

    def __init__(self, env):
        """Store the space sizes and the physical constants of ``env``.

        ``env`` must expose ``observation_space.shape``,
        ``action_space.shape`` and, directly or through ``.unwrapped``,
        the attributes ``m``, ``g`` and ``l``.
        """
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.env = env
        # Use the environment passed in rather than a notebook global, and
        # read the constants from the innermost environment when one exists.
        base_env = getattr(env, "unwrapped", env)
        self._mgl = base_env.m * base_env.g * base_env.l

    def compute_action(self, state):
        """Return the heuristic action for ``state`` as an array of shape (1,)."""
        _, y, v = state
        # Keep the magnitude non-negative: a signed torque would cancel the
        # branch below and make the policy always oppose the velocity.
        torque = abs(self._mgl * y)

        if y < 0:
            direction = np.sign(v)   # same direction as the angular velocity
        else:
            direction = -np.sign(v)  # opposite direction to the angular velocity

        # Build the array from the computed value. np.append returns a copy,
        # so appending to an np.empty buffer left the result uninitialised.
        return np.array([direction * torque], dtype=np.float64)

In [6]:
# Roll out 10 episodes with the heuristic agent and report the rewards
heur_ag = HeuristicPendulumAgent(norm_env)

num_ep = 10
arr_reward = [episode(heur_ag) for _ in range(num_ep)]

print("array of reward :", arr_reward)

# Mean of the per-episode cumulative rewards.
av_reward = sum(arr_reward) / len(arr_reward)
print("average cumulative reward :", av_reward)

initial state: [-0.92779315 -0.37309507 -0.17099966]
state: [-0.9411425  -0.33801004 -0.750821  ]
reward: -7.620384162314936
state: [-0.96116996 -0.27595705 -1.3043286 ]
reward: -7.882412885646909
state: [-0.98218876 -0.18789689 -1.8112963 ]
reward: -8.365213557258404
state: [-0.99708223 -0.07633518 -2.252219  ]
reward: -9.049762017995807
state: [-0.9985389   0.05403768 -2.6094704 ]
reward: -9.906596747300354
state: [-0.980558    0.19622944 -2.868942  ]
reward: -10.21776666582613
state: [-0.939852  0.341582 -3.02177 ]
reward: -9.494698974723043
state: [-0.87668025  0.4810735  -3.0655835 ]
reward: -8.717919509136221
state: [-0.7948004   0.60687095 -3.0047784 ]
reward: -7.911869507749309
state: [-0.700571   0.7135827 -2.849625 ]
reward: -7.104354818653501
state: [-0.6015783  0.7988138 -2.614438 ]
reward: -6.324415734496672
state: [-0.5052823   0.86295414 -2.315328  ]
reward: -5.5993892796620806
state: [-0.41805524  0.90842164 -1.9681122 ]
reward: -4.952197157658203
state: [-0.34474173  0

-664.5395914954541