## Control in a continuous action space with DDPG

### Heuristic policy

In [1]:
import gym 
import numpy as np
from helpers import NormalizedEnv, RandomAgent
from matplotlib import pyplot

In [2]:
# Initialization

env = gym.make("Pendulum-v1")
norm_env = NormalizedEnv(env) # accept actions between -1 and 1

rand_ag = RandomAgent(norm_env)

In [3]:
# one episode with a defined number of steps

def episode(agent): 
    state = norm_env.reset()
    tot_reward = 0
    truncated = False

    while not truncated:
        action = agent.compute_action(state)
        next_state, reward, terminated, truncated = norm_env.step(action)
        tot_reward += reward
        
        if truncated:
            state = norm_env.reset()
            
    return tot_reward

In [4]:
# Execute 10 episodes 

arr_reward = []
num_ep = 10
for x in range(num_ep): 
    reward = episode(rand_ag)
    arr_reward.append(reward)

print("array of reward :", arr_reward)

av_reward = sum(arr_reward)/num_ep
print("average cumulative reward :", av_reward)

array of reward : [-982.6525518750062, -1614.0239791598237, -984.6043121408084, -1067.1312289357913, -1294.2684625996062, -1748.0844994982422, -1676.6533774278348, -1475.759416315718, -1615.3245147289317, -1274.5057212174845]
average cumulative reward : -1373.3008063899247


In [5]:
# Implementation of a heuristic policy for the pendulum

class HeuristicPendulumAgent:
    def __init__(self, env):
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        
    def compute_action(self, state):
        _, y, v = state
        torque = env.m*env.g*env.l*y # fixed torque
        action = np.empty((1,))
        
        if (y < 0):
            np.append(action, np.sign(v)*torque) # same direction to angular velocity
        else:
            np.append(action, (-1)*np.sign(v)*torque) # opposite direction to angular velocity
        return action

In [6]:
# Execute 10 episodes with Heuristic agent
heur_ag = HeuristicPendulumAgent(norm_env)

arr_reward = []
num_ep = 10
for x in range(num_ep): 
    reward = episode(heur_ag)
    arr_reward.append(reward)

print("array of reward :", arr_reward)

av_reward = sum(arr_reward)/num_ep
print("average cumulative reward :", av_reward)

array of reward : [-1638.8238179415841, -1636.068269287805, -1443.4334061756185, -1209.8513278098876, -1686.02165521129, -1105.9943857476815, -1631.3326299722437, -1643.5715076643319, -1687.6147857820656, -1355.7209200607801]
average cumulative reward : -1503.8432705653286
