## Control in a continuous action space with DDPG

### Heuristic policy

In [5]:
import gym 
import numpy as np
from helpers import NormalizedEnv, RandomAgent
from matplotlib import pyplot

In [6]:
# Initialization

env = gym.make("Pendulum-v1")
norm_env = NormalizedEnv(env) # accept actions between -1 and 1

rand_ag = RandomAgent(norm_env)

In [13]:
# one episode with a defined number of steps

def episode(agent): 
    state, info = norm_env.reset()
    tot_reward = 0
    truncated = False

    while not truncated:
        action = agent.compute_action(state)
        next_state, reward, terminated, truncated, info = norm_env.step(action)
        tot_reward += reward
        
        if truncated:
            state, info = norm_env.reset()
            
    return tot_reward

In [14]:
# Execute 10 episodes 

arr_reward = []
num_ep = 10
for x in range(num_ep): 
    reward = episode(rand_ag)
    arr_reward.append(reward)

print("array of reward :", arr_reward)

av_reward = sum(arr_reward)/num_ep
print("average cumulative reward :", av_reward)

array of reward : [-1034.1114585696762, -1284.5110112181655, -891.9493067719303, -1161.0561444634814, -1395.9371514588634, -1695.1204312216835, -1173.4723828199124, -1727.0671849252008, -1187.339686815846, -878.7176400864324]
average cumulative reward : -1242.9282398351193


In [15]:
# Implementation of a heuristic policy for the pendulum

class HeuristicPendulumAgent:
    def __init__(self, env):
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        
    def compute_action(self, state):
        _, y, v = state
        torque = env.m*env.g*env.l*y # fixed torque
        action = np.empty((1,))
        
        if (y < 0):
            np.append(action, np.sign(v)*torque) # same direction to angular velocity
        else:
            np.append(action, (-1)*np.sign(v)*torque) # opposite direction to angular velocity
        return action

In [16]:
# Execute 10 episodes with Heuristic agent
heur_ag = HeuristicPendulumAgent(norm_env)

arr_reward = []
num_ep = 10
for x in range(num_ep): 
    reward = episode(heur_ag)
    arr_reward.append(reward)

print("array of reward :", arr_reward)

av_reward = sum(arr_reward)/num_ep
print("average cumulative reward :", av_reward)

array of reward : [-1579.682712767844, -1683.7214117695607, -1322.168176344182, -845.5593601845148, -1664.4463129102007, -1676.7009767766742, -1678.420427318271, -1654.9513612868668, -1457.052430595924, -1682.3992653853645]
average cumulative reward : -1524.5102435339402
