## Control in a continuous action space with DDPG

### Heuristic policy

In [1]:
import gym 
import numpy as np
from helpers import NormalizedEnv, RandomAgent
from matplotlib import pyplot

In [2]:
# Initialization

env = gym.make("Pendulum-v1")
norm_env = NormalizedEnv(env) # accept actions between -1 and 1

rand_ag = RandomAgent(norm_env)

In [3]:
# one episode with a defined number of steps

def episode(agent): 
    state, info = norm_env.reset()
    tot_reward = 0
    truncated = False

    while not truncated:
        action = agent.compute_action(state)
        next_state, reward, terminated, truncated, info = norm_env.step(action)
        tot_reward += reward
        
        if truncated:
            state, info = norm_env.reset()
            
    return tot_reward

In [4]:
# Execute 10 episodes 

arr_reward = []
num_ep = 10
for x in range(num_ep): 
    reward = episode(rand_ag)
    arr_reward.append(reward)

print("array of reward :", arr_reward)

av_reward = sum(arr_reward)/num_ep
print("average cumulative reward :", av_reward)

array of reward : [-1170.3079063476066, -1572.9461396799957, -1082.5399920424336, -886.2308836962088, -956.6122258191956, -1530.8991488636425, -862.9167104440036, -1430.8567780097287, -1532.548876263106, -1749.4058851088723]
average cumulative reward : -1277.5264546274793


In [5]:
# Implementation of a heuristic policy for the pendulum

class HeuristicPendulumAgent:
    def __init__(self, env):
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.env = env
        
    def compute_action(self, state):
        _, y, v = state
        torque = self.env.m*self.env.g*self.env.l*y # fixed torque
        action = np.empty((1,))
        
        if (y < 0):
            np.append(action, np.sign(v)*torque) # same direction to angular velocity
        else:
            np.append(action, (-1)*np.sign(v)*torque) # opposite direction to angular velocity
        return action

In [6]:
# Execute 10 episodes with Heuristic agent
heur_ag = HeuristicPendulumAgent(norm_env)

arr_reward = []
num_ep = 10
for x in range(num_ep): 
    reward = episode(heur_ag)
    arr_reward.append(reward)

print("array of reward :", arr_reward)

av_reward = sum(arr_reward)/num_ep
print("average cumulative reward :", av_reward)

array of reward : [-1579.9818142632985, -1682.6525129540605, -1219.4709965626398, -842.8545238819726, -1630.1069162218646, -1381.0462443169642, -1662.4869628120468, -1655.7169532137818, -1122.723691151543, -1627.6598481532694]
average cumulative reward : -1440.470046353144


#### How does it compare with the reward of the random agent ?

We notice that the rewards at each episode are more stable than the ones while use the random agent. With the random agent, the reward fluctuates.

#### What impact does the amplitude of the fixed torque have on the reward ?

The amplitude of the torque gives the necessary force to the pendulum to be at least in the correct domain (upper domain). Therefore, the rewards are more stable at each episode. 