## Control in a continuous action space with DDPG

### Heuristic policy

In [1]:
import gym 
import numpy as np
from helpers import NormalizedEnv, RandomAgent
from matplotlib import pyplot
import torch

In [2]:
# Initialization

env = gym.make("Pendulum-v1")
norm_env = NormalizedEnv(env) # accept actions between -1 and 1

rand_ag = RandomAgent(norm_env)

In [3]:
# one episode with a defined number of steps

def episode(agent): 
    state, info = norm_env.reset()
    tot_reward = 0
    truncated = False

    while not truncated:
        action = agent.compute_action(state)
        next_state, reward, terminated, truncated, info = norm_env.step(action)
        tot_reward += reward
        
        if truncated:
            state, info = norm_env.reset()
            
    return tot_reward

In [4]:
# Execute 10 episodes 

arr_reward = []
num_ep = 10
for x in range(num_ep): 
    reward = episode(rand_ag)
    arr_reward.append(reward)

print("array of reward :", arr_reward)

av_reward = sum(arr_reward)/num_ep
print("average cumulative reward :", av_reward)

array of reward : [-957.1909959975984, -815.4421460613864, -1564.6719613777366, -1758.497453884742, -1298.3150471935508, -1626.8967524486884, -936.8622448210249, -890.5947654293064, -990.9154222228802, -1169.0553501677819]
average cumulative reward : -1200.8442139604697


In [5]:
# Implementation of a heuristic policy for the pendulum
    
class HeuristicPendulumAgent:
    def __init__(self, env):
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        
    def compute_action(self, state):
        _, y, v = state
        torque = env.m*env.g*env.l*y # fixed torque
        action = np.empty((1,))
        
        if (y < 0):
            np.append(action, np.sign(v)*torque) # same direction to angular velocity
        else:
            np.append(action, (-1)*np.sign(v)*torque) # opposite direction to angular velocity
        return action


In [6]:
# Execute 10 episodes with Heuristic agent
heur_ag = HeuristicPendulumAgent(norm_env)

arr_reward = []
num_ep = 10
for x in range(num_ep): 
    reward = episode(heur_ag)
    arr_reward.append(reward)

print("array of reward :", arr_reward)

av_reward = sum(arr_reward)/num_ep
print("average cumulative reward :", av_reward)

array of reward : [-1650.090527028244, -1372.780317373032, -1652.5517983907177, -1672.2371923926673, -1659.5844213967787, -1650.1917024590014, -1347.4898405672827, -1613.6774356330363, -1064.8552873614594, -1652.4848704942667]
average cumulative reward : -1533.5943393096488


## QNetwork

In [7]:
from qnetwork import *

In [8]:
import torch.optim as optim
gamma = 0.01
model = QNetwork(heur_ag, norm_env)

state, info = norm_env.reset()

action = heur_ag.compute_action(state)

action = torch.Tensor([action])

  action = torch.Tensor([action])


In [9]:
state_t = torch.Tensor([state, state, state])
action_t = torch.Tensor([action, action, action]).view(-1, 1)

transition = torch.cat([state_t, action_t], dim=1)
model.update(transition, gamma = gamma)

tensor(36.2026, grad_fn=<MseLossBackward0>)