# DDPG with discontinuous state space to showcase discontinuity (spinup library)

In [1]:
import gym
from gym import spaces
from gym.utils import seeding
from gym.envs.registration import register

from custom_functions.custom_functions import env_fn 
from custom_functions.custom_functions import test_agent
from custom_functions.custom_functions import plot_test

import spinup

from os import path
from scipy.integrate import solve_ivp
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
#from vpython import *
from functools import partial
import matplotlib as matplt
matplt.use("pgf")
matplt.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})


register(id='gyroscopediscontinuousenv-v0',entry_point='gym_GyroscopeEnv.envs:GyroscopeDiscontinuousEnv')

## Training

#### Quadratic reward

In [3]:
# Setup baseline 0
logger_kwargs = dict(output_dir='ddpg_discontinuous_quadratic', exp_name='Discontinuity and quadratic reward function')
seed_b = 0
epochs_b = 100
maxeplen_b = 110

spe_b = maxeplen_b * 15
repsize_b = 1000000
gamma_b = 0.995
polyak_b = 0.995
batchsize_b = 100
startsteps_b = 10000
args_b = dict(hidden_sizes=[400], activation=torch.nn.ReLU)
actnoise_b = 0.1
pilr_b = 0.001
qlr_b = 0.001

# Env function
reward_args = {'qx1':9,'qx2':0.05,'qx3':9,'qx4':0.05,'pu1':0.1,'pu2':0.1}
env_fn = partial(env_fn,env_name = 'gyroscopediscontinuousenv-v0',reward_type = 'Quadratic', reward_args = reward_args)

# Baseline 0 training
spinup.ddpg_pytorch(env_fn, ac_kwargs = args_b, seed = seed_b, steps_per_epoch = spe_b, epochs = epochs_b, replay_size = repsize_b, gamma = gamma_b,
polyak = polyak_b, batch_size = batchsize_b, start_steps = startsteps_b, max_ep_len = maxeplen_b,logger_kwargs = logger_kwargs, act_noise = actnoise_b, pi_lr = pilr_b, q_lr = qlr_b)

[32;1mLogging data to ddpg_discontinuous/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            400
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"functools.partial(<function env_fn at 0x7fda02cd5ea0>, env_name='gyroscopediscontinuousenv-v0', reward_type='Quadratic', reward_args={'qx1': 9, 'qx2': 0.05, 'qx3': 9, 'qx4': 0.05, 'pu1': 0.1, 'pu2': 0.1})",
    "epochs":	100,
    "exp_name":	"Quadratic reward function",
    "gamma":	0.995,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7fd9cbe53518>":	{
            "epoch_dict":	{},
            "exp_name":	"Quadratic reward function",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"ddpg_discontinuous",
            "output_file":	{
                "<_io.TextIOWrapper name='ddpg_discontinuous/progress.tx



---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |       -7.55e+03 |
|          StdEpRet |        1.33e+03 |
|          MaxEpRet |       -5.38e+03 |
|          MinEpRet |       -1.03e+04 |
|  AverageTestEpRet |        -7.6e+03 |
|      StdTestEpRet |        1.61e+03 |
|      MaxTestEpRet |       -4.45e+03 |
|      MinTestEpRet |        -9.3e+03 |
|             EpLen |             110 |
|         TestEpLen |             110 |
| TotalEnvInteracts |        1.65e+03 |
|      AverageQVals |            -129 |
|          StdQVals |            67.5 |
|          MaxQVals |            2.15 |
|          MinQVals |            -356 |
|            LossPi |             119 |
|             LossQ |        1.64e+03 |
|              Time |            5.41 |
---------------------------------------
---------------------------------------
|             Epoch |               2 |
|      AverageEpRet |       -7.51e+03 |
|          StdEpRet |        1.02e+03 |


---------------------------------------
|             Epoch |              11 |
|      AverageEpRet |       -1.37e+03 |
|          StdEpRet |             832 |
|          MaxEpRet |            -180 |
|          MinEpRet |       -3.06e+03 |
|  AverageTestEpRet |       -1.44e+03 |
|      StdTestEpRet |        1.17e+03 |
|      MaxTestEpRet |            -150 |
|      MinTestEpRet |       -3.95e+03 |
|             EpLen |             110 |
|         TestEpLen |             110 |
| TotalEnvInteracts |        1.81e+04 |
|      AverageQVals |       -2.26e+03 |
|          StdQVals |             734 |
|          MaxQVals |            -699 |
|          MinQVals |       -4.75e+03 |
|            LossPi |        2.24e+03 |
|             LossQ |         6.4e+03 |
|              Time |            95.4 |
---------------------------------------
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |       -1.43e+03 |
|          StdEpRet |        1.12e+03 |


---------------------------------------
|             Epoch |              21 |
|      AverageEpRet |       -1.36e+03 |
|          StdEpRet |             963 |
|          MaxEpRet |           -42.6 |
|          MinEpRet |       -4.25e+03 |
|  AverageTestEpRet |            -851 |
|      StdTestEpRet |             832 |
|      MaxTestEpRet |            -104 |
|      MinTestEpRet |       -3.07e+03 |
|             EpLen |             110 |
|         TestEpLen |             110 |
| TotalEnvInteracts |        3.46e+04 |
|      AverageQVals |       -2.22e+03 |
|          StdQVals |             740 |
|          MaxQVals |            -814 |
|          MinQVals |       -5.35e+03 |
|            LossPi |         2.2e+03 |
|             LossQ |        6.53e+03 |
|              Time |             174 |
---------------------------------------
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |       -1.12e+03 |
|          StdEpRet |        1.29e+03 |


---------------------------------------
|             Epoch |              31 |
|      AverageEpRet |       -1.34e+03 |
|          StdEpRet |        1.15e+03 |
|          MaxEpRet |            -232 |
|          MinEpRet |       -3.91e+03 |
|  AverageTestEpRet |       -1.01e+03 |
|      StdTestEpRet |             641 |
|      MaxTestEpRet |            -102 |
|      MinTestEpRet |       -2.25e+03 |
|             EpLen |             110 |
|         TestEpLen |             110 |
| TotalEnvInteracts |        5.11e+04 |
|      AverageQVals |       -1.77e+03 |
|          StdQVals |             660 |
|          MaxQVals |            -538 |
|          MinQVals |          -5e+03 |
|            LossPi |        1.76e+03 |
|             LossQ |        5.68e+03 |
|              Time |             253 |
---------------------------------------
---------------------------------------
|             Epoch |              32 |
|      AverageEpRet |            -748 |
|          StdEpRet |             365 |


---------------------------------------
|             Epoch |              41 |
|      AverageEpRet |            -655 |
|          StdEpRet |             392 |
|          MaxEpRet |            -103 |
|          MinEpRet |       -1.69e+03 |
|  AverageTestEpRet |            -557 |
|      StdTestEpRet |             256 |
|      MaxTestEpRet |           -62.3 |
|      MinTestEpRet |            -872 |
|             EpLen |             110 |
|         TestEpLen |             110 |
| TotalEnvInteracts |        6.76e+04 |
|      AverageQVals |       -1.29e+03 |
|          StdQVals |             517 |
|          MaxQVals |            -300 |
|          MinQVals |       -3.95e+03 |
|            LossPi |        1.28e+03 |
|             LossQ |        4.06e+03 |
|              Time |             333 |
---------------------------------------
---------------------------------------
|             Epoch |              42 |
|      AverageEpRet |            -407 |
|          StdEpRet |             297 |


---------------------------------------
|             Epoch |              51 |
|      AverageEpRet |            -848 |
|          StdEpRet |             676 |
|          MaxEpRet |           -44.6 |
|          MinEpRet |       -2.53e+03 |
|  AverageTestEpRet |       -1.01e+03 |
|      StdTestEpRet |             668 |
|      MaxTestEpRet |            -299 |
|      MinTestEpRet |       -2.59e+03 |
|             EpLen |             110 |
|         TestEpLen |             110 |
| TotalEnvInteracts |        8.41e+04 |
|      AverageQVals |            -964 |
|          StdQVals |             446 |
|          MaxQVals |            -156 |
|          MinQVals |       -3.37e+03 |
|            LossPi |             954 |
|             LossQ |        3.34e+03 |
|              Time |             411 |
---------------------------------------
---------------------------------------
|             Epoch |              52 |
|      AverageEpRet |            -531 |
|          StdEpRet |             589 |


---------------------------------------
|             Epoch |              61 |
|      AverageEpRet |            -880 |
|          StdEpRet |             685 |
|          MaxEpRet |            -284 |
|          MinEpRet |       -2.84e+03 |
|  AverageTestEpRet |            -482 |
|      StdTestEpRet |             363 |
|      MaxTestEpRet |            -158 |
|      MinTestEpRet |       -1.31e+03 |
|             EpLen |             110 |
|         TestEpLen |             110 |
| TotalEnvInteracts |        1.01e+05 |
|      AverageQVals |            -678 |
|          StdQVals |             397 |
|          MaxQVals |           -30.2 |
|          MinQVals |       -2.89e+03 |
|            LossPi |             669 |
|             LossQ |        2.51e+03 |
|              Time |             491 |
---------------------------------------
---------------------------------------
|             Epoch |              62 |
|      AverageEpRet |            -634 |
|          StdEpRet |             503 |


---------------------------------------
|             Epoch |              71 |
|      AverageEpRet |            -491 |
|          StdEpRet |             286 |
|          MaxEpRet |           -27.4 |
|          MinEpRet |       -1.13e+03 |
|  AverageTestEpRet |            -648 |
|      StdTestEpRet |             352 |
|      MaxTestEpRet |           -75.1 |
|      MinTestEpRet |       -1.28e+03 |
|             EpLen |             110 |
|         TestEpLen |             110 |
| TotalEnvInteracts |        1.17e+05 |
|      AverageQVals |            -433 |
|          StdQVals |             348 |
|          MaxQVals |            88.9 |
|          MinQVals |       -2.42e+03 |
|            LossPi |             423 |
|             LossQ |        2.02e+03 |
|              Time |             570 |
---------------------------------------
---------------------------------------
|             Epoch |              72 |
|      AverageEpRet |            -503 |
|          StdEpRet |             263 |


---------------------------------------
|             Epoch |              81 |
|      AverageEpRet |            -654 |
|          StdEpRet |             250 |
|          MaxEpRet |            -202 |
|          MinEpRet |       -1.04e+03 |
|  AverageTestEpRet |            -642 |
|      StdTestEpRet |             287 |
|      MaxTestEpRet |            -170 |
|      MinTestEpRet |       -1.13e+03 |
|             EpLen |             110 |
|         TestEpLen |             110 |
| TotalEnvInteracts |        1.34e+05 |
|      AverageQVals |            -191 |
|          StdQVals |             290 |
|          MaxQVals |             297 |
|          MinQVals |        -2.1e+03 |
|            LossPi |             181 |
|             LossQ |        1.52e+03 |
|              Time |             648 |
---------------------------------------
---------------------------------------
|             Epoch |              82 |
|      AverageEpRet |            -634 |
|          StdEpRet |             424 |


---------------------------------------
|             Epoch |              91 |
|      AverageEpRet |            -823 |
|          StdEpRet |             335 |
|          MaxEpRet |            -296 |
|          MinEpRet |        -1.4e+03 |
|  AverageTestEpRet |            -740 |
|      StdTestEpRet |             330 |
|      MaxTestEpRet |            -236 |
|      MinTestEpRet |       -1.24e+03 |
|             EpLen |             110 |
|         TestEpLen |             110 |
| TotalEnvInteracts |         1.5e+05 |
|      AverageQVals |            37.3 |
|          StdQVals |             253 |
|          MaxQVals |             474 |
|          MinQVals |       -1.49e+03 |
|            LossPi |           -47.9 |
|             LossQ |        1.12e+03 |
|              Time |             727 |
---------------------------------------
---------------------------------------
|             Epoch |              92 |
|      AverageEpRet |            -723 |
|          StdEpRet |             352 |


#### Normalized reward

In [94]:
# Setup baseline 0
logger_kwargs = dict(output_dir='ddpg_discontinuous_normalized', exp_name='Discontinuity and normalized reward function')
seed_b = 0
epochs_b = 100
maxeplen_b = 110

spe_b = maxeplen_b * 15
repsize_b = 1000000
gamma_b = 0.995
polyak_b = 0.995
batchsize_b = 100
startsteps_b = 10000
args_b = dict(hidden_sizes=[400,], activation=torch.nn.ReLU)
actnoise_b = 0.1
pilr_b = 0.001
qlr_b = 0.001

# Env function
reward_args = {'k': 0.2}
env_fn = partial(env_fn,env_name = 'gyroscopediscontinuousenv-v0',reward_type = 'Normalized', reward_args = reward_args)

# Baseline 0 training
spinup.ddpg_pytorch(env_fn, ac_kwargs = args_b, seed = seed_b, steps_per_epoch = spe_b, epochs = epochs_b, replay_size = repsize_b, gamma = gamma_b,
polyak = polyak_b, batch_size = batchsize_b, start_steps = startsteps_b, max_ep_len = maxeplen_b,logger_kwargs = logger_kwargs, act_noise = actnoise_b, pi_lr = pilr_b, q_lr = qlr_b)

[32;1mLogging data to ddpg_discontinuous_normalized/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            400
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"functools.partial(functools.partial(functools.partial(<function env_fn at 0x7fda02cd5ea0>, env_name='gyroscopediscontinuousenv-v0', reward_type='Quadratic', reward_args={'qx1': 9, 'qx2': 0.05, 'qx3': 9, 'qx4': 0.05, 'pu1': 0.1, 'pu2': 0.1}), env_name='gyroscopediscontinuousenv-v0', reward_type='Normalized', reward_args={'k': 0.2}), env_name='gyroscopediscontinuousenv-v0', reward_type='Normalized', reward_args={'k': 0.2})",
    "epochs":	100,
    "exp_name":	"Discontinuity and normalized reward function",
    "gamma":	0.995,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7fd9cbc705c0>":	{
            "epoch_dict":	{},
            "exp_name":	"Discontinuity and normal



KeyboardInterrupt: 

## Test

In [3]:
# Test paramaters
env_name = 'gyroscopediscontinuousenv-v0'
reward_type = 'Quadratic'
reward_args = {'qx1':1,'qx2':0,'qx3':1,'qx4':0,'pu1':0,'pu2':0} # reward for mean squared error
seed = 2
agent_path = 'ddpg_discontinuous_normalized/pyt_save/model.pt'
init_state = [0,0,0,0,-np.pi,0,200*2*np.pi/60] 
t_end = 100*0.05

# Get Q-values (spinup implementation of ddpg core has been changed for that purpose by adding a method "crit"!)
obs_pi = [np.pi*0.95,-0.000005,0,0,-np.pi,0,20]
obs_minuspi = [-np.pi*0.95,-0.000005,0,0,-np.pi,0,20]
act_plus = [10,0] # positive voltage on u1 (for theta) to make it turn towards pi
act_minus = [0,0] # negative voltage on u1 (for theta) to make it turn towards -pi
agent = torch.load(agent_path)

Q_pi_plus = agent.crit(torch.as_tensor(obs_pi, dtype=torch.float32),torch.as_tensor(act_plus, dtype=torch.float32))
Q_pi_minus = agent.crit(torch.as_tensor(obs_pi, dtype=torch.float32),torch.as_tensor(act_minus, dtype=torch.float32))
print("Q-value for [u1,u2] = {} at state s = {}: {}".format(act_plus,obs_pi,Q_pi_plus))
print("Q-value for [u1,u2] = {} at state s = {}: {}".format(act_minus,obs_pi,Q_pi_minus))

Q_minuspi_plus = agent.crit(torch.as_tensor(obs_minuspi, dtype=torch.float32),torch.as_tensor(act_plus, dtype=torch.float32))
Q_minuspi_minus = agent.crit(torch.as_tensor(obs_minuspi, dtype=torch.float32),torch.as_tensor(act_minus, dtype=torch.float32))
print("Q-value for [u1,u2] = {} at state s = {}: {}".format(act_plus,obs_minuspi,Q_minuspi_plus))
print("Q-value for [u1,u2] = {} at state s = {}: {}".format(act_minus,obs_minuspi,Q_minuspi_minus))


# Perform testing
agent_path = 'ddpg_discontinuous_normalized'
r,score,x1_eval,x2_eval,x3_eval,x4_eval,x1_ref_eval,x3_ref_eval,act = test_agent(env_name,reward_type,reward_args,seed,agent_path,init_state,t_end,ep_len=110)

Q-value for [u1,u2] = [10, 0] at state s = [2.9845130209103035, -5e-06, 0, 0, -3.141592653589793, 0, 20]: -56.37056350708008
Q-value for [u1,u2] = [0, 0] at state s = [2.9845130209103035, -5e-06, 0, 0, -3.141592653589793, 0, 20]: -56.78080368041992
Q-value for [u1,u2] = [10, 0] at state s = [-2.9845130209103035, -5e-06, 0, 0, -3.141592653589793, 0, 20]: -54.60612869262695
Q-value for [u1,u2] = [0, 0] at state s = [-2.9845130209103035, -5e-06, 0, 0, -3.141592653589793, 0, 20]: -53.63178634643555
Total cumulative reward: -58.04309932253662





## Plot

In [14]:
time = np.linspace(0, t_end, len(x1_eval))

plt.plot(figsize=(15,22))
plt.title(r'',fontsize=20, y=1.05)
plt.xlabel('Time [s]',fontsize=16)
plt.ylabel('Angular position [°]',fontsize=16)
plt.ylim([-200,200])
plt.xlim([0,5])
plt.grid()


# RL Agent
plt.plot(time,180*(x1_eval)/np.pi,'r',label=r'$\theta$')

# Reference
plt.plot(time, 180*x1_ref_eval/np.pi, color='black',linestyle='dashed',label=r'Reference on $\theta$ at $\pm 180°$')
plt.plot(time, -180*x1_ref_eval/np.pi, color='black',linestyle='dashed')

plt.legend()
plt.savefig('discont.pgf')