# First RL Gym Project

## 1. Imports

In [1]:
import torch as tr
import torch.nn as nn
from torch.optim.lr_scheduler import MultiStepLR
import matplotlib.pyplot as plt
import numpy as np
from environments import BoxEnvironment1 as env
from environment_utils import Box
from agents import SACAgent
from agent_utils import update_target_agent, ReplayBuffer
from log_utils import RLLogger
from plot_utils import RLPlotter, plot_normalized_mexican_hat_potential

device = tr.device('cuda' if tr.cuda.is_available() else 'cpu')
tr.autograd.set_detect_anomaly(True)
tr.set_default_tensor_type(tr.FloatTensor)

## 2. Hyperparameters

In [2]:
# ---------------- Training -------------------
    # Memory
memory_size = 3000
memory_batch_size = 512
    # Duration of training
runs = 1
n_episodes = 50
n_steps = 256
    # Training parameters
agent_batch_size = 128
learning_rate = 0.005
learning_rate_decay = 0.8
entropy_coef = 0.5
    # Bellman equation
future_discount = 0.8
    # Update Target Model
target_model_update = 32
    # Loss Function
loss_function = nn.MSELoss()

# ---------------- Environment  ----------------
    # Environment box size
env_width = 2
env_height = 2
space = Box(env_width, env_height)
    # Goal box size and center
goal_width = 0.3
goal_height = 0.3
goal_center = np.tile([0.5,0],(agent_batch_size,1))
goal = Box(goal_width, goal_height, goal_center)
    # Time step size
dt = 0.04
    # Noise
noise_characteristic_length = 10
    # Maximum of potential
U0 = 0.5

# ---------------- Agent ----------------------
state_dim = 4
hidden_dims = [16,16,16]
act_dim = 1
act_positive = True
act_scaling = 2*np.pi

# ---------------- Other ----------------------
plt.rcParams.update({'font.size': 13})
plt.rcParams.update({'figure.dpi': 150})
total_time = []
update_state_time = []

## 3. Simulation

In [3]:
environment = env(space, goal)
memory = ReplayBuffer(state_dim, act_dim, memory_size, agent_batch_size)
agent = SACAgent(state_dim, act_dim, hidden_dims, act_scaling, act_positive).float().to(device)
target_agent = SACAgent(state_dim, act_dim, hidden_dims, act_scaling, act_positive).float().to(device)
logger = RLLogger()
plotter = RLPlotter(logger, goal)

agent.actor_optimizer = tr.optim.Adam(agent.actor.parameters(), lr=learning_rate)
agent.critic1_optimizer = tr.optim.Adam(agent.critic1.parameters(), lr=learning_rate)
agent.critic2_optimizer = tr.optim.Adam(agent.critic2.parameters(), lr=learning_rate)

for p in target_agent.parameters():
    p.requires_grad = False

In [4]:
def update(agent, target_agent, memory_batch):

    state_now = memory_batch['state_now'].reshape(-1, state_dim)
    state_next = memory_batch['state_next'].reshape(-1, state_dim)
    action_now = memory_batch['action_now'].reshape(-1, act_dim)
    reward = memory_batch['reward'].reshape(-1)
    done = memory_batch['done'].reshape(-1)
    action_next, log_prob_next = agent.actor(state_next)
    
    # Compute Prediction
    Q1_now = agent.critic1(state_now, action_now)
    Q2_now = agent.critic2(state_now, action_now)

    # Compute Target
    Q1_next = target_agent.critic1(state_next, action_next)
    Q2_next = target_agent.critic2(state_next, action_next)
    Q_next = tr.min(Q1_next, Q2_next)
    target = reward + future_discount*(1-done)*(Q_next - entropy_coef*log_prob_next)
    print(f'Target: {target[0]}')
    print(f'Reward: {reward[0]}')
    print(f'Done: {done[0]}')
    print(f'Q_next: {Q_next[0]}')
    print(f'Q1_next: {Q1_next[0]}')
    print(f'Q2_next: {Q2_next[0]}')
    print(f'Future_discount*Q_next: {future_discount*Q_next[0]*(1-done[0])}')
    print(f'Entropy: {future_discount*(1-done[0])*entropy_coef*log_prob_next[0]}')
    print('-----------------')
    # Compute Loss
    loss = loss_function(Q1_now, target) + loss_function(Q2_now, target)
    target = target.detach()
    # Update
    loss.backward()
    agent.critic1_optimizer.step()
    agent.critic2_optimizer.step()
    agent.actor_optimizer.step()

    return loss

In [5]:
def episode():    
    environment.init_state(agent_batch_size, state_dim)
    for current_step in range(n_steps):
        # Log state
        logger.save_state(environment.state)
        if current_step%target_model_update == 0 and current_step > memory_size:
            update_target_agent(agent, target_agent)
        # Beginning state
        state_now = environment.state
        # Action
        action_now, _ = agent.actor(tr.as_tensor(environment.state, device=device, dtype=tr.float))
        # Next state
        reward = environment.step(action_now.detach().cpu().numpy(), U0, dt, noise_characteristic_length)
        state_next = environment.state
        # Done
        done = environment.goal_check()
        # Log action
        logger.save_action(action_now.detach().cpu().numpy())

        loss = 0
        # Sample from memory
        if memory.size >= memory_batch_size:
            memory_batch = memory.sample_batch(memory_batch_size)
            # Update Agent
            loss = update(agent, target_agent, memory_batch).item()
            logger.save_loss(loss)
        
        # Store in memory
        memory.store(state_now, action_now, reward, state_next, loss, done)

        if max(environment.goal_check()): 
            print('Goal reached')
            logger.save_state(environment.state)
            break
        
    return current_step

In [6]:
def simulation():
    update_target_agent(agent, target_agent)
    for ep in range(n_episodes):
        episode_steps = episode()
        logger.save_episode(episode_steps)
        plotter.plot_last_episode()
        print('Episode', ep,' finished!')

plotter.clear_plots()
simulation()

Goal reached
Episode 0  finished!
Goal reached
Episode 1  finished!
q: tensor([-0.0026], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([-0.0003], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([-0.0030], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([-0.0005], device='cuda:0', grad_fn=<SelectBackward0>)
Target: 10.192426681518555
Reward: 9.960000038146973
Done: 0.0
Q_next: -0.002987565705552697
Q1_next: -0.002987565705552697
Q2_next: -0.000498102162964642
Future_discount*Q_next: -0.002390052657574415
Entropy: -0.23481665551662445
-----------------
q: tensor([-0.0018], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([0.0521], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([-0.0029], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([-0.0005], device='cuda:0', grad_fn=<SelectBackward0>)
Target: 10.203585624694824
Reward: 9.960000038146973
Done: 0.0
Q_next: -0.002934227464720607
Q1_next: -0.002934227464720607
Q2_next: -0.0004988206783309579
Future_d

  fig = plt.figure()


Episode 19  finished!
q: tensor([-28.3538], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([-5.7353], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([-0.0022], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([0.0341], device='cuda:0', grad_fn=<SelectBackward0>)
Target: -24.297101974487305
Reward: -0.03999999910593033
Done: 0.0
Q_next: -0.002194654429331422
Q1_next: -0.002194654429331422
Q2_next: 0.034078508615493774
Future_discount*Q_next: -0.0017557236133143306
Entropy: 24.255346298217773
-----------------
q: tensor([-29.1999], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([-18.2691], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([-0.0022], device='cuda:0', grad_fn=<SelectBackward0>)
q: tensor([0.0328], device='cuda:0', grad_fn=<SelectBackward0>)
Target: -23.801069259643555
Reward: -0.03999999910593033
Done: 0.0
Q_next: -0.0022250704932957888
Q1_next: -0.0022250704932957888
Q2_next: 0.03279666602611542
Future_discount*Q_next: -0.001780056394636631
E

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>

<Figure size 960x720 with 0 Axes>