# First RL Gym Project

## 1. Imports

In [1]:
import torch as tr
import torch.nn as nn
from torch.optim.lr_scheduler import MultiStepLR
import matplotlib.pyplot as plt
import numpy as np
from environments import BoxEnvironment1 as env
from environment_utils import Box
from agents import SACAgent
from agent_utils import update_target_agent, ReplayBuffer
from log_utils import RLLogger
from plot_utils import RLPlotter, plot_normalized_mexican_hat_potential

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = tr.device('cuda' if tr.cuda.is_available() else 'cpu')
# Anomaly detection makes a failing backward pass report the forward op that
# produced the bad value. It slows every autograd call, so switch it off once
# the update step is trusted.
tr.autograd.set_detect_anomaly(True)
# Make float32 the default dtype of newly created floating-point tensors.
# `set_default_dtype` replaces `set_default_tensor_type`, which is deprecated
# since PyTorch 2.1.
tr.set_default_dtype(tr.float32)

## 2. Hyperparameters

In [2]:
# ================= Training =================
# Replay memory: capacity handed to ReplayBuffer, and the number of
# transitions sampled from it for each agent update.
memory_size = 64
memory_batch_size = 16

# Length of training. `episode()` only starts updating the agent once the
# step counter exceeds `memory_size`, so those warm-up steps are added on top
# of the 128 steps per episode.
runs = 1
n_episodes = 20
n_steps = memory_size + 128

# Optimisation settings (Adam learning rate, entropy weight in the target).
agent_batch_size = 1
learning_rate = 1e-3
learning_rate_decay = 0.8
entropy_coef = 0.2

# Discount applied to the next-step value in the Bellman target.
future_discount = 0.8

# Number of environment steps between target-agent updates.
target_model_update = 16

# Criterion used to compare the Q prediction with the Bellman target.
loss_function = nn.MSELoss()

# ================= Environment =================
# Size of the box the environment is built on.
env_width, env_height = 2, 2
space = Box(env_width, env_height)

# Goal box: size plus one (x, y) centre row per agent in the batch.
goal_width, goal_height = 0.2, 0.2
goal_center = np.tile([0.5, 0], (agent_batch_size, 1))
goal = Box(goal_width, goal_height, goal_center)

# Time step size, passed to `environment.step` and `environment.reward`.
dt = 0.0375

# Noise characteristic length, passed to `environment.step`.
noise_characteristic_length = 1

# Maximum of the potential, passed to `environment.step`.
U0 = 0.4

# ================= Agent =================
# Constructor arguments for SACAgent; state/action sizes are also used by
# the ReplayBuffer.
state_dim, act_dim = 5, 1
hidden_dims = [16] * 2
# Forwarded to SACAgent unchanged. Presumably they restrict the action to a
# positive range scaled by 2*pi -- confirm in SACAgent.
act_positive = True
act_scaling = np.pi * 2

# ================= Other =================
# Matplotlib defaults applied to every figure drawn in this notebook.
plt.rcParams.update({'font.size': 13, 'figure.dpi': 150})
# Empty timing lists; nothing in the visible cells appends to them.
total_time, update_state_time = [], []

## 3. Simulation

In [3]:
# World and replay memory.
environment = env(space, goal)
memory = ReplayBuffer(state_dim, act_dim, memory_size, agent_batch_size)

# The online agent and the target agent are built from the same arguments.
agent_args = (state_dim, act_dim, hidden_dims, act_scaling, act_positive)
agent = SACAgent(*agent_args).float().to(device)
target_agent = SACAgent(*agent_args).float().to(device)

# Logging and plotting helpers; the plotter reads from the logger.
logger = RLLogger()
plotter = RLPlotter(logger)

# One Adam optimizer per network of the online agent, stored on the agent as
# `<network>_optimizer`.
for network_name in ('actor', 'critic1', 'critic2'):
    network = getattr(agent, network_name)
    setattr(agent, f'{network_name}_optimizer',
            tr.optim.Adam(network.parameters(), lr=learning_rate))

# The target agent is never trained by gradient descent, so exclude its
# parameters from autograd.
for frozen_param in target_agent.parameters():
    frozen_param.requires_grad = False

In [4]:
def update(agent, target_agent, memory_batch):
    """Run one Soft Actor-Critic gradient step on a sampled batch.

    The critics are regressed onto a fixed Bellman target, then the actor is
    updated with the entropy-regularised policy loss.

    Args:
        agent: Online agent exposing `actor`, `critic1`, `critic2` and the
            matching `*_optimizer` attributes.
        target_agent: Agent whose critics provide the bootstrapped value.
        memory_batch: Mapping with keys 'state_now', 'state_next',
            'action_now', 'reward' and 'done'.

    Returns:
        The critic loss as a detached tensor (callers use `.item()` on it).
    """
    state_now = memory_batch['state_now']
    state_next = memory_batch['state_next']
    action_now = memory_batch['action_now']
    reward = memory_batch['reward']
    done = memory_batch['done']

    # ---------------- Critic update ----------------
    # The Bellman target is a regression label, so it is built without
    # autograd. Otherwise the critic loss would also push gradients into the
    # actor through `action_next` / `log_prob_next`.
    with tr.no_grad():
        action_next, log_prob_next = agent.actor(state_next)
        Q_next = tr.min(
            target_agent.critic1(state_next, action_next),
            target_agent.critic2(state_next, action_next),
        )
        # Terminal transitions must not bootstrap from the next state.
        # Assumes `done` has the same shape as `reward` -- confirm against
        # ReplayBuffer.sample_batch.
        not_done = 1.0 - tr.as_tensor(done, dtype=Q_next.dtype, device=Q_next.device)
        target = reward + future_discount*not_done*(Q_next - entropy_coef*log_prob_next)

    # Each critic is fitted to the target on its own. Regressing
    # min(Q1, Q2) would only train the critic with the smaller value.
    Q1_now = agent.critic1(state_now, action_now)
    Q2_now = agent.critic2(state_now, action_now)
    critic_loss = loss_function(Q1_now, target) + loss_function(Q2_now, target)

    # Clear gradients left over from the previous call (including those the
    # actor loss below deposits on the critics) before back-propagating.
    agent.critic1_optimizer.zero_grad()
    agent.critic2_optimizer.zero_grad()
    critic_loss.backward()
    agent.critic1_optimizer.step()
    agent.critic2_optimizer.step()

    # ---------------- Actor update ----------------
    # Policy loss: prefer actions with a high (pessimistic) Q value while
    # keeping the entropy bonus weighted by `entropy_coef`.
    action_pi, log_prob_pi = agent.actor(state_now)
    Q_pi = tr.min(
        agent.critic1(state_now, action_pi),
        agent.critic2(state_now, action_pi),
    )
    actor_loss = (entropy_coef*log_prob_pi - Q_pi).mean()

    agent.actor_optimizer.zero_grad()
    actor_loss.backward()
    agent.actor_optimizer.step()

    return critic_loss.detach()

In [5]:
def episode():
    """Run one episode of at most `n_steps` steps and train the agent online.

    Each step logs the state, picks an action with the current actor,
    advances the environment, stores the transition in replay memory and,
    once the step counter exceeds `memory_size`, performs one agent update.
    The episode stops early when any agent in the batch reaches the goal.

    Returns:
        The index of the last step that was executed.
    """
    environment.init_state(agent_batch_size)
    for current_step in range(n_steps):
        logger.save_state(environment.state)
        if current_step%target_model_update == 0 and current_step > memory_size:
            update_target_agent(agent, target_agent)

        # Snapshot the state before stepping. If `environment.step` updates
        # its state array in place (not visible from this notebook), a bare
        # reference would end up equal to `state_next`.
        # Assumes `environment.state` is a NumPy array -- confirm.
        state_now = np.copy(environment.state)

        # Action selection needs no gradients. Running it under no_grad also
        # keeps the autograd graph out of the replay memory.
        with tr.no_grad():
            action_now, _ = agent.actor(tr.as_tensor(state_now, device=device, dtype=tr.float))
        action_array = action_now.cpu().numpy()

        # Advance the environment and collect the transition.
        environment.step(action_array, U0, dt, noise_characteristic_length)
        state_next = environment.state
        reward = environment.reward(dt)
        done = environment.goal_check()
        memory.store(state_now, action_now, reward, state_next, done)

        logger.save_action(action_array)

        # Learn from a random batch once the warm-up steps are over.
        if current_step > memory_size:
            memory_batch = memory.sample_batch(memory_batch_size)
            loss = update(agent, target_agent, memory_batch)
            logger.save_loss(loss.item())

        # Reuse the goal check from above instead of evaluating it twice.
        if max(done):
            print('Goal reached')
            print(f'Enviroment State: {environment.state}')
            logger.save_state(environment.state)
            break

    return current_step

In [6]:
def simulation():
    """Train for `n_episodes` episodes, logging and plotting each one."""
    # Bring the target agent in line with the online agent before training
    # (the exact rule lives in `update_target_agent`).
    update_target_agent(agent, target_agent)
    for episode_index in range(n_episodes):
        # `episode()` returns the index of its last step; record it.
        logger.save_episode(episode())
        plotter.plot_last_episode()
        print('Episode', episode_index,' finished!')

# Clear the plotter (see `RLPlotter.clear_plots`) and then run the whole
# training loop. Order matters: `simulation()` plots after every episode.
plotter.clear_plots()
simulation()

Episode 0  finished!


KeyboardInterrupt: 

<Figure size 960x720 with 0 Axes>

## 4. Plotting


In [None]:
# Rebind `plotter` to a new RLPlotter that wraps the same `logger`.
plotter = RLPlotter(logger)

std_x = 0.2, std_y = 0.3
std_x = 0.1, std_y = 0.4
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.3
std_x = 0.1, std_y = 0.3
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.7
std_x = 0.1, std_y = 0.6
std_x = 0.1, std_y = 0.3
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.3
std_x = 0.1, std_y = 0.3
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.3
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.3
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.4
std_x = 0.1, std_y = 0.3
std_x = 0.1, std_y = 0.3
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.3
std_x = 0.1, std_y = 0.2
std_x = 0.1, std_y = 0.2
std_x = 0.4, std_y = 0.5
std_x = 0.4, std_y = 0.2
std_x = 0.5, std_y = 0.2
std_x = 0.4, std_y = 0.2
std_x = 0.5, std_y = 0.2
std_x = 0.4, std_y = 0.3
std_x = 0.4, std_y = 0.2
std_x = 0.4, std_y = 0.2
std_x = 0.4, std_y = 0.2
std_x = 0.3, std_y = 0.2


<Figure size 960x720 with 0 Axes>