# Concentration Gradient Project

## 1. Imports

In [1]:
import torch as tr
import torch.nn as nn
from torch.optim.lr_scheduler import MultiStepLR
import matplotlib.pyplot as plt
import numpy as np
from environments import BoxEnvironment1 as env
from environment_utils import Box, Circle2D
from agents import SACAgent
from agent_utils import update_target_agent, ReplayBuffer
from log_utils import RLLogger
from plot_utils import RLPlotter, plot_normalized_concentration, make_animation

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = tr.device('cuda' if tr.cuda.is_available() else 'cpu')
# Anomaly detection makes autograd report the forward op behind a failing backward.
# It is a debugging aid and slows training noticeably; switch it off for long runs.
tr.autograd.set_detect_anomaly(True)
# Newly created floating-point tensors default to float32.
# (torch.set_default_tensor_type is deprecated; set_default_dtype is its replacement.)
tr.set_default_dtype(tr.float32)

## 2. Hyperparameters

In [37]:
# ---------------- Training -------------------
# Replay memory: memory_size is handed to ReplayBuffer (presumably its capacity — confirm);
# memory_batch_size is the number of samples drawn per gradient update and also the
# minimum buffer fill level before any update is performed.
memory_size = 7000
memory_batch_size = 512
# Duration of training: n_episodes training episodes of n_steps environment steps each.
# NOTE(review): `runs` is not referenced in any visible cell.
runs = 1
n_episodes = 50
n_steps = 256
# Training parameters
# agent_batch_size is passed to the environment and used to tile the goal centre
# (presumably the number of agents simulated in parallel — confirm).
agent_batch_size = 128
learning_rate_actor = 0.001
learning_rate_critic = 0.0001
# NOTE(review): np.arange(0, n_episodes, n_episodes) evaluates to array([0]), so the
# only MultiStepLR milestone is epoch 0 — confirm this is the intended decay schedule.
milestones = np.arange(0, n_episodes, n_episodes)
# Multiplicative LR factor (gamma) given to MultiStepLR. The name is misspelled
# ("learing") but is referenced under this spelling by the scheduler setup.
learing_rate_decay = 0.9

# Weight of the log-probability (entropy) term in the actor and critic losses;
# it is multiplied by entropy_coef_decay after every training episode that reaches the goal.
entropy_coef = 0.01
entropy_coef_decay = 0.95
# Bellman equation: discount applied to the bootstrapped next-state value.
future_discount = 0.99
# Target model: the target agent is considered for an update every this many steps.
target_model_update = 5
# Loss function used for both critics.
loss_function = nn.MSELoss()

# ---------------- Environment  ----------------
# Environment box size
env_width = 2
env_height = 2
space = Box(env_width, env_height)
# Goal: a circle of this radius (its centre is chosen inside the episode functions).

goal_radius = 0.1
# Legacy box-shaped goal, kept for reference (goal_width / goal_height are not defined here):
# goal_center = np.tile([0.5,0],(agent_batch_size,1))
# goal = Box(goal_width, goal_height, goal_center)
# Time step size passed to environment.step
dt = 0.05
# Noise parameter passed to environment.step
noise_characteristic_length = 3
# Maximum of potential (passed to init_env and environment.step)
c0 = 2

# ---------------- Agent ----------------------
# Constructor arguments for SACAgent: input/output sizes, hidden layer widths and
# action post-processing flags (exact semantics live in agents.SACAgent).
state_dim = 3
hidden_dims = [32,32,32]
act_dim = 1
act_positive = True
act_scaling = 2*np.pi

# ---------------- Other ----------------------
# Global matplotlib appearance for every figure produced by the notebook.
plt.rcParams.update({'font.size': 13})
plt.rcParams.update({'figure.dpi': 150})
# NOTE(review): these two lists are not used in any visible cell.
total_time = []
update_state_time = []

## 3. Simulation

In [38]:
def _new_agent():
    """Build a SACAgent from the notebook hyperparameters, as float32 on `device`."""
    return SACAgent(state_dim, act_dim, hidden_dims, act_scaling, act_positive).float().to(device)


def _adam_for(module, lr):
    """Create an Adam optimizer over all parameters of `module`."""
    return tr.optim.Adam(module.parameters(), lr=lr)


def _lr_schedule(optimizer):
    """Attach the shared MultiStepLR decay schedule to `optimizer`."""
    return MultiStepLR(optimizer, milestones=milestones, gamma=learing_rate_decay)


# World, replay memory and the online / target networks.
environment = env(space)
memory = ReplayBuffer(state_dim, act_dim, memory_size, agent_batch_size)
agent = _new_agent()
target_agent = _new_agent()

# Separate logger / plotter pairs for training rollouts and deterministic test rollouts.
logger = RLLogger()
plotter = RLPlotter(logger, 'logs')
testLogger = RLLogger()
testPlotter = RLPlotter(testLogger, 'test_logs', test=True)

# One optimizer per network; both critics share the critic learning rate.
agent.actor_optimizer = _adam_for(agent.actor, learning_rate_actor)
agent.critic1_optimizer = _adam_for(agent.critic1, learning_rate_critic)
agent.critic2_optimizer = _adam_for(agent.critic2, learning_rate_critic)

# One scheduler per optimizer, all with the same milestones and decay factor.
scheduler_actor = _lr_schedule(agent.actor_optimizer)
scheduler_critic1 = _lr_schedule(agent.critic1_optimizer)
scheduler_critic2 = _lr_schedule(agent.critic2_optimizer)

# The target agent is never trained by backprop, so it needs no gradients.
for p in target_agent.parameters():
    p.requires_grad_(False)

In [39]:
def _set_requires_grad(module, flag):
    """Switch gradient tracking on or off for every parameter of `module`."""
    for p in module.parameters():
        p.requires_grad = flag


def update(agent, target_agent, memory_batch):
    """Perform one critic update followed by one actor update.

    Critics: both online critics are regressed (with `loss_function`) onto a shared
    target built from the reward, the minimum of the two *target* critics at the next
    state and the entropy term `entropy_coef * log_prob`.
    Actor: minimises `entropy_coef * log_prob - min(Q1, Q2)` with the online critics
    temporarily frozen so the actor step does not accumulate critic gradients.

    Args:
        agent: online agent exposing `actor`, `critic1`, `critic2` and the three
            matching `*_optimizer` attributes.
        target_agent: agent whose `critic1` / `critic2` provide the bootstrap values.
        memory_batch: mapping with keys 'state_now', 'state_next', 'action_now',
            'reward' and 'done'.

    Returns:
        Tuple `(loss_critic, loss_actor)` of loss tensors.
    """
    agent.critic1_optimizer.zero_grad()
    agent.critic2_optimizer.zero_grad()

    state_now = memory_batch['state_now'].reshape(-1, state_dim)
    state_next = memory_batch['state_next'].reshape(-1, state_dim)
    action_now = memory_batch['action_now'].reshape(-1, act_dim)
    reward = memory_batch['reward'].reshape(-1)
    done = memory_batch['done'].reshape(-1)

    # Compute Prediction
    Q1_now_critic = agent.critic1(state_now, action_now)
    Q2_now_critic = agent.critic2(state_now, action_now)

    # Compute Target (no gradients flow through the bootstrap value)
    with tr.no_grad():
        action_next_critic, log_prob_next_critic = agent.actor(state_next)

        Q1_next_critic = target_agent.critic1(state_next, action_next_critic)
        Q2_next_critic = target_agent.critic2(state_next, action_next_critic)
        Q_next_critic = tr.min(Q1_next_critic, Q2_next_critic)
        # NOTE(review): reward and done are flattened to 1-D above. If the critics or
        # the actor's log-prob come back as (N, 1), this expression broadcasts to
        # (N, N) — confirm the output shapes of SACAgent's networks.
        target_critic = reward + future_discount*(1-done)*(Q_next_critic - entropy_coef*log_prob_next_critic)

    # Compute Loss
    loss_critic = loss_function(Q1_now_critic, target_critic) + loss_function(Q2_now_critic, target_critic)

    # Update critics
    loss_critic.backward()
    agent.critic1_optimizer.step()
    agent.critic2_optimizer.step()

    # Update actor with the critics frozen
    agent.actor_optimizer.zero_grad()
    _set_requires_grad(agent.critic1, False)
    _set_requires_grad(agent.critic2, False)
    try:
        action_now_actor, log_prob_now_actor = agent.actor(state_now)
        Q1_now_actor = agent.critic1(state_now, action_now_actor)
        Q2_now_actor = agent.critic2(state_now, action_now_actor)
        Q_now_actor = tr.min(Q1_now_actor, Q2_now_actor)
        loss_actor = (entropy_coef*log_prob_now_actor - Q_now_actor).mean()
        loss_actor.backward()
        agent.actor_optimizer.step()
    finally:
        # Always unfreeze: if the actor step raises (e.g. under anomaly detection),
        # the critics must not stay frozen in the live kernel, or the next call's
        # critic backward would fail.
        _set_requires_grad(agent.critic1, True)
        _set_requires_grad(agent.critic2, True)

    return loss_critic, loss_actor

In [40]:
def episode():
    """Run one training episode of `n_steps` steps and learn from replayed batches.

    Each step: log the state, possibly refresh the target agent, sample an action
    from the actor, advance the environment, run one `update` once the replay
    memory holds at least `memory_batch_size` entries, and store the transition.

    Returns:
        Tuple `(current_step, goal_bool)`: the index of the last executed step and
        whether `environment.goal_check()` reported the goal on any step.
    """
    # Goal centre is fixed at (0.5, 0) and replicated once per agent in the batch.
    # (Random goals via space.sample() are currently disabled.)
    sample = np.array([0.5,0])
    goal_center = np.tile(sample,(agent_batch_size,1))
    goal = Circle2D(goal_radius, goal_center)

    environment.init_env(agent_batch_size, state_dim, goal, c0, random_start = False)
    plotter.update_goal(goal)
    goal_bool = False
    for current_step in range(n_steps):
        # Log state
        logger.save_state(environment.state)

        # Fix: the guard used to be `current_step > memory_size`, which can never hold
        # because the step index stays below n_steps while memory_size is far larger,
        # so the target agent was never refreshed. Gate on the replay memory being
        # trainable instead, matching the condition used for the gradient update below.
        if current_step % target_model_update == 0 and memory.size >= memory_batch_size:
            update_target_agent(agent, target_agent)

        # Beginning state
        # NOTE(review): if environment.step mutates its state array in place, state_now
        # and state_next alias the same data — confirm step() rebinds environment.state.
        state_now = environment.state

        # Action: rollout only, so no autograd graph is needed for the actor forward.
        with tr.no_grad():
            action_now, _ = agent.actor(tr.as_tensor(environment.state, device=device, dtype=tr.float))

        # Next state
        reward = environment.step(action_now.detach().cpu().numpy(), c0, dt, noise_characteristic_length)
        state_next = environment.state
        # Done
        done = environment.goal_check()
        # Log action
        logger.save_action(action_now.detach().cpu().numpy())

        # Placeholder forwarded to memory.store; it is never changed from 0 here.
        loss = 0
        # Learn from replay once enough transitions are available
        if memory.size >= memory_batch_size:
            memory_batch = memory.sample_batch(memory_batch_size)

            # Update Agent
            loss_critic, loss_actor = update(agent, target_agent, memory_batch)
            logger.save_loss_critic(loss_critic.item())
            logger.save_loss_actor(loss_actor.item())

        # Store in memory: only the last state column is kept, as a column vector.
        # NOTE(review): the buffer is created with state_dim columns and `update`
        # reshapes states to (-1, state_dim) — confirm this is consistent.
        memory.store((state_now[:,-1])[:,None], action_now, reward, (state_next[:,-1])[:,None], loss, done)

        # Remember whether any agent reached the goal; the episode is not cut short.
        if max(environment.goal_check()):
            goal_bool = True

    return current_step, goal_bool

In [41]:
def test_episode():
    """Roll out a single agent with deterministic actions and log the trajectory.

    The environment is re-initialised with a batch of one and the goal fixed at
    (0.5, 0). No learning takes place. States and actions go to `testLogger`.

    Returns:
        Index of the last executed step.
    """
    # Fixed goal centre for a batch of one (random sampling via space.sample() is disabled).
    target_xy = np.array([0.5,0])
    goal = Circle2D(goal_radius, np.tile(target_xy,(1,1)))

    environment.init_env(1, state_dim, goal, c0, random_start = False)
    testPlotter.update_goal(goal)

    # Initial state goes into the log before any action is taken.
    testLogger.save_state(environment.state)

    for step_idx in range(n_steps):
        # Deterministic action for the current state
        observation = tr.as_tensor(environment.state, device=device, dtype=tr.float)
        chosen_action = agent.act(observation, deterministic=True)
        environment.step(chosen_action, c0, dt, noise_characteristic_length, test = True)

        # Record the action and the state it led to
        testLogger.save_action(chosen_action)
        testLogger.save_state(environment.state)

    return step_idx

In [42]:
def simulation():
    """Train for `n_episodes` episodes, running one test rollout after each.

    After every training episode the episode is logged and plotted, a test episode
    is run, logged and plotted, and, once the replay memory holds more than
    `memory_batch_size` entries, all three learning-rate schedulers are stepped.
    Whenever a training episode reaches the goal, the global `entropy_coef` is
    multiplied by `entropy_coef_decay` and printed.
    """
    global entropy_coef

    # Synchronise the target agent with the online agent before training starts.
    update_target_agent(agent, target_agent)

    lr_schedulers = (scheduler_actor, scheduler_critic1, scheduler_critic2)

    for ep in range(n_episodes):
        # Training rollout
        last_step, reached_goal = episode()
        if reached_goal:
            print('Goal reached!')
            entropy_coef *= entropy_coef_decay
            print(entropy_coef)

        logger.save_episode(last_step)
        plotter.plot_last_episode()

        # Deterministic evaluation rollout
        last_test_step = test_episode()
        testLogger.save_episode(last_test_step)
        testPlotter.plot_last_episode()
        print('Episode', ep,' finished!')

        # Advance the learning-rate schedules once the memory is past the batch size.
        if memory.size > memory_batch_size:
            for lr_scheduler in lr_schedulers:
                lr_scheduler.step()
        
# Reset the plot output of both plotters for their log folders, then start training.
# NOTE(review): clear_plots lives in plot_utils; presumably it removes figures left
# by earlier runs — confirm before pointing it at a folder with results to keep.
plotter.clear_plots('logs')
testPlotter.clear_plots('test_logs')

# Runs the whole training loop; progress is printed once per episode.
simulation()

Goal reached!
0.0095
Episode 0  finished!
Goal reached!
0.009025
Episode 1  finished!
Goal reached!
0.00857375
Episode 2  finished!
Goal reached!
0.0081450625
Episode 3  finished!
Goal reached!
0.007737809374999999
Episode 4  finished!
Goal reached!
0.007350918906249998
Episode 5  finished!
Goal reached!
0.006983372960937498
Episode 6  finished!
Goal reached!
0.006634204312890623
Episode 7  finished!
Goal reached!
0.006302494097246091
Episode 8  finished!
Goal reached!
0.005987369392383786
Episode 9  finished!
Goal reached!
0.005688000922764597
Episode 10  finished!
Goal reached!
0.005403600876626367
Episode 11  finished!
Goal reached!
0.005133420832795048
Episode 12  finished!
Goal reached!
0.0048767497911552955
Episode 13  finished!
Goal reached!
0.00463291230159753
Episode 14  finished!
Goal reached!
0.0044012666865176535
Episode 15  finished!
Goal reached!
0.004181203352191771
Episode 16  finished!
Goal reached!
0.003972143184582182
Episode 17  finished!
Goal reached!
0.00377353602

## 4. Animation

In [2]:
# Build one video per log folder from the saved episode-path plots
# (the captured output shows an `episode_paths_animation.mp4` written in each folder).
# NOTE(review): the meaning of the second argument (3) is defined in
# plot_utils.make_animation and not visible here — confirm and give it a named constant.
make_animation('logs/episode_paths',3)
make_animation('test_logs/episode_paths',3)

Moviepy - Building video logs/episode_paths_animation.mp4.
Moviepy - Writing video logs/episode_paths_animation.mp4



                                                            

Moviepy - Done !
Moviepy - video ready logs/episode_paths_animation.mp4
Moviepy - Building video test_logs/episode_paths_animation.mp4.
Moviepy - Writing video test_logs/episode_paths_animation.mp4



                                                            

Moviepy - Done !
Moviepy - video ready test_logs/episode_paths_animation.mp4
