# Implementation of the Proximal Policy Optimization (PPO) algorithm - Paper: https://arxiv.org/abs/1707.06347

### Note: This implementation is based on the work of GitHub user 'higgsfield'. <br>See: https://github.com/higgsfield/RL-Adventure-2/blob/master/3.ppo.ipynb. <br> Pseudocode can be found at: https://spinningup.openai.com/en/latest/algorithms/ppo.html. <br> Instead of separate actor and critic models, a single ActorCritic module containing both networks is used.

In [None]:
import time
import os
import math
import random
import gym
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

<h2> Define Hyperparameters </h2>

In [None]:
HORIZON = 512 # Horizon (T): number of environment steps collected per update
ADAM_LEARNING_RATE = 3e-4 # Adam learning rate
NUM_EPOCHS = 10 # Number of optimisation epochs per update
MINI_BATCH_SIZE = 64 # Minibatch size
GAMMA_DISCOUNT = 0.99 # Discount factor (γ)
LAMBDA_GAE = 0.95 # GAE parameter (λ)
C1_COEFFICIENT = 0.5 # C1 coefficient (c1 in the PPO paper: weight of the value-function loss)
C2_COEFFICIENT = 0.001 # C2 coefficient (c2 in the PPO paper: weight of the entropy bonus)
CLIPPING = 0.2 # Clipping range (ε) of the probability ratio

<h2> Other required parameters </h2>

In [None]:
# Use the GPU if CUDA is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"For the neural network: {DEVICE} is used.")
# Sizes of the two hidden layers of the neural network
HIDDEN_SIZE_1 = 128
HIDDEN_SIZE_2 = 64
# Total number of environment steps after which the algorithm stops
MAX_STEPS = 15000000000000
# Path of the file the loss is logged to while the algorithm runs.
# The file is opened in append mode, so its content should be deleted before
# every run; otherwise it still contains the entries of the previous run.
LOSS_LOG_PATH = "./_logs/loss/loss.csv"
# Directory in which the model parameters are saved while the algorithm runs.
MODEL_PATH = "./_logs/models/"
# Directory in which the rewards are saved while the algorithm runs.
REWARDS_PATH = "./_logs/rewards/"

<h2>Initialize environment</h2>
If an error occurs, change into the gym_turtlebot3_wrapper directory and run <code>pip install -e .</code> there. <br>
For more information read: <a href="https://medium.com/@apoddar573/making-your-own-custom-environment-in-gym-c3b65ff8cdaa">medium.com</a>

In [None]:
# Create the TurtleBot3 environment provided by the gym_turtlebot3 package
env = gym.make('gym_turtlebot3.envs:turtlebot3-v0')

<h2> Define NN for PPO-Clip-Algorithm </h2>

In [None]:
def init_weights(m):
    """Initialise a linear layer in place; leave any other module untouched.

    Meant to be passed to ``nn.Module.apply``. For ``nn.Linear`` modules the
    weights are drawn from a normal distribution (mean 0, std 0.1) and every
    bias entry is set to 0.1.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0., std=0.1)
    nn.init.constant_(m.bias, 0.1)

In [None]:
class ActorCritic(nn.Module):
    """Actor-critic model bundling a policy network and a value network.

    Both networks share the same layout for their two hidden layers
    (Linear -> LayerNorm -> Tanh, twice) but have their own parameters.
    The critic ends in a single linear output; the actor ends in a linear
    layer followed by Tanh and provides the mean of a Normal distribution.
    The standard deviation is a learnable parameter stored in log space and
    does not depend on the input.

    Args:
        num_inputs: size of the observation vector.
        num_outputs: size of the action vector.
        hidden_size1: width of the first hidden layer.
        hidden_size2: width of the second hidden layer.
        std: initial value of ``log_std`` (i.e. the *log* of the standard
            deviation, despite the parameter name).
    """

    def __init__(self, num_inputs, num_outputs, hidden_size1=HIDDEN_SIZE_1, hidden_size2=HIDDEN_SIZE_2, std=0.0):
        super().__init__()

        def hidden_stack():
            # Fresh (unshared) hidden layers; called once per network.
            return [
                nn.Linear(num_inputs, hidden_size1),
                nn.LayerNorm(hidden_size1),
                nn.Tanh(),
                nn.Linear(hidden_size1, hidden_size2),
                nn.LayerNorm(hidden_size2),
                nn.Tanh(),
            ]

        # Value network: one scalar estimate per input row.
        self.critic = nn.Sequential(
            *hidden_stack(),
            nn.Linear(hidden_size2, 1),
        )

        # Policy network: mean of the action distribution, bounded by Tanh.
        self.actor = nn.Sequential(
            *hidden_stack(),
            nn.Linear(hidden_size2, num_outputs),
            nn.Tanh(),
        )

        initial_log_std = torch.ones(1, num_outputs) * std
        self.log_std = nn.Parameter(initial_log_std)

        # Re-initialise every linear layer of both networks.
        self.apply(init_weights)

    def forward(self, x):
        """Return ``(dist, value)`` for the observation batch ``x``.

        ``dist`` is a ``Normal`` whose mean comes from the actor and whose
        standard deviation is ``exp(log_std)`` broadcast to the mean's shape;
        ``value`` is the critic output.
        """
        value = self.critic(x)
        mean = self.actor(x)
        scale = torch.exp(self.log_std).expand_as(mean)
        return Normal(mean, scale), value

<h2>Generalized Advantage Estimation (GAE)</h2>
<h2><a href="https://arxiv.org/abs/1506.02438">Arxiv</a></h2>

In [None]:
def compute_gae(next_value, rewards, masks, values, gamma=GAMMA_DISCOUNT, tau=LAMBDA_GAE):
    """Compute returns with Generalized Advantage Estimation (GAE).

    Args:
        next_value: value estimate of the state that follows the last step;
            used to bootstrap the final step.
        rewards: sequence with one reward per step.
        masks: sequence with one entry per step that is multiplied into the
            bootstrap and accumulation terms (0 cuts the recursion at that
            step, 1 lets it continue).
        values: list with one value estimate per step. Not modified.
        gamma: discount factor.
        tau: GAE smoothing parameter (lambda).

    Returns:
        A list with one entry per step, in chronological order, each being
        the GAE advantage of that step plus its value estimate.
    """
    # Concatenation creates a new list, so the caller's `values` stays untouched.
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.append(gae + values[step])
    # The loop runs back to front; reversing once avoids the quadratic cost of
    # inserting at index 0 on every step.
    returns.reverse()
    return returns

<h1> Proximal Policy Optimization Algorithm</h1>
<h2><a href="https://arxiv.org/abs/1707.06347">Arxiv</a></h2>

In [None]:
def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    """Yield shuffled, non-overlapping minibatches of one rollout.

    The row indices are shuffled once per call and then consumed in disjoint
    slices, so within one pass no sample is drawn twice. Exactly
    ``batch_size // mini_batch_size`` minibatches are produced; if the batch
    size is not a multiple of ``mini_batch_size`` the leftover samples of this
    pass are skipped (a different subset on every call because of the
    reshuffle). Nothing is yielded when the batch is smaller than one
    minibatch.

    Args:
        mini_batch_size: number of rows per minibatch.
        states, actions, log_probs, returns, advantage: 2-D tensors with the
            same number of rows (one row per collected step).

    Yields:
        Tuples ``(states, actions, log_probs, returns, advantage)`` holding
        the same rows of every input tensor.
    """
    batch_size = states.size(0)
    shuffled_ids = np.random.permutation(batch_size)
    for batch_idx in range(batch_size // mini_batch_size):
        start = batch_idx * mini_batch_size
        ids = shuffled_ids[start:start + mini_batch_size]
        yield states[ids, :], actions[ids, :], log_probs[ids, :], returns[ids, :], advantage[ids, :]

In [None]:
def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=CLIPPING):
    """Optimise the module-level ``model`` on one rollout with PPO-Clip.

    For every epoch the rollout is split into minibatches by ``ppo_iter``.
    Each minibatch performs one step of the module-level ``optimizer`` and
    appends one line ``actor_loss, critic_loss, total_loss`` to
    ``LOSS_LOG_PATH``.

    Args:
        ppo_epochs: number of passes over the rollout.
        mini_batch_size: number of samples per minibatch.
        states: tensor of observations, one row per step.
        actions: tensor of the actions taken, one row per step.
        log_probs: log-probabilities of ``actions`` under the policy that
            collected the rollout.
        returns: return targets for the critic.
        advantages: advantage estimates; normalised per minibatch.
        clip_param: clipping range of the probability ratio.
    """
    # Create the log directory if needed so the first run does not fail on open().
    log_dir = os.path.dirname(LOSS_LOG_PATH)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Open the log once per update instead of once per minibatch.
    with open(LOSS_LOG_PATH, 'a') as loss_log:
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
                # Normalize advantage within the minibatch
                advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-10)
                # Get distribution and value from the current policy
                dist, value = model(state)
                # Mean entropy of the action distribution (exploration bonus)
                entropy = dist.entropy().mean()
                # Log-probability of the stored actions under the current policy
                new_log_probs = dist.log_prob(action)
                # Probability ratio between the current and the collecting policy
                ratio = (new_log_probs - old_log_probs).exp()
                # Unclipped and clipped surrogate objectives
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
                # Actor and critic loss
                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()
                # Total loss, weighted with the configured coefficients
                loss = C1_COEFFICIENT * critic_loss + actor_loss - C2_COEFFICIENT * entropy

                # Log plain numbers; formatting the tensors themselves would
                # write their repr (including grad_fn/device) into the file.
                loss_log.write(f'{actor_loss.item()}, {critic_loss.item()}, {loss.item()}\n')

                # Update neural network
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

<h2> Create neural network </h2>

In [None]:
# Get input shape for neural network (number of observations)
# Get input size for the neural network (number of observations)
num_inputs  = env.observation_space.shape[0]
# Get output size for the neural network (number of actions)
num_outputs = env.action_space.shape[0]

# Build the actor-critic model on the selected device and its Adam optimizer
model = ActorCritic(num_inputs, num_outputs).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=ADAM_LEARNING_RATE)

<h2> Run PPO-Clip-Algorithm </h2>

In [None]:
frame_idx  = 0

# Lists for logging
steps_so_far = []
rewards_so_far = []
df = pd.DataFrame(columns = ["step","reward"])

# Make sure the output directories exist before the first files are written
os.makedirs(MODEL_PATH, exist_ok=True)
os.makedirs(REWARDS_PATH, exist_ok=True)

# Run algorithm
state = np.expand_dims(env.reset(), axis=0)

while frame_idx < MAX_STEPS:
    log_probs = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    masks     = []
    entropy = 0

    # The rollout only collects data and everything is detached before the
    # update, so no autograd graph has to be built here.
    with torch.no_grad():
        for _ in range(HORIZON):
            # Get state
            state = torch.FloatTensor(state).to(DEVICE)
            # Get distribution and value
            dist, value = model(state)
            # Sample action
            action = dist.sample()
            # Do step
            next_state, reward, done, info = env.step(action.cpu().numpy())
            # NOTE(review): env.reset() is never called when `done` is True;
            # this relies on the environment resetting itself - confirm.
            # Add a leading batch dimension to done flag and next state
            done = np.expand_dims(np.array(done), axis=0)
            next_state = np.expand_dims(next_state, axis=0)
            # Get log_prob of action
            log_prob = dist.log_prob(action)
            # Accumulate entropy of the distribution (not used by the update)
            entropy += dist.entropy().mean()
            # Save information about the step
            log_probs.append(log_prob)
            values.append(value)
            reward = np.array([reward])
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(DEVICE))
            # Mask is 0 where the episode ended at this step, 1 otherwise
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(DEVICE))
            states.append(state)
            actions.append(action)

            state = next_state
            frame_idx += 1

            # Append number of steps so far and rewards so far for logging
            steps_so_far.append(len(steps_so_far) + 1)
            rewards_so_far.append(reward[0])

    # Set linear and angular velocity to zero while updating the network.
    # Otherwise the Turtlebot3 keeps driving during the update, which can cause errors in the simulation
    env.step(np.array([[0.0, 0.0]]))
    # Bootstrap value of the state that follows the last collected step
    with torch.no_grad():
        next_state = torch.FloatTensor(next_state).to(DEVICE)
        _, next_value = model(next_state)
    # Calculate return with GAE
    returns = compute_gae(next_value, rewards, masks, values)

    returns   = torch.cat(returns).detach()
    log_probs = torch.cat(log_probs).detach()
    values    = torch.cat(values).detach()
    states    = torch.cat(states)
    actions   = torch.cat(actions)
    # Calculate advantage
    advantage = returns - values
    # Update neural network
    ppo_update(NUM_EPOCHS, MINI_BATCH_SIZE, states, actions, log_probs, returns, advantage)

    # One timestamp per iteration so the log line, model file and rewards file match
    timestamp = datetime.now().strftime('%d-%m-%Y %H:%M:%S')
    print(f"Updating network: {timestamp} Steps so far: {len(steps_so_far)}")
    # Save model
    model_path = os.path.join(MODEL_PATH, f"model_{len(steps_so_far)}_{timestamp}")
    torch.save(model.state_dict(),model_path)
    # Save rewards
    # NOTE(review): the complete reward history is exported again on every
    # iteration; an .xlsx sheet is limited to 1,048,576 rows, so this export
    # is expected to fail on long runs - consider CSV or per-iteration files.
    df = pd.DataFrame(list(zip(steps_so_far, rewards_so_far)),columns =['step', 'reward'])
    rewards_path = os.path.join(REWARDS_PATH,f"{len(steps_so_far)}_{timestamp}.xlsx")
    df.to_excel(rewards_path)