In [1]:
import numpy as np
import math
import random
import gym
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
import time

<h2>Use CUDA</h2>

In [2]:
# Pick the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU. The selection is printed so it shows up in the cell output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(device, torch.cuda.get_device_name(0))
else:
    print(device)

cuda GeForce RTX 2060


<h2>Create Environments</h2>

In [3]:
# Create the Gym environment registered under the TurtleBot3 id below.
# It is shared by every later cell (observation/action sizes and the rollout loop).
env = gym.make('gym_turtlebot3.envs:turtlebot3-v0')

config: {'model': 'waffle_pi', 'worlds': {'turtlebot3_world': {'target': {'x': 2.0, 'y': 0.0, 'z': 0.0}, 'number_of_lasers': 18, 'default': {'position': {'x': -2.0, 'y': -0.5, 'z': 0.0}, 'quaternion_orientation': {'x': 2.0, 'y': 0.0, 'z': 5.0, 'w': 1.0}}}}}




# Define the Actor-Critic Network for PPO

In [None]:
def init_weights(m):
    """Initialise an ``nn.Linear`` layer in place; leave every other module alone.

    Meant to be passed to ``nn.Module.apply`` so it is called once per
    sub-module.

    Args:
        m: Any ``nn.Module``. Only instances of ``nn.Linear`` are modified.

    Effects:
        * ``m.weight`` is drawn from a normal distribution (mean 0, std 0.1).
        * ``m.bias`` is set to the constant 0.1, if the layer has a bias.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0., std=0.1)
    # A layer built with ``bias=False`` has ``m.bias is None``; skip it
    # instead of crashing inside ``nn.init.constant_``.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.1)

In [None]:
class ActorCritic(nn.Module):
    """Actor and critic MLPs for PPO with a Gaussian policy.

    Both networks are separate stacks of
    ``Linear -> LayerNorm -> Tanh -> Linear -> LayerNorm -> Tanh -> Linear``.
    The critic ends in a single output; the actor ends in ``num_outputs``
    units followed by a final ``Tanh``, so the policy mean lies in [-1, 1].

    Args:
        num_inputs: Size of the observation vector fed to both networks.
        num_outputs: Number of action dimensions.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        std: Initial value of every entry of the learnable ``log_std``
            parameter (the default 0.0 gives an initial std of exp(0) = 1).
    """

    def __init__(self, num_inputs, num_outputs, hidden_size1=128, hidden_size2=64, std=0.0):
        super().__init__()

        def trunk(out_features):
            # Fresh layers for one network; the list order fixes the
            # Sequential indices and therefore the state_dict keys.
            return [
                nn.Linear(num_inputs, hidden_size1),
                nn.LayerNorm(hidden_size1),
                nn.Tanh(),
                nn.Linear(hidden_size1, hidden_size2),
                nn.LayerNorm(hidden_size2),
                nn.Tanh(),
                nn.Linear(hidden_size2, out_features),
            ]

        # The critic is built before the actor, as in the original layout.
        self.critic = nn.Sequential(*trunk(1))
        self.actor = nn.Sequential(*trunk(num_outputs), nn.Tanh())

        # State-independent log standard deviation, one entry per action dim.
        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)

        # Re-initialise every Linear layer through the notebook's init_weights.
        self.apply(init_weights)

    def forward(self, x):
        """Return ``(dist, value)`` for a batch of observations ``x``.

        ``dist`` is a ``Normal`` whose mean is the actor output and whose std
        is ``exp(log_std)`` broadcast to the mean's shape; ``value`` is the
        critic output.
        """
        value = self.critic(x)
        mean = self.actor(x)
        scale = self.log_std.exp().expand_as(mean)
        return Normal(mean, scale), value

<h2>Generalized Advantage Estimation (GAE)</h2>

In [None]:
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    """Compute per-step returns with Generalized Advantage Estimation.

    Walks the rollout backwards, accumulating
    ``gae_t = delta_t + gamma * tau * mask_t * gae_{t+1}`` with
    ``delta_t = r_t + gamma * V_{t+1} * mask_t - V_t``, and returns
    ``gae_t + V_t`` for every step.

    Args:
        next_value: Value estimate for the state after the last step; used
            as the bootstrap ``V_{T}``.
        rewards: Per-step rewards.
        masks: Per-step multipliers; a 0 entry cuts both the bootstrap and
            the accumulated advantage at that step.
        values: Per-step value estimates, same length as ``rewards``.
        gamma: Discount factor.
        tau: GAE smoothing factor.

    Returns:
        A list with one return per step, in rollout order. An empty
        ``rewards`` yields an empty list.
    """
    # Concatenate into a new list so the caller's ``values`` is not extended.
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        # Append (O(1)) and reverse once at the end instead of insert(0, ...),
        # which shifts the whole list on every step.
        returns.append(gae + values[step])
    returns.reverse()
    return returns

<h1> Proximal Policy Optimization Algorithm</h1>
<h2><a href="https://arxiv.org/abs/1707.06347">Arxiv</a></h2>

In [None]:
def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    """Yield shuffled mini-batches covering the rollout once.

    Args:
        mini_batch_size: Number of rows per mini-batch.
        states, actions, log_probs, returns, advantage: Tensors that share
            their first dimension (one row per collected step) and are
            indexed as ``tensor[ids, :]``.

    Yields:
        ``(states, actions, log_probs, returns, advantage)`` tuples holding
        the same row selection from every input tensor.
        ``batch_size // mini_batch_size`` tuples are produced; rows left
        over when the batch size is not a multiple of ``mini_batch_size``
        are skipped for this pass.
    """
    batch_size = states.size(0)
    # Draw one permutation per pass so that no row is repeated within an
    # epoch. Independent random draws (with replacement) would revisit some
    # transitions and never show others to the optimiser.
    permutation = np.random.permutation(batch_size)
    for batch_idx in range(batch_size // mini_batch_size):
        start = batch_idx * mini_batch_size
        rand_ids = permutation[start:start + mini_batch_size]
        yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

In [None]:
def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2):
    """Run the PPO clipped-surrogate update on one collected rollout.

    Uses the module-level ``model`` and ``optimizer`` defined elsewhere in
    the notebook, and draws mini-batches through ``ppo_iter``.

    Args:
        ppo_epochs: Number of passes over the rollout.
        mini_batch_size: Rows per mini-batch (forwarded to ``ppo_iter``).
        states, actions, log_probs, returns, advantages: Rollout tensors
            forwarded to ``ppo_iter``; ``log_probs`` are the log-probabilities
            recorded when the actions were sampled.
        clip_param: The probability ratio is clamped to
            ``[1 - clip_param, 1 + clip_param]``.

    Side effects:
        * One optimiser step per mini-batch.
        * One line ``actor_loss, critic_loss, loss`` per mini-batch is
          appended to ``loss.csv`` in the working directory.
    """
    # Open the log once per update rather than once per mini-batch.
    with open('loss.csv', 'a') as f:
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
                # Per-mini-batch advantage normalisation (inserted later; may be
                # removed again). The small constant avoids division by zero.
                advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-10)

                dist, value = model(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                # NOTE(review): Normal.log_prob is element-wise, so with several
                # action dimensions the ratio is formed per dimension rather than
                # for the joint action. Confirm this is intended.
                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

                # Pessimistic (minimum) surrogate, negated for gradient descent.
                actor_loss = - torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                # Combined objective: value loss, policy loss, small entropy bonus.
                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

                f.write(f'{actor_loss}, {critic_loss}, {loss}\n')

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

In [None]:
# Network input/output sizes come straight from the environment spaces.
num_inputs  = env.observation_space.shape[0]
num_outputs = env.action_space.shape[0]

#Hyper params:
# NOTE: hidden_size is not passed to ActorCritic below, so the network uses
# its own defaults (128 / 64).
hidden_size      = 256
lr               = 3e-4
num_steps        = 512   # environment steps collected per policy update
mini_batch_size  = 64
ppo_epochs       = 10
threshold_reward = -200  # not read anywhere in this cell

model = ActorCritic(num_inputs, num_outputs).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

# Resume from a saved checkpoint. map_location makes the load work on the CPU
# fallback as well when the checkpoint was written on a CUDA machine.
# torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load('model_100864_26-08-2021 07:36:01', map_location=device))
# Training continues after loading, so keep the module in training mode.
# (The layers used by ActorCritic -- Linear, LayerNorm, Tanh -- behave the same
# in both modes, so this only guards against future dropout/batch-norm layers.)
model.train()

In [None]:
# Upper bound on environment steps and the running step counter.
max_frames, frame_idx = 15_000_000_000_000, 0
# Container for evaluation rewards; starts empty.
test_rewards = list()

In [None]:
# Main PPO loop: collect num_steps transitions, compute GAE returns, update the
# networks, then checkpoint the model and dump the reward log.
# A leading batch axis of size 1 is added to every observation.
state = np.expand_dims(env.reset(), axis=0)
early_stop = False

# Global step index and reward of every environment step so far.
steps_so_far = []
rewards_so_far = []
df = pd.DataFrame(columns = ["step","reward"])

ac_loss = []
cr_loss = []
co_loss = []

# NOTE: periodic evaluation is disabled, so early_stop is never set to True and
# the loop runs until frame_idx reaches max_frames.
while frame_idx < max_frames and not early_stop:
    log_probs = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    masks     = []
    entropy = 0

    # The rollout only records numbers for the later update, so no autograd
    # graph is needed here; ppo_update recomputes everything that needs grads.
    with torch.no_grad():
        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, info = env.step(action.cpu().numpy())
            episode_over = bool(done)
            done = np.expand_dims(np.array(done), axis=0)

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            log_probs.append(log_prob)
            values.append(value)
            reward = np.array([reward])
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            # mask == 0 on terminal steps so GAE does not bootstrap across episodes.
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            states.append(state)
            actions.append(action)

            # The Gym API requires reset() once an episode has ended; start the
            # next episode from the fresh observation.
            if episode_over:
                next_state = env.reset()
            next_state = np.expand_dims(next_state, axis=0)

            state = next_state
            frame_idx += 1

            # Append number of steps so far
            steps_so_far.append(len(steps_so_far) + 1)
            rewards_so_far.append(reward[0])

        # Send a zero action before the (slow) network update.
        # NOTE(review): the observation returned by this extra step is discarded,
        # so the next rollout starts from the state seen before it. Confirm that
        # this is acceptable for the environment.
        env.step(np.array([[0.0, 0.0]]))
        # Bootstrap value for the state following the last collected step.
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)

    returns = compute_gae(next_value, rewards, masks, values)

    returns   = torch.cat(returns).detach()
    log_probs = torch.cat(log_probs).detach()
    values    = torch.cat(values).detach()
    states    = torch.cat(states)
    actions   = torch.cat(actions)
    advantage = returns - values

    ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)

    # One timestamp per update so the checkpoint and its log share a suffix.
    timestamp = datetime.now().strftime('%d-%m-%Y %H:%M:%S')
    print(f"Updating network: {timestamp}")
    # Save model
    torch.save(model.state_dict(), f"model_{len(steps_so_far)}_{timestamp}")
    # NOTE(review): the full history is rewritten on every update and an .xlsx
    # sheet is limited to 1,048,576 rows; very long runs will eventually fail
    # here. Consider CSV/parquet or per-update chunks.
    df = pd.DataFrame({'step': steps_so_far, 'reward': rewards_so_far})
    df.to_excel(f"versuch_1/log/{len(steps_so_far)}_{timestamp}.xlsx")