In [277]:
from argparse import ArgumentParser
from collections import deque

import gymnasium as gym
import numpy as np
# import wandb

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import LinearLR
from torch.distributions.categorical import Categorical
import torch.nn.functional as F
import torch.optim as optim

# import pytorch_lightning as pl

In [278]:
MODEL_PATH = 'ppomodel.pth'

In [279]:
def get_device():
    """Gets the device (GPU if any) and logs the type"""
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Found GPU device: {torch.cuda.get_device_name(device)}")
    else:
        device = torch.device("cpu")
        print("No GPU found: Running on CPU")
    return device

In [280]:
device = get_device()
device

No GPU found: Running on CPU


device(type='cpu')

In [281]:
env = gym.make('Pendulum-v1')

In [282]:
class PPO(nn.Module): # definie the policy network
    def __init__(self, state_size=4, action_size=2, hidden_size=32):
        """Initialize the policy network
        
        Args:
            state_size (int): Dimension of the state space
            action_size (int): Dimension of the action space
            hidden_size (int): Number of neurons in the hidden layers"""
        super(PPO, self).__init__()
        
        # Attributes
        self.state_size = state_size
        self.action_size = action_size
        self.hidden_size = hidden_size
                
        # Actor
        self.actor = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size),
            nn.Softmax(dim=-1)
        )
        
        # Value function
        self.critic = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )

    def forward(self, state):
        """Forward pass for both actor and critic
        
        Args:
            state (torch.Tensor): State tensor"""
        action = self.actor(state)
        value = self.critic(state)
        
        return action, value

    def act(self, state):
        """Take an action in the environment
        
        Args:
            state (np.array): Current state of the environment
            
        Returns:
            state (torch.Tensor): State tensor
            action (float): Action to take
            action_logtis (torch.Tensor): Logtis of the action
            value (torch.Tensor): Value of the state"""
        # Convert state to tensor for input of the network
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # get the action and the value
        action_logtis, value = self.forward(state)
        model = Categorical(action_logtis)
        action = model.sample()
        return state, action.item(), action_logtis, value
    

## Get interactions with envrionment

In [283]:
@torch.no_grad()  # no gradient computation
def run_timesteps(env: gym.Env, model: PPO, max_t=1000) -> tuple:
    """Runs the given policy (model) on the environment for max_t timesteps
    
    Args:
        env (gym.Env): Environment to run the model on
        model (PPO): Policy to run
        max_t (int): Maximum number of timesteps to run
    
    Returns:
        actions (list): Actions taken by the model
        values (list): Values of the states
        rewards (list): Rewards obtained
        action_logits (list): Logits of the actions taken by the model
    """
    state = env.reset()[0]
    states = torch.Tensor(max_t, state.shape[0])
    actions = torch.Tensor(max_t)
    values = torch.Tensor(max_t)
    rewards = torch.Tensor(max_t)
    action_logits = torch.Tensor(max_t)
    for t in range(max_t):
        # Get the action, value and logtis
        input_state, action, action_logit, value = model.act(state)
        print(f"input_state: {input_state}, action: {action}, action_logit: {action_logit}, value: {value}")
        next_state, reward, done, _, _ = env.step([action])
        states[t] = (input_state)
        rewards[t] = (reward)
        action_logits[t] = (action_logit)
        actions[t] = (action)
        values[t] = (value)
        state = next_state
        if done:
            break
    return states, actions, values, rewards, action_logits

## Compute the Loss

In [284]:
def get_losses(model: PPO, states: torch.Tensor, actions: torch.Tensor, action_logits: torch.Tensor, cumulative_rewards: torch.Tensor, values: torch.Tensor, epsilon: float, annealing: float) -> tuple:
    
    """Compute the losses for the PPO algorithm
    
    Args:
        model (PPO): Policy to train
        states (torch.Tensor): States of the environment
        actions (torch.Tensor): Actions taken
        action_logits (torch.Tensor): Logits of the actions taken
        cumulative_rewards (torch.Tensor): Cumulative rewards
        values (torch.Tensor): Values of the states
        epsilon (float): Clipping parameter
        annealing (float): Annealing parameter
        
    Returns:
        l_clip (torch.Tensor): Loss on the state-action-function
        l_vf (torch.Tensor): Loss on the value-function
        entropy_bonus (torch.Tensor): Bonus for the entropy of the actor"""
    
    # Computing predictions with the new model
    _, new_logits, new_values = model(states)

    # Loss on the state-action-function / actor (L_CLIP)
    advantages = cumulative_rewards - values
    margin = epsilon * annealing
    ratios = new_logits.gather(1, actions) / action_logits.gather(1, actions)

    l_clip = torch.mean(
        torch.min(
            torch.cat(
                (ratios * advantages,
                 torch.clip(ratios, 1 - margin, 1 + margin) * advantages),
                dim=1),
            dim=1
        ).values
    )

    # Loss on the value-function / critic (L_VF)
    l_vf = torch.mean((cumulative_rewards - new_values) ** 2)

    # Bonus for entropy of the actor
    entropy_bonus = torch.mean(torch.sum(-new_logits * (torch.log(new_logits + 1e-5)), dim=1))

    return l_clip, l_vf, entropy_bonus

# Training loop

In [285]:
def train(env: gym.Env, model: PPO, optimizer, epsilon, n_epochs, n_actors, batch_size, lr, c1, c2, render=False, early_stop=False, n_episodes=1000, max_t=1000, gamma=1.0, print_every=100):
    scores_deque = deque(maxlen=100)  # deque is a list-like container with fast appends and pops on either end
    scores = []
    anneals = np.linspace(1.0, 0.0, n_episodes)
    max_reward = -np.inf
    for iteration in range(1, n_episodes+1):
        print(f"Started iteration {iteration} / {n_episodes}")
        state = env.reset()
        
        buffer = []  # buffer = [[state], [action], [action_logit], [cumulative_rewards], [values
        for actor in range(n_actors):
            print(f"Running actor {actor + 1} / {n_actors}")
            # Collect trajectory for each actor
            state, action, values, rewards, action_logit = run_timesteps(env, model, max_t)
        
            # Calculate total expected reward
            scores_deque.append(sum(rewards))
            scores.append(sum(rewards))
            
            # Recalculate the total reward applying discounted factor
            discounts = [gamma ** i for i in range(len(rewards) + 1)]
            cum_discounted_rewards = [sum([discounts[j]*rewards[j+t] for j in range(len(rewards)-t) ]) for t in range(len(rewards))]

            # # compute Advantage
            # values = torch.cat(values)
            # advantages = cum_discounted_rewards - values
            
            buffer.append([state, action, action_logit, cum_discounted_rewards, values])
            print(f"Shape of buffer: {len(buffer[0])}")
            print(f"buffer: {buffer}")
        
        # Optimize loss over K epochs and minibatches
        for epoch in range(n_epochs):
            print(f"Optimizing epoch {epoch + 1} / {n_epochs}")
            for i in range(0, len(buffer)*max_t// batch_size):
                print(f"Optimizing minibatch {i + 1} / {len(buffer)// batch_size}")
                # Sample minibatch
                start = i * batch_size
                end = start + batch_size if start + batch_size < len(buffer) else len(buffer)
                # columns of buffer: state, action, action_logit, cum_discounted_rewards, values
                states = buffer[start:end][0]
                actions = buffer[start:end][1]
                action_logits = buffer[start:end][2]
                cumulative_rewards = buffer[start:end][3]
                values = buffer[start:end][4]
                               
                # Compute losses
                l_clip, l_vf, entropy_bonus = get_losses(model, states, actions, action_logits, cumulative_rewards, values, epsilon, anneals[iteration])
                
                # Optimize
                optimizer.zero_grad()
                loss = l_clip - c1*l_vf + c2*entropy_bonus
                loss.backward()
                optimizer.step()
        
        avg_rew = np.mean(scores_deque) 
        # Log scores
        if iteration % print_every == 0:
            curr_loss = loss.item()
            log = f"Iteration {iteration + 1} / {n_episodes}: " \
                    f"Average Reward: {avg_rew:.2f}\t" \
                    f"Loss: {curr_loss:.3f} " \
                    f"(L_CLIP: {l_clip.item():.1f} | L_VF: {l_vf.item():.1f} | L_bonus: {entropy_bonus.item():.1f})"
        
        if avg_rew > max_reward:
            torch.save(model.state_dict(), MODEL_PATH)
            max_reward = avg_rew
            log += " --> Stored model with highest average reward"
        if iteration % print_every == 0:
            print(log)
        
    return scores

In [286]:
model = PPO(state_size=env.observation_space.shape[0], action_size=env.action_space.shape[0]).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

scores = train(env, model, optimizer, epsilon=0.2, n_epochs=4, n_actors=4, batch_size=64, lr=1e-3, c1=1.0, c2=0.01, render=False, early_stop=False, n_episodes=1, max_t=1000, gamma=1.0, print_every=1)

Started iteration 1 / 1
Running actor 1 / 4
input_state: tensor([[ 0.8437, -0.5368, -0.2198]]), action: 0, action_logit: tensor([[1.]]), value: tensor([[0.2601]])
input_state: tensor([[ 0.8266, -0.5628, -0.6224]]), action: 0, action_logit: tensor([[1.]]), value: tensor([[0.3347]])
input_state: tensor([[ 0.7961, -0.6051, -1.0444]]), action: 0, action_logit: tensor([[1.]]), value: tensor([[0.4136]])
input_state: tensor([[ 0.7486, -0.6630, -1.4983]]), action: 0, action_logit: tensor([[1.]]), value: tensor([[0.4863]])
input_state: tensor([[ 0.6788, -0.7343, -1.9956]]), action: 0, action_logit: tensor([[1.]]), value: tensor([[0.5620]])
input_state: tensor([[ 0.5801, -0.8145, -2.5463]]), action: 0, action_logit: tensor([[1.]]), value: tensor([[0.6387]])
input_state: tensor([[ 0.4448, -0.8956, -3.1572]]), action: 0, action_logit: tensor([[1.]]), value: tensor([[0.6958]])
input_state: tensor([[ 0.2663, -0.9639, -3.8289]]), action: 0, action_logit: tensor([[1.]]), value: tensor([[0.7499]])
inpu

KeyboardInterrupt: 