In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random

# Everything runs on the CPU. To use a GPU when one is available, switch to:
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

# Seed every random number generator used in this notebook with the same value
# (torch, Python's `random`, and NumPy's legacy global generator).
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    _seed_fn(0)

In [None]:
from torch.distributions.categorical import Categorical

class ActorCritic(nn.Module):
    '''
    Two independent MLPs for a discrete action space.

    * ``actor``  maps an observation to action probabilities (softmax output).
    * ``critic`` maps an observation to a single scalar value estimate.

    Both networks have two hidden layers of 64 units each.
    '''

    def __init__(self, num_states, num_actions, activation=nn.Tanh):
        '''
        num_states:  size of one observation vector
        num_actions: number of discrete actions
        activation:  module class instantiated after each hidden layer
        '''
        super().__init__()
        hidden = 64
        # The actor is built before the critic so that parameters are created
        # (and therefore randomly initialised) in a fixed order.
        self.actor = nn.Sequential(
            *self._mlp_layers(num_states, num_actions, hidden, activation),
            nn.Softmax(dim=-1),
        )
        self.critic = nn.Sequential(
            *self._mlp_layers(num_states, 1, hidden, activation),
        )

    @staticmethod
    def _mlp_layers(in_features, out_features, hidden, activation):
        '''Return the layer list: Linear -> act -> Linear -> act -> Linear.'''
        return [
            nn.Linear(in_features, hidden),
            activation(),
            nn.Linear(hidden, hidden),
            activation(),
            nn.Linear(hidden, out_features),
        ]

    def action_prob(self, x):
        '''
        Sample actions for the states ``x`` and return the sampled actions
        together with their log probabilities.
        '''
        policy_dist = Categorical(probs=self.actor(x))
        sampled = policy_dist.sample()
        return sampled, policy_dist.log_prob(sampled)

    def get_prob_and_entropy_from_action(self, x, actions):
        '''
        Return the log probabilities of ``actions`` under the current policy
        for the states ``x``, and the entropy of the policy in those states.
        '''
        policy_dist = Categorical(probs=self.actor(x))
        log_probs = policy_dist.log_prob(actions)
        return log_probs, policy_dist.entropy()

    def get_v(self, x):
        '''
        Return the critic's value estimate for the states ``x``.
        '''
        value = self.critic(x)
        return value

In [None]:
# --- data collection ---
PARALLEL_AGENTS = 16     # number of sub-environments in the vector env
MAX_TRAJECTORY = 1024    # environment steps collected per training episode
SEED = 69                # seed passed to the gym environment on reset

# --- optimisation ---
EPOCHS = 5               # passes over the collected data per episode
BATCH_SIZE = 32          # mini-batch size for each optimizer step
GAMMA = 0.99             # discount applied to the value target

# --- loss terms ---
EPSILON = 0.2            # clipping range for the probability ratio
V_LOSS_FACTOR = 0.5      # weight of the critic (value) loss
H_LOSS_FACTOR = 0.01     # weight of the entropy term

In [None]:
# Create the vectorised environment and take a first, seeded observation.
env = gym.vector.make("LunarLander-v2", num_envs=PARALLEL_AGENTS)
states, _ = env.reset(seed=SEED)

# Network sizes are read from the environment: the second axis of the
# observation batch and the action count of the first sub-environment.
num_states = states.shape[1]
num_actions = env.action_space[0].n

# Build the actor-critic model on the configured device.
policy = ActorCritic(num_states, num_actions).to(device)

# One Adam optimizer with a separate parameter group (and learning rate)
# for the actor and for the critic.
optimizer = optim.Adam(
    [{"params": net.parameters(), "lr": 1e-3} for net in (policy.actor, policy.critic)]
)

In [None]:
def get_clip_loss(pt, advantage, epsilon):
    '''
    Calculate the PPO clipped surrogate objective (to be maximised).

    pt:        probability ratio new_policy / old_policy, one entry per sample
    advantage: advantage estimate, one entry per sample
    epsilon:   clipping range; the ratio is clipped to [1-epsilon, 1+epsilon]

    Returns the scalar mean of min(advantage * clip(pt), advantage * pt).
    '''
    # The training loop passes `pt` as (B,) and `advantage` as (B, 1). Without
    # aligning them, the product broadcasts to (B, B) and every advantage is
    # paired with every ratio instead of only its own sample.
    if advantage.shape != pt.shape and advantage.numel() == pt.numel():
        advantage = advantage.reshape(pt.shape)

    clipped_pt = torch.clip(pt, min=1-epsilon, max=1+epsilon)
    return torch.mean(torch.min(advantage * clipped_pt, advantage * pt))

    
for episode in range(200):
    # Seed only the first reset. Passing the same seed on every reset would
    # re-seed the environment's RNG and replay identical starting conditions
    # in every episode.
    states, _ = env.reset(seed=SEED if episode == 0 else None)

    # ------------------------------------------------------------------
    # Run the simulation to collect data.
    #
    # The vector environments autoreset sub-environments after they terminate
    # or are truncated, so we don't need to reset all the environments.
    # ------------------------------------------------------------------
    temp_episode_mem = []
    rewards_sum = 0.
    for t in range(MAX_TRAJECTORY):
        # get next action
        states = torch.tensor(states, dtype=torch.float32, device=device)
        with torch.no_grad():
            actions, log_probs = map(lambda x: x.cpu().numpy(),
                                     policy.action_prob(states),
                                    )
        new_states, rewards, terminated, truncated, _ = env.step(actions)
        # 1.0 where the sub-environment finished on this step, shape (N, 1)
        done = torch.tensor(np.logical_or(terminated, truncated),
                            dtype=torch.float32,
                            device=device).unsqueeze(-1)
        temp_episode_mem.append((states, actions, rewards, new_states, log_probs, done))

        # hold the rewards for logging
        rewards_sum += rewards.sum()

        # update the states
        states = new_states

    # ------------------------------------------------------------------
    # Prepare the data for training after the simulation was finished
    # ------------------------------------------------------------------
    # hold the number of the completed and none-completed runs
    completed_runs = 0
    none_completed_runs = int(PARALLEL_AGENTS - torch.sum(temp_episode_mem[-1][-1]).item())

    # hold the sum of rewards of the completed runs
    completed_rewards = 0.
    completed_mask = temp_episode_mem[-1][-1]
    training_batch = []

    # bootstrap the value target from the critic's estimate of the last state
    new_states = torch.tensor(new_states, dtype=torch.float32, device=device)
    with torch.no_grad():
        v_target = policy.get_v(new_states)

    # walk the trajectory backwards to accumulate discounted value targets
    for states, actions, rewards, new_states, log_probs, done in reversed(temp_episode_mem):
        # reset v_target when the episode was finished
        v_target = v_target * (1.-done)

        rewards = torch.tensor(rewards, dtype=torch.float32, device=device).unsqueeze(-1)
        v_target = rewards + v_target*GAMMA

        with torch.no_grad():
            v_omega = policy.get_v(states)

        advantage = v_target - v_omega

        # add the data to the training batch (one entry per sub-environment)
        training_batch.extend(zip(states, actions, rewards, log_probs, advantage, v_target))

        # log completed runs
        completed_runs += int(torch.sum(done).item())

        # log the completed rewards: a step counts once a `done` has been seen
        # at this or a later time step of the same sub-environment
        completed_mask = torch.logical_or(completed_mask, done)
        completed_rewards += torch.sum(rewards * completed_mask).item()

    # save some memory
    del temp_episode_mem

    # ------------------------------------------------------------------
    # Train the model
    # ------------------------------------------------------------------
    loss_list, loss_list_clip, loss_list_v, loss_list_h = [], [], [], []
    for epoch in range(EPOCHS):
        # shuffle the training data
        random.shuffle(training_batch)

        for i in range(0, len(training_batch), BATCH_SIZE):
            batch = training_batch[i:i+BATCH_SIZE]
            states, actions, _, log_probs, advantage, v_target = zip(*batch)

            # to tensors
            states = torch.vstack(states)
            # Flatten to (B,) so the advantage lines up element-wise with the
            # (B,) probability ratio; a (B, 1) advantage would broadcast the
            # product to (B, B).
            advantage = torch.vstack(advantage).squeeze(-1)
            v_target = torch.vstack(v_target)
            actions = torch.tensor(np.asarray(actions), dtype=torch.long, device=device)
            log_probs = torch.tensor(np.asarray(log_probs), dtype=torch.float32, device=device)

            # get the new log probabilities and the entropy values
            log_probs_new, entropy = policy.get_prob_and_entropy_from_action(states, actions)
            pt = torch.exp(log_probs_new - log_probs)

            # clip loss
            clip_loss = get_clip_loss(pt, advantage, EPSILON)

            # loss for critic
            v = policy.get_v(states)
            v_loss = F.mse_loss(v, v_target)

            # entropy loss (negative entropy: minimising it raises entropy)
            h_loss = -torch.mean(entropy)

            # total loss; h_loss is added so that the entropy term acts as an
            # exploration bonus (subtracting it would penalise entropy)
            loss = -clip_loss + v_loss*V_LOSS_FACTOR + h_loss*H_LOSS_FACTOR

            # log the losses (each entry is the term as it appears in `loss`)
            loss_list.append(loss.item())
            loss_list_clip.append(-clip_loss.item())
            loss_list_v.append(v_loss.item()*V_LOSS_FACTOR)
            loss_list_h.append(h_loss.item()*H_LOSS_FACTOR)

            # update the networks
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # print the results for the episode
    if completed_runs == 0:
        completed_rewards = float('nan')
    else:
        completed_rewards /= completed_runs
    if episode == 0:
        print(f"{'Ep':>4}: {'Reward' :>8} | {'Comp re':>8} | {'loss':>9} | {'loss clip':>9} | {'loss v':>8} | {'loss h':>7} | {'Comp runs':>9} | {'None Comp':<9}") 
    print(f'{episode:>4}: {rewards_sum/(completed_runs+none_completed_runs): 8.02f} | {completed_rewards: 8.02f} | {np.mean(loss_list): 9.02f} | {np.mean(loss_list_clip): 9.02f} | {np.mean(loss_list_v):8.02f} | {np.mean(loss_list_h):7.02f} | {completed_runs:>9} | {none_completed_runs:>9}') 

env.close()

In [None]:
# Build a fresh vectorised environment with the same id and number of
# sub-environments as the one used (and closed) by the training cell.
env = gym.vector.make(
    "LunarLander-v2",
    num_envs=PARALLEL_AGENTS,
)

In [None]:
import os

def save_plot(fig, name, folder=None):
    '''
    Save a figure as a PNG below the "figures" directory.

    fig:    object with a ``savefig(path, bbox_inches=...)`` method
    name:   file name without extension; ".png" is appended
    folder: optional sub-directory of "figures" to save into

    Missing directories are created.
    '''
    path = "figures"
    if folder is not None:
        path = os.path.join(path, folder)

    # exist_ok avoids the check-then-create race and matches save_model
    os.makedirs(path, exist_ok=True)
    fig.savefig(os.path.join(path, f"{name}.png"), bbox_inches='tight')

def save_model(model, path, name):
    '''
    Serialise ``model`` with ``torch.save`` to ``<path>/<name>``.

    The directory ``path`` is created first if it does not exist yet.
    '''
    os.makedirs(path, exist_ok=True)
    target_file = os.path.join(path, name)
    torch.save(model, target_file)

# Persist the trained policy to models/ppo_model.pth
save_model(policy, path='models', name='ppo_model.pth')

In [None]:
from gymnasium.wrappers.monitoring import video_recorder
import os

path = 'videos'
os.makedirs(path, exist_ok = True) 

# Record a video of one rollout of the trained policy.
# A dedicated name is used for the single rendering environment so that the
# vectorised `env` used by the training cells is not overwritten and closed.
video_env = gym.make("LunarLander-v2", render_mode="rgb_array")
video = video_recorder.VideoRecorder(video_env, path=f'{path}/video_ppo.mp4')
try:
    state, _ = video_env.reset(seed=SEED)
    while True:
        video.capture_frame()

        # next action, sampled from the policy for the current state
        with torch.no_grad():
            state_tensor = torch.tensor(state, dtype=torch.float32, device=device)
            action = policy.action_prob(state_tensor)[0].item()

        state, _, terminated, truncated, _ = video_env.step(action)
        if terminated or truncated:
            break
finally:
    # always release the recorder and the environment, even on an error
    video.close()
    video_env.close()

In [None]:
def get_clip_loss(pt, advantage, epsilon):
    '''
    Calculate the PPO clipped surrogate objective (to be maximised).

    pt:        probability ratio new_policy / old_policy, one entry per sample
    advantage: advantage estimate, one entry per sample
    epsilon:   clipping range; the ratio is clipped to [1-epsilon, 1+epsilon]

    Returns the scalar mean of min(advantage * clip(pt), advantage * pt).
    '''
    # The training loop passes `pt` as (B,) and `advantage` as (B, 1). Without
    # aligning them, the product broadcasts to (B, B) and every advantage is
    # paired with every ratio instead of only its own sample.
    if advantage.shape != pt.shape and advantage.numel() == pt.numel():
        advantage = advantage.reshape(pt.shape)

    clipped_pt = torch.clip(pt, min=1-epsilon, max=1+epsilon)
    return torch.mean(torch.min(advantage * clipped_pt, advantage * pt))

    
for episode in range(100):
    # Seed only the first reset. Passing the same seed on every reset would
    # re-seed the environment's RNG and replay identical starting conditions
    # after each reset.
    states, _ = env.reset(seed=SEED if episode == 0 else None)

    temp_episode_mem = []
    training_batch = []
    rewards_sum = 0.
    for t in range(MAX_TRAJECTORY):
        # get next action
        states = torch.tensor(states, dtype=torch.float32, device=device)
        with torch.no_grad():
            actions, log_probs = map(lambda x: x.cpu().numpy(),
                                     policy.action_prob(states),
                                    )
        new_states, rewards, terminated, truncated, _ = env.step(actions)
        temp_episode_mem.append((states, actions, rewards, log_probs))

        # hold the rewards for logging
        rewards_sum += rewards.sum()

        # check if any agent terminated or we are at the end of the episode
        if terminated.any() or truncated.any() or t == MAX_TRAJECTORY-1:
            # 1.0 where the sub-environment finished on this step, shape (N, 1)
            done = torch.tensor(np.logical_or(terminated, truncated),
                                dtype=torch.float32,
                                device=device).unsqueeze(-1)
            new_states = torch.tensor(new_states, dtype=torch.float32, device=device)
            # bootstrap from the critic, except for finished sub-environments
            with torch.no_grad():
                v_target = policy.get_v(new_states)
            v_target = v_target * (1.-done)

            # walk the segment backwards to accumulate discounted value targets
            for states, actions, rewards, log_probs in reversed(temp_episode_mem):
                rewards = torch.tensor(rewards, dtype=torch.float32, device=device).unsqueeze(-1)
                v_target = rewards + v_target*GAMMA

                with torch.no_grad():
                    v_omega = policy.get_v(states)

                advantage = v_target - v_omega

                # add the data to the training batch
                training_batch.extend(zip(states, actions, rewards, log_probs, advantage, v_target))

            # empty the list
            temp_episode_mem = []

            # reset all sub-environments (continuing the seeded RNG stream)
            states, _ = env.reset()
        else:
            states = new_states

    # Train the model
    loss_list = []
    loss_list_clip = []
    loss_list_v = []
    loss_list_h = []
    for epoch in range(EPOCHS):
        # shuffle the training data
        random.shuffle(training_batch)

        for i in range(0, len(training_batch), BATCH_SIZE):
            batch = training_batch[i:i+BATCH_SIZE]
            states, actions, _, log_probs, advantage, v_target = zip(*batch)

            # to tensors
            states = torch.vstack(states)
            # Flatten to (B,) so the advantage lines up element-wise with the
            # (B,) probability ratio; a (B, 1) advantage would broadcast the
            # product to (B, B).
            advantage = torch.vstack(advantage).squeeze(-1)
            v_target = torch.vstack(v_target)
            actions = torch.tensor(np.asarray(actions), dtype=torch.long, device=device)
            log_probs = torch.tensor(np.asarray(log_probs), dtype=torch.float32, device=device)

            # get the new log probabilities and the entropy values
            log_probs_new, entropy = policy.get_prob_and_entropy_from_action(states, actions)
            pt = torch.exp(log_probs_new - log_probs)

            # clip loss
            clip_loss = get_clip_loss(pt, advantage, EPSILON)

            # loss for critic
            v = policy.get_v(states)
            v_loss = F.mse_loss(v, v_target)

            # entropy loss (negative entropy: minimising it raises entropy)
            h_loss = -torch.mean(entropy)

            # h_loss is added so that the entropy term acts as an exploration
            # bonus (subtracting it would penalise entropy)
            loss = -clip_loss + v_loss*V_LOSS_FACTOR + h_loss*H_LOSS_FACTOR
            # each logged entry is the term as it appears in `loss`
            loss_list.append(loss.item())
            loss_list_clip.append(-clip_loss.item())
            loss_list_v.append(v_loss.item()*V_LOSS_FACTOR)
            loss_list_h.append(h_loss.item()*H_LOSS_FACTOR)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    print(f'{episode:>4}: reward {rewards_sum/PARALLEL_AGENTS: 8.02f} | loss: {np.mean(loss_list): 8.02f} | loss clip: {np.mean(loss_list_clip): 8.02f} | loss v: {np.mean(loss_list_v): 8.02f} | loss h: {np.mean(loss_list_h): 7.02f}') 
