Author: Chengyuan Sha

Reference:
Borrowed some ideas such as Open AI's actor-critic normalization tricks:
https://github.com/pytorch/examples/blob/main/reinforcement_learning/actor_critic.py
PPO formular and explanation:
https://jonathan-hui.medium.com/rl-trust-region-policy-optimization-trpo-explained-a6ee04eeeee9
OpenAI blog and github:
https://openai.com/blog/openai-baselines-ppo/
https://github.com/openai/baselines
https://spinningup.openai.com/en/latest/algorithms/ppo.html

In [1]:
import numpy as np
import torch
import torch.nn as nn
import gym

In [2]:
seed = 100
torch.manual_seed(seed)
np.random.seed(seed)

In [3]:
# setting up GPU
try:
    device = torch.device("cuda")
    print("Current using GPU: ")
    print(torch.cuda.get_device_name(device))
except:
    print("GPU not connected. Please check your CUDA and required packages.")

Current using GPU: 
NVIDIA GeForce RTX 3090


In [4]:
class Buffer:
    """ replay buffer keep track of all variables needed for PPO """
    def __init__(self):
        self.actions = []
        self.states = []
        self.log_probability = []
        self.rewards = []
        self.is_finish = []

    def delete_all(self):
        del self.actions[:]
        del self.states[:]
        del self.log_probability[:]
        del self.rewards[:]
        del self.is_finish[:]

def MLP(state_dim, output_dim, hidden_dim = 64):
    """ define MLP for actor critic """
    return nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            )

class AC(nn.Module):
    """ Actor Critic """
    def __init__(self, state_dim, action_dim):
        super(AC, self).__init__()
        self.action_dim = action_dim
        self.action_var = torch.full((action_dim,), 0.3).to(device) # 0.36 is the standard deviation
        self.actor = MLP(state_dim, action_dim)
        self.critic = MLP(state_dim, 1)

    def get_distribution(self, M, covariantM):
        return torch.distributions.MultivariateNormal(M, covariantM)

    def AC_action(self, state):
        distribution = self.get_distribution(self.actor(state), torch.diag(self.action_var).unsqueeze(dim=0))
        action = distribution.sample().detach()
        action_logprob = distribution.log_prob(action).detach()
        return action, action_logprob

    def AC_evaluate(self, state, action):
        action_mean = self.actor(state)
        covariantM = torch.diag_embed(self.action_var.expand_as(action_mean)).to(device)
        distribution = self.get_distribution(action_mean, covariantM)
        return distribution.log_prob(action), self.critic(state), distribution.entropy() # log_probability, state values, entropy


class PPO:
    """ Main PPO class """
    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, num_of_epochs):
        self.gamma = gamma
        self.num_of_epochs = num_of_epochs
        self.buffer = Buffer()
        self.policy = AC(state_dim, action_dim).to(device)
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': lr_actor},
            {'params': self.policy.critic.parameters(), 'lr': lr_critic}
        ])
        self.policy_old = AC(state_dim, action_dim).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

    def choose_action(self, state):
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            action, action_logprob = self.policy_old.AC_action(state)

        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.log_probability.append(action_logprob)

        return action.detach().cpu().numpy().flatten()

    @staticmethod
    def normalize(reward):
        """ Reference:  Open AI's actor-critic Trick """
        rewards = torch.tensor(reward, dtype=torch.float32).to(device)
        return (rewards - rewards.mean()) / (rewards.std() + 1e-7)

    def monte_carlo_estimation(self):
        """ monte carlo estimation """
        rewards = []
        gamma_reward = 0
        for reward, is_finish in zip(self.buffer.rewards[::-1], self.buffer.is_finish[::-1]):
            if is_finish:
                gamma_reward = 0
            gamma_reward = reward + (self.gamma * gamma_reward)
            rewards.append(gamma_reward)
        return rewards[::-1]

    @staticmethod
    def get_tensor(tensor):
        return torch.squeeze(torch.stack(tensor, dim=0)).detach().to(device)

    def update_PPO(self):
        rewards = self.monte_carlo_estimation()
        # Open AI's actor-critic Normalizing Trick
        rewards = PPO.normalize(rewards)
        past_states, past_actions, past_logprobs = PPO.get_tensor(self.buffer.states), PPO.get_tensor(self.buffer.actions), PPO.get_tensor(self.buffer.log_probability)
        # Main update loop according to the pseudocode formula
        for _ in range(self.num_of_epochs):
            log_probability, state_values, entropy = self.policy.AC_evaluate(past_states, past_actions)
            state_values = torch.squeeze(state_values)
            ratios = torch.exp(log_probability - past_logprobs.detach())
            # Surrogate Loss calculation
            advantages = rewards - state_values.detach()
            surrogate_loss1 = ratios * advantages
            surrogate_loss2 = torch.clamp(ratios, 0.8, 1.2) * advantages # clip the loss
            # computed the total loss
            loss = -torch.min(surrogate_loss1, surrogate_loss2) + 0.5 * nn.MSELoss()(state_values, rewards) - 0.01 * entropy
            # gradient optimization
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
        # cleaning and reinit
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.buffer.delete_all()

    def save_model(self, save_path):
        """ save model to pth file """
        torch.save(self.policy_old.state_dict(), save_path)


Setting hyperparameters & gym environment

In [5]:
# hyper parameters
hp = {'max_ep_len': 1000, 'max_steps': 4000000,
      'action_std':0.6, 'action_std_decay_rate': 0.05,  'min_action_std':0.1,
      'update_step': 4000, 'num_of_epochs': 80,  'gamma': 0.99,
      'lr_actor': 0.0003, 'lr_critic':0.001 }


In [6]:
# creating bipedal walker environment
env = gym.make('BipedalWalker-v3')

In [7]:
# state
state_dim = env.observation_space.shape[0]
# action
action_dim = env.action_space.shape[0]

hp['action_dim'] = action_dim
hp['state_dim'] = state_dim

In [8]:
action_dim

4

Training PPO:

In [6]:
def train(hp):
    # logging file
    file_log = open('PPO_record.csv',"w+")
    file_log.write('episode,step,reward\n')
    # init PPO agent
    ppo = PPO(hp['state_dim'], hp['action_dim'], hp['lr_actor'], hp['lr_critic'], hp['gamma'],
              hp['num_of_epochs'])
    # init recording variables
    reward_logging, episode_logging, step, episode = 0, 0, 0, 0

    # main training loop
    while step <= hp['max_steps']:
        state = env.reset()
        reward_cur_episode = 0
        for _ in range(1, hp['max_ep_len']+1):
            action = ppo.choose_action(state)
            state, reward, is_finish, extra_info = env.step(action)
            # append to record buffer
            ppo.buffer.rewards.append(reward)
            ppo.buffer.is_finish.append(is_finish)
            step +=1
            reward_cur_episode += reward
            if step % hp['update_step'] == 0:
                ppo.update_PPO() # update PPO
            if step % 3000 == 0: # log & print training info
                avg_reward = round(reward_logging / episode_logging, 4)
                to_print = '{},{},{}\n'.format(episode, step, avg_reward)
                print("Episode : {} \t Step : {} \t Average Reward : {}".format(episode, step, avg_reward))
                file_log.write(to_print)
                file_log.flush()
                reward_logging, episode_logging = 0, 0
            if step % 200000 == 0:  # save model to pth
                ppo.save_model("trained_model.pth")
                print("Model Saved!!")
            if is_finish: # die
                break
        reward_logging += reward_cur_episode
        episode_logging += 1
        episode += 1
    # close file and environment
    file_log.close()
    env.close()

train(hp)

Episode : 2 	 Step : 2000 	 Average Reward : -81.1874
Episode : 7 	 Step : 4000 	 Average Reward : -107.533
Episode : 10 	 Step : 6000 	 Average Reward : -112.6325
Episode : 16 	 Step : 8000 	 Average Reward : -87.197
Episode : 23 	 Step : 10000 	 Average Reward : -92.4617
Episode : 27 	 Step : 12000 	 Average Reward : -108.5548
Episode : 31 	 Step : 14000 	 Average Reward : -83.6433
Episode : 35 	 Step : 16000 	 Average Reward : -79.9456
Episode : 38 	 Step : 18000 	 Average Reward : -70.1747
Episode : 42 	 Step : 20000 	 Average Reward : -101.5549
Episode : 46 	 Step : 22000 	 Average Reward : -87.2885
Episode : 48 	 Step : 24000 	 Average Reward : -52.5491
Episode : 50 	 Step : 26000 	 Average Reward : -47.9675
Episode : 53 	 Step : 28000 	 Average Reward : -72.0272
Episode : 57 	 Step : 30000 	 Average Reward : -86.4033
Episode : 61 	 Step : 32000 	 Average Reward : -80.2197
Episode : 64 	 Step : 34000 	 Average Reward : -75.5797
Episode : 66 	 Step : 36000 	 Average Reward : -54.5