# PPO

In [23]:
pip install gym=='0.26.2'

Collecting gym==0.26.2
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.26.2-py3-none-any.whl size=827617 sha256=7e416e059bd0135e9691f13ac5401f31595a6bbe780517ca4a9b7cdd17564029
  Stored in directory: /root/.cache/pip/wheels/b9/22/6d/3e7b32d98451b4cd9d12417052affbeeeea012955d437da1da
Successfully built gym
Installing collected packages: gym
  Attempting uninstall: gym
    Found existing installation: gym 0.25.2
    Uninstalling gym-0.25.2:
      Successfully uninstalled gym-0.25.2
[31mERROR: pip's dependency resolver does not currently take into account 

In [1]:
import numpy as np
import torch
import torch as th
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import torch.nn.functional as F
from torch.distributions import Categorical
from torch import nn
import scipy.stats as st
import itertools
import random
import gym
import torch.nn.init as init

In [2]:
print(gym.__version__)

0.26.2


# Environment

In [9]:
class CartPoleEnv:
    def __init__(self, env_name):
        self.env_name = env_name
        self.env = gym.make(env_name)

    def reset(self):
        return self.env.reset()

    def step(self, action):
        return self.env.step(action)

    def close(self):
        self.env.close()

    def pre_process(self, state, _):
        return th.FloatTensor(state).unsqueeze(0)

# Actor

In [17]:

class Actor(nn.Module):
    def __init__(self, layers1_num, layers2_num, out_num):
        super(Actor, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(layers1_num, layers2_num), nn.ReLU(),
            nn.Linear(layers2_num, out_num)
        )

    def forward(self, d_obs, deterministic=False):
        logits = self.layers(d_obs)
        if deterministic:
            action = int(torch.argmax(logits[0]).detach().cpu().numpy())
            action_prob = 1.0
        else:
            c = torch.distributions.Categorical(logits=logits)
            action = int(c.sample().cpu().numpy()[0])
            action_prob = float(c.probs[0, action].detach().cpu().numpy())
        return action, action_prob

    def convert_action(self, action, env_name):
        if env_name == 'Pong-v0':
            return action + 2
        else:
            return action  # No need to adjust for other environments

    def ppo_loss(self, d_obs, action, action_prob, advantage, eps_clip):
        vs = np.array([[1., 0.], [0., 1.]])  # TODO: Adjust according to your use case
        ts = torch.FloatTensor(vs[action.cpu().numpy()])

        logits = self.layers(d_obs)
        r = torch.sum(F.softmax(logits, dim=1) * ts, dim=1) / action_prob
        loss1 = r * advantage
        loss2 = torch.clamp(r, 1 - eps_clip, 1 + eps_clip) * advantage
        loss = -torch.min(loss1, loss2)
        loss = torch.mean(loss)

        return loss

# Critic

In [12]:
class Critic(nn.Module):
    def __init__(self, layers1_num, layers2_num):
        super(Critic, self).__init__()
        self.critic = nn.Sequential(
            nn.Linear(layers1_num, layers2_num),
            nn.Tanh(),
            nn.Linear(layers2_num, layers2_num),
            nn.Tanh(),
            nn.Linear(layers2_num, 1)
        )

    def forward(self, state):
        state_value = self.critic(state)
        return state_value

    def critic_loss(self, state_val, discounted_rewards):
        loss = nn.MSELoss(reduction='mean')
        loss = loss(state_val, discounted_rewards)
        return loss

# Train Loop

In [13]:
class PPOtrainer:
    def __init__(self, env_name):
        self.env_name = env_name

    def train(self, env, actor, critic, nb_episodes, batch_size):
        tester = Figure()  # Assuming Figure is defined elsewhere
        params = ParametersPPO()
        opt = th.optim.Adam([
            {'params': actor.parameters(), 'lr': params.lr},
            {'params': critic.parameters(), 'lr': params.lr_c}
        ])
        reward_sum_running_avg = None
        reward_sum_running_avg_history = []
        training_results = []
        test_results = []

        for it in range(nb_episodes):
            d_obs_history, action_history, action_prob_history, reward_history = [], [], [], []
            state_val_history = []
            done_history = []
            episode_rewards = 0

            for ep in range(params.ep):
                obs, prev_obs = env.reset(), None
                obs = obs[0]
                for t in range(params.t):
                    d_obs = env.pre_process(obs, prev_obs)

                    with th.no_grad():
                        action, action_prob = actor(d_obs)

                    state_val = critic(d_obs)
                    prev_obs = obs
                    obs, reward, done, truncated, _ = env.step(actor.convert_action(action, self.env_name))

                    d_obs_history.append(d_obs)
                    action_history.append(action)
                    action_prob_history.append(action_prob)
                    reward_history.append(reward)
                    state_val_history.append(state_val)
                    done_history.append(done)

                    episode_rewards += reward

                    if done:
                        reward_sum = sum(reward_history[-t:])
                        reward_sum_running_avg = 0.99 * reward_sum_running_avg + 0.01 * reward_sum if reward_sum_running_avg else reward_sum
                        reward_sum_running_avg_history.append(reward_sum_running_avg)
                        break

            training_results.append(episode_rewards / params.ep)  # Average reward per episode

            # Compute advantage
            R = 0
            discounted_rewards = []

            for r, d in zip(reward_history[::-1], done_history[::-1]):
                if self.env_name == 'Pong-v0' and r != 0:
                    R = 0  # Scored/lost a point in pong, so reset reward sum
                if d is True:
                    R = 0  # If terminal, R=0
                R = r + params.gamma * R
                discounted_rewards.insert(0, R)

            # Normalizing the rewards
            discounted_rewards = th.FloatTensor(discounted_rewards)
            discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / discounted_rewards.std()
            assert len(discounted_rewards) == len(state_val_history)
            advantage_history = []
            for i_adv in range(len(discounted_rewards)):
                adv = discounted_rewards[i_adv] - state_val_history[i_adv]
                advantage_history.append(adv)
            assert len(advantage_history) == len(discounted_rewards)

            # Update policy
            for _ in range(params.training_times):
                idxs = random.sample(range(len(action_history)), batch_size)
                d_obs_batch = th.cat([d_obs_history[idx] for idx in idxs], 0)
                action_batch = th.LongTensor([action_history[idx] for idx in idxs])
                action_prob_batch = th.FloatTensor([action_prob_history[idx] for idx in idxs])
                advantage_batch = th.FloatTensor([advantage_history[idx] for idx in idxs])
                state_val_batch = th.FloatTensor([state_val_history[idx] for idx in idxs])
                discounted_rewards_batch = th.FloatTensor([discounted_rewards[idx] for idx in idxs])

                opt.zero_grad()
                loss_a = actor.ppo_loss(d_obs_batch, action_batch, action_prob_batch, advantage_batch, params.eps_clip)
                loss_c = critic.critic_loss(state_val_batch, discounted_rewards_batch)
                loss = loss_a + loss_c
                loss.backward()
                opt.step()

            if it % params.test_interval == 0:
                # Test 10 times for more accurate results
                test_sum = 0
                for test_i in range(params.test_trials):
                    test_reward = tester.test(env, actor, self.env_name)
                    test_sum += test_reward
                test_average = test_sum / params.test_trials
                test_results.append(test_average)
                print('Training reward for episode %d: %.2f' % (it, test_average))

            if it % params.save_episode == 0:
                if it == 0:
                    th.save({'actor': actor.state_dict(), 'critic': critic.state_dict()}, 'params.ckpt')
                else:
                    if test_average >= max(test_results[:-1]):
                        th.save({'actor': actor.state_dict(), 'critic': critic.state_dict()}, 'params.ckpt')

        return training_results, test_results

# Test Loop

In [14]:
class Figure:
    def test(self, env, agent, env_name):
        obs, prev_obs = env.reset(), None
        obs = obs[0]
        reward_sum = 0
        reward_history = []
        params = ParametersPPO()
        for t in range(params.t):
            d_obs = env.pre_process(obs, prev_obs)

            with th.no_grad():
                action, action_prob = agent(d_obs)

            prev_obs = obs
            obs, reward, done, truncated, _ = env.step(agent.convert_action(action, env_name))

            reward_sum += reward
            reward_history.append(reward)

            if done:
                break

        return reward_sum

# Utilities

In [15]:
class Utils:
    def __init__(self):
        pass

    def benchmark_plot(self, all_train_returns, all_test_returns, test_interval, moving_avg_window=100, down_sample_factor=100):
        num_trials = len(all_train_returns)
        num_points = len(all_test_returns[0])

        # Convert lists to numpy arrays for easier calculations
        all_train_returns = np.array(all_train_returns)
        all_test_returns = np.array(all_test_returns)

        # Calculate the mean and 95% confidence intervals
        mean_train_returns = all_train_returns.mean(axis=0)
        mean_test_returns = all_test_returns.mean(axis=0)

        train_ci = 1.96 * all_train_returns.std(axis=0) / np.sqrt(num_trials)
        test_ci = 1.96 * all_test_returns.std(axis=0) / np.sqrt(num_trials)

        # Calculate individual maximum returns from each trial
        individual_max_returns = [np.max(trial_returns) for trial_returns in all_test_returns]

        # Calculate the average maximum return
        avg_max_return = np.mean(individual_max_returns)

        # Calculate the 95% confidence interval for the average maximum return
        n = len(individual_max_returns)
        sample_std = np.std(individual_max_returns, ddof=1)
        t_value = st.t.ppf(1 - 0.025, df=n - 1)
        margin_of_error = t_value * sample_std / np.sqrt(n)
        avg_max_return_ci = margin_of_error

        # Apply moving average to smooth the training returns
        smoothed_mean_train_returns = np.convolve(mean_train_returns, np.ones(moving_avg_window) / moving_avg_window, mode='valid')
        smoothed_train_ci = np.convolve(train_ci, np.ones(moving_avg_window) / moving_avg_window, mode='valid')

        # Down-sample the training returns for plotting
        down_sampled_indices = np.arange(0, len(smoothed_mean_train_returns), down_sample_factor)
        down_sampled_mean_train_returns = smoothed_mean_train_returns[down_sampled_indices]
        down_sampled_train_ci = smoothed_train_ci[down_sampled_indices]

        # Plot training returns with moving average and confidence interval
        plt.figure(figsize=(12, 6))
        plt.plot(down_sampled_indices, down_sampled_mean_train_returns, label='Mean Training Returns (Smoothed)', color='blue')
        plt.fill_between(down_sampled_indices, down_sampled_mean_train_returns - down_sampled_train_ci, down_sampled_mean_train_returns + down_sampled_train_ci, color='lightblue', alpha=0.3, label='CI')
        plt.xlabel('Episodes')
        plt.ylabel('Training Return')
        plt.title('Training Returns with 95% Confidence Interval (Smoothed)')
        plt.legend()
        plt.show()

        # Plot density plot of training returns
        plt.figure(figsize=(12, 6))
        #sns.kdeplot(mean_train_returns, fill=True, label='Density Plot')
        sns.kdeplot(mean_train_returns, label='Density Plot')
        plt.xlabel('Training Return')
        plt.ylabel('Density')
        plt.title('Density Plot of Training Returns')
        plt.legend()
        plt.show()

        # Plot test returns
        plt.figure(figsize=(12, 6))
        episodes = np.arange(0, num_points * test_interval, test_interval)
        for i in range(num_trials):
            plt.plot(episodes, all_test_returns[i], linestyle='dotted', alpha=0.5, label=f'Trial {i+1}')  # Individual test trials
        plt.plot(episodes, mean_test_returns, '-o', label='Mean Test Returns', color='black')  # Mean test returns without error bars
        plt.fill_between(episodes, mean_test_returns - test_ci, mean_test_returns + test_ci, color='lightblue', alpha=0.3, label='CI')  # Fill between upper and lower bounds
        plt.xlabel('Episodes')
        plt.ylabel('Test Return')
        plt.title('Test Returns with 95% Confidence Interval')
        plt.legend()
        plt.show()

        # Plot density plot of test returns
        plt.figure(figsize=(12, 6))
        #sns.kdeplot(mean_test_returns, fill=True, label='Density Plot')
        sns.kdeplot(mean_test_returns, label='Density Plot')
        plt.xlabel('Test Return')
        plt.ylabel('Density')
        plt.title('Density Plot of Test Returns')
        plt.legend()
        plt.show()

        return mean_test_returns, avg_max_return, avg_max_return_ci, individual_max_returns

# Main

In [19]:
class ParametersPPO:
    def __init__(self):
        self.nb_episodes = 1000  # or 500
        self.batch_size = 128  # or 64
        self.gamma = 0.99
        self.eps_clip = 0.2
        self.layers1_num = 4  # CartPole state space dimension
        self.layers2_num = 64  # or 128, hidden_layer
        self.out_num = 2  # CartPole action space dimension
        self.lr = 3e-4  # learning rate for actor network
        self.lr_c = 0.001  # learning rate for critic network
        self.ep = 10
        self.t = 500  # or 1000 the max time steps
        self.training_times = 10
        self.save_episode = 50
        self.test_episode = 25
        self.test_trials = 10  # test 10 times and get the average result
        self.test_interval = 10  # test every 10 episodes
        self.num_trials = 5

class PPOrunner():
    def __init__(self, env):
        self.env = env

    def run_experiment(self):
        params = ParametersPPO()
        nb_episodes = params.nb_episodes
        batch_size = params.batch_size
        num_trials = params.num_trials

        # Load_save_result = params.Load_save_result
        all_train_returns = []
        all_test_returns = []

        for trial in range(num_trials):
            print(f"Trial: {trial+1}")
            actor_ppo = Actor(params.layers1_num, params.layers2_num, params.out_num)
            critic_ppo = Critic(params.layers1_num, params.layers2_num)
            trainer_ppo = PPOtrainer(self.env.env_name)

            train_rewards, test_rewards = trainer_ppo.train(self.env, actor_ppo, critic_ppo, nb_episodes, batch_size)
            all_train_returns.append(train_rewards)
            all_test_returns.append(test_rewards)

        utils = Utils()
        average_returns, max_return, max_return_ci, individual_returns = utils.benchmark_plot(all_train_returns, all_test_returns, params.test_interval)
        print(f"Average Return: {average_returns}")
        print(f"Max Return: {max_return}")
        print(f"Max Return 95% CI: {max_return_ci}")
        print(f"Individual Returns: {individual_returns}")
        print("Completed experiment")

def main():
    env = CartPoleEnv('CartPole-v0')
    runner = PPOrunner(env)
    runner.run_experiment()

if __name__ == '__main__':
    main()

Trial: 1
Training reward for episode 0: 26.30
Training reward for episode 10: 30.00
Training reward for episode 20: 47.60
Training reward for episode 30: 41.20
Training reward for episode 40: 61.80
Training reward for episode 50: 99.60
Training reward for episode 60: 90.30
Training reward for episode 70: 103.00
Training reward for episode 80: 83.10
Training reward for episode 90: 163.30
Training reward for episode 100: 204.50
Training reward for episode 110: 221.40
Training reward for episode 120: 198.40
Training reward for episode 130: 276.90
Training reward for episode 140: 224.40
Training reward for episode 150: 270.60
Training reward for episode 160: 292.20
Training reward for episode 170: 342.70
Training reward for episode 180: 392.10
Training reward for episode 190: 257.10
Training reward for episode 200: 297.60
Training reward for episode 210: 385.80
Training reward for episode 220: 403.30
Training reward for episode 230: 455.90
Training reward for episode 240: 451.40
Training r

KeyboardInterrupt: 