PPO MODEL

In [22]:
pip install --upgrade gym

Note: you may need to restart the kernel to use updated packages.


In [27]:
!pip3 install torchrl

Collecting torchrl
  Downloading torchrl-0.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (33 kB)
Collecting tensordict>=0.5.0 (from torchrl)
  Downloading tensordict-0.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (22 kB)
Collecting orjson (from tensordict>=0.5.0->torchrl)
  Downloading orjson-3.10.7-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Downloading torchrl-0.5.0-cp312-cp312-macosx_11_0_arm64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading tensordict-0.5.0-cp312-cp312-macosx_11_0_arm64.whl (648 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.4/648.4 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading orjson-3.10.7-cp312-cp312-macosx_10_15_x86

In [11]:
pip install mujoco

Note: you may need to restart the kernel to use updated packages.


In [1]:
import gym
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from torch import nn
from torch import optim
from torch.distributions.categorical import Categorical
import random

sns.set()

In [2]:
class ActorCriticNN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()

        self.shared_layers = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64,64),
            nn.ReLU())

        self.policy_layers = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64,action_dim))

        self.value_layers = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1))

    def value(self, obs):
        z = self.shared_layers(obs)
        value = self.value_layers(z)
        return value

    def policy(self, obs):
        z = self.shared_layers(obs)
        policy_logits = self.policy_layers(z)
        return policy_logits

    def forward(self, obs):
        z = self.shared_layers(obs)
        policy_logits = self.policy_layers(z)
        value = self.value_layers(z)
        return policy_logits, value

In [3]:
#trainer
class PPOTrainer():
    def __init__(self, actor_critic, ppo_clip_val = 0.2, target_kl_div = 0.01, max_policy_train_iters = 80, value_train_iters=80, 
                policy_lr = 3e-4, value_lr = 1e-2):
        self.ac = actor_critic
        self.ppo_clip_val = ppo_clip_val
        self.target_kl_div = target_kl_div
        self.max_policy_train_iters = max_policy_train_iters
        self.value_train_iters = value_train_iters
    
        policy_params = list(self.ac.shared_layers.parameters()) + \
            list(self.ac.policy_layers.parameters())
        self.policy_optim = optim.Adam(policy_params, lr = policy_lr)
    
        value_params = list(self.ac.shared_layers.parameters()) + \
            list(self.ac.value_layers.parameters())
        self.value_optim = optim.Adam(value_params, lr = value_lr)

    def train_policy(self, obs, acts, old_log_probs, gaes):

        for _ in range(self.max_policy_train_iters):
                
            self.policy_optim.zero_grad()
    
            new_logits = self.ac.policy(obs)
            new_logits = Categorical(logits = new_logits)
            new_log_probs = new_logits.log_prob(acts)
    
            policy_ratio = torch.exp(new_log_probs - old_log_probs)
            clipped_ratio = policy_ratio.clamp(1 - self.ppo_clip_val, 1 + self.ppo_clip_val)
            clipped_loss = clipped_ratio * gaes
            full_loss = policy_ratio * gaes
            policy_loss = -torch.min(full_loss, clipped_loss).mean()
    
            policy_loss.backward()
            self.policy_optim.step()

            kl_div = (old_log_probs - new_log_probs).mean()
            if kl_div >= self.target_kl_div:
                break

    def train_value(self, obs, returns):

        for _ in range (self.value_train_iters):
            self.value_optim.zero_grad()
    
            values = self.ac.value(obs)
            value_loss = (returns-values) ** 2
            value_loss = value_loss.mean()
    
            value_loss.backward()
            self.value_optim.step()

In [4]:
def discount_rewards(rewards, gamma = 0.99):
    """
    Return discounted rewards based on the given rewards and the gamma param.
    """

    new_rewards = [float(rewards[-1])]
    for i in reversed(range(len(rewards)-1)):
        new_rewards.append(float(rewards[i]) + gamma * new_rewards[-1])
    return np.array(new_rewards[::-1])

def calculate_gaes(rewards, values, gamma = 0.99, decay = 0.97):
    """
    Return the General Advantage Estimates from the given rewards and values.
    Paper: https://arxiv.org/abs/1506.02438
    """

    next_values = np.concatenate([values[1:], [0]])
    deltas = [rew + gamma * next_val - val for rew, val, next_val in zip(rewards, values, next_values)]

    gaes = [deltas[-1]]
    for i in reversed(range(len(deltas)-1)):
        gaes.append(deltas[i] + decay * gamma * gaes[-1])

    return np.array(gaes[::-1])

In [5]:
def rollout(model, env, max_steps=1000):
    """
    Performs a single rollout, sampling an action and collecting data.
    Returns training data in the shape (n_steps, observation_shape) and the cumulative reward.
    """

    # Create the data storage (obs, act, reward, values, act_log_probs)
    train_data = [[], [], [], [], []]  
    obs, _ = env.reset()

    #print("Environment reset, initial observation:", obs)  # Check if env.reset() works

    ep_reward = 0
    for step in range(max_steps):

        #Convert observation to a PyTorch tensor and ensure correct shape
        obs = torch.tensor(obs, dtype=torch.float32)

        logits, val = model(obs)

        # Create action distribution and sample an action
        act_dist = Categorical(logits=logits)
        act = act_dist.sample()
        act_log_prob = act_dist.log_prob(act).item()

        act, val = act.item(), val.item()

        # Take action in the environment
        next_obs, reward, done, _, __ = env.step(act)

        # Store the data for training
        for i, item in enumerate((obs, act, reward, val, act_log_prob)):
            train_data[i].append(item)

        #print(f"Action taken: {act.item()}, Reward received: {reward}, Done: {done}")

        # Update observation and cumulative reward
        obs = next_obs
        ep_reward += reward

        # Break if the episode ends
        if done:
            #print(f"Episode ended after {step+1} steps with cumulative reward: {ep_reward}")
            break


    train_data = [np.asarray(x) for x in train_data]
    
    train_data[3] = calculate_gaes(train_data[2], train_data[3])
    
    return train_data, ep_reward


In [6]:
env = gym.make('CartPole-v1')
model = ActorCriticNN(env.observation_space.shape[0], env.action_space.n)
model = model.to('cpu')
train_data, reward = rollout(model, env)

  if not isinstance(terminated, (bool, np.bool8)):


In [13]:
#Define Training Params
n_episodes = 200
print_freq = 10

ppo = PPOTrainer(model, policy_lr = 3e-4, value_lr = 1e-3, target_kl_div = 0.02, max_policy_train_iters = 40, value_train_iters = 40)

In [15]:
#Training Loops
ep_rewards = []
for episode_idx in range (n_episodes):
    #Perform rollout
    train_data, reward = rollout(model, env)
    ep_rewards.append(reward)

    permute_idxs = np.random.permutation(len(train_data[0]))
    obs = torch.tensor(train_data[0][permute_idxs], dtype = torch.float32)
    act = torch.tensor(train_data[1][permute_idxs], dtype = torch.int32)
    gaes = torch.tensor(train_data[3][permute_idxs], dtype = torch.float32)
    act_log_probs = torch.tensor(train_data[4][permute_idxs], dtype = torch.float32)

    #Value Data
    returns = discount_rewards(train_data[2])[permute_idxs]
    returns = torch.tensor(returns, dtype = torch.float32)

    #Train Policy
    ppo.train_policy(obs, act, act_log_probs, gaes)
    ppo.train_value(obs, returns)

    if(episode_idx + 1) % print_freq == 0:
        print('Episode {} | Avg Reward {:.1f}'.format(
            episode_idx + 1, np.mean(ep_rewards[-print_freq])))

Episode 10 | Avg Reward 16.0
Episode 20 | Avg Reward 14.0
Episode 30 | Avg Reward 29.0
Episode 40 | Avg Reward 25.0
Episode 50 | Avg Reward 58.0
Episode 60 | Avg Reward 27.0
Episode 70 | Avg Reward 183.0
Episode 80 | Avg Reward 37.0
Episode 90 | Avg Reward 110.0
Episode 100 | Avg Reward 113.0
Episode 110 | Avg Reward 101.0
Episode 120 | Avg Reward 111.0
Episode 130 | Avg Reward 233.0
Episode 140 | Avg Reward 175.0
Episode 150 | Avg Reward 203.0
Episode 160 | Avg Reward 190.0
Episode 170 | Avg Reward 243.0
Episode 180 | Avg Reward 409.0
Episode 190 | Avg Reward 524.0
Episode 200 | Avg Reward 1000.0
