In [1]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# HyperParameters

LEARNING_RATE = 0.0003
GAMMA = 0.99
EPSILON_CLIP = 0.2
ENTROPY_COEFF = 0.01
EPOCHS = 10
BATCH_SIZE = 64
TIMESTEPS = 2048

In [3]:
# Action Critic Network

class PPOActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PPOActorCritic, self).__init__()
    
        # actor
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim = -1)
        )

        # critic
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )
    
    def forward(self):
        raise NotImplementedError

    def get_action_and_value(self, state):

        action_probs = self.actor(state)  # pi(a|s) => left|right => ex : [0.7, 0.3]
        state_values = self.critic(state) # v(s), R + rV(s') - "V(s)"

        dist = Categorical(action_probs)
        action = dist.sample() # left|right = [0.75, 0.25] => 0번 선택
        action_logprobs = dist.log_prob(action) # log(0.75)
        entropy = dist.entropy() # entropy([0.75, 0.25])

        return action, action_logprobs, state_values, entropy

In [4]:
class RolloutBuffer():
    def __init__(self):
        self.actions = []
        self.states = []
        self.log_probs = []
        self.rewards = []
        self.state_values = []
        self.dones = []

    def clear(self):
        self.actions = []
        self.states = []
        self.log_probs = []
        self.rewards = []
        self.state_values = []
        self.dones = []

In [5]:
def train_ppo(buffer, old_model, new_model, optimizer):
    # calculate last state_value for : r + gamma * V(t)
    state = buffer.states[-1]
    done = buffer.dones[-1]
    with torch.no_grad():
        discounted_rewards = 0 if done else old_model.get_action_and_value(torch.FloatTensor(state))

    returns = []
    for reward in reversed(buffer.rewards):
        discounted_rewards = reward + GAMMA * discounted_rewards
        returns.insert(0, discounted_rewards)

    advantages = torch.FloatTensor(returns) - torch.FloatTensor(buffer.state_values)

    for _ in range(EPOCHS):
        for idx in range(0, len(buffer.states), BATCH_SIZE):
            batch_states = torch.FloatTensor(buffer.states[idx : idx + BATCH_SIZE])
            batch_actions = torch.LongTensor(buffer.actions[idx : idx + BATCH_SIZE])

            batch_returns = torch.FloatTensor(returns[idx : idx + BATCH_SIZE])
            batch_advantages = torch.FloatTensor(advantages[idx : idx + BATCH_SIZE])

            # new_model에서 새로운 정책 계산
            new_policy_logits = new_model.actor(batch_states)
            values = new_model.critic(batch_states)
            new_policy_dist = Categorical(logits = new_policy_logits)
            new_log_probs = new_policy_dist.log_prob(batch_actions)
            entropy = new_policy_dist.entropy()

            # old_model에서 이전 정책 계산
            with torch.no_grad():
                old_policy_logits = old_model.actor(batch_states)
                old_policy_dist = Categorical(logits = old_policy_logits)
                old_log_probs = old_policy_dist.log_prob(batch_actions)

            # Compute ratio
            ratios = torch.exp(new_log_probs - old_log_probs)

            # PPO Loss
            surrogate1 = ratios * batch_advantages
            surrogate2 = torch.clamp(ratios, 1 - EPSILON_CLIP, 1 + EPSILON_CLIP) * batch_advantages
            policy_loss = -torch.min(surrogate1, surrogate2).mean()

            value_loss = nn.MSELoss()(values.squeeze(), batch_returns)

            entropy_loss = - ENTROPY_COEFF * entropy.mean()

            loss = policy_loss + value_loss + entropy_loss

            # Optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [14]:
env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

old_model = PPOActorCritic(state_dim, action_dim)
new_model = PPOActorCritic(state_dim, action_dim)

new_model.load_state_dict(old_model.state_dict()) # Synchronize models initially
optimizer = optim.Adam([
    {'params' : new_model.actor.parameters(), 'lr' : LEARNING_RATE},
    {'params' : new_model.critic.parameters(), 'lr' : LEARNING_RATE}
])

buffer = RolloutBuffer()
buffer.clear()

for episode in range(1000):
    state, _ = env.reset()
    state = torch.FloatTensor(state)
    episode_reward = 0

    buffer.clear()

    for t in range(TIMESTEPS):
        with torch.no_grad():
            action, log_prob, value, _ = old_model.get_action_and_value(state)
            next_state, reward, done, _, _ = env.step(action.item())

        # Store data
        buffer.states.append(state.numpy())
        buffer.actions.append(action.item())
        buffer.log_probs.append(log_prob.item())
        buffer.rewards.append(reward)
        buffer.state_values.append(value.item())
        buffer.dones.append(done)

        state = torch.FloatTensor(next_state)
        episode_reward += reward

        if done:
            state, _ = env.reset()
            state = torch.FloatTensor(state)
            break

    # Train PPO
    train_ppo(buffer, old_model, new_model, optimizer)
    old_model.load_state_dict(new_model.state_dict()) # Update old_model to match new_model

    print(f"Episode {episode} - Reward: {episode_reward}")

    if (episode_reward > 1000):
        break


Episode 0 - Reward: 22.0
Episode 1 - Reward: 34.0
Episode 2 - Reward: 24.0
Episode 3 - Reward: 36.0
Episode 4 - Reward: 19.0
Episode 5 - Reward: 48.0
Episode 6 - Reward: 55.0
Episode 7 - Reward: 25.0
Episode 8 - Reward: 11.0
Episode 9 - Reward: 86.0
Episode 10 - Reward: 29.0
Episode 11 - Reward: 24.0
Episode 12 - Reward: 45.0
Episode 13 - Reward: 28.0
Episode 14 - Reward: 55.0
Episode 15 - Reward: 107.0
Episode 16 - Reward: 44.0
Episode 17 - Reward: 84.0
Episode 18 - Reward: 55.0
Episode 19 - Reward: 36.0
Episode 20 - Reward: 48.0
Episode 21 - Reward: 33.0
Episode 22 - Reward: 23.0
Episode 23 - Reward: 47.0
Episode 24 - Reward: 31.0
Episode 25 - Reward: 66.0
Episode 26 - Reward: 66.0
Episode 27 - Reward: 52.0
Episode 28 - Reward: 57.0
Episode 29 - Reward: 56.0
Episode 30 - Reward: 59.0
Episode 31 - Reward: 35.0
Episode 32 - Reward: 58.0
Episode 33 - Reward: 36.0
Episode 34 - Reward: 33.0
Episode 35 - Reward: 83.0
Episode 36 - Reward: 39.0
Episode 37 - Reward: 46.0
Episode 38 - Reward: 

  return F.mse_loss(input, target, reduction=self.reduction)


Episode 269 - Reward: 55.0
Episode 270 - Reward: 75.0
Episode 271 - Reward: 51.0
Episode 272 - Reward: 46.0
Episode 273 - Reward: 54.0
Episode 274 - Reward: 98.0
Episode 275 - Reward: 52.0
Episode 276 - Reward: 56.0
Episode 277 - Reward: 58.0
Episode 278 - Reward: 56.0
Episode 279 - Reward: 51.0
Episode 280 - Reward: 38.0
Episode 281 - Reward: 58.0
Episode 282 - Reward: 37.0
Episode 283 - Reward: 40.0
Episode 284 - Reward: 46.0
Episode 285 - Reward: 99.0
Episode 286 - Reward: 62.0
Episode 287 - Reward: 42.0
Episode 288 - Reward: 66.0
Episode 289 - Reward: 56.0
Episode 290 - Reward: 38.0
Episode 291 - Reward: 54.0
Episode 292 - Reward: 80.0
Episode 293 - Reward: 100.0
Episode 294 - Reward: 93.0
Episode 295 - Reward: 134.0
Episode 296 - Reward: 92.0
Episode 297 - Reward: 109.0
Episode 298 - Reward: 71.0
Episode 299 - Reward: 83.0
Episode 300 - Reward: 81.0
Episode 301 - Reward: 79.0
Episode 302 - Reward: 46.0
Episode 303 - Reward: 60.0
Episode 304 - Reward: 46.0
Episode 305 - Reward: 208

TypeError: can't multiply sequence by non-int of type 'float'

In [15]:
import time
env = gym.make("CartPole-v1", render_mode = "human")
state, info = env.reset()

for i in range(500):
    s = torch.from_numpy(state).float().unsqueeze(0)
    action, lp, m, entropy = old_model.get_action_and_value(s)
    state, reward, done, truncated, _ = env.step(action.item())

    env.render()

    time.sleep(0.01)

    if done:
        state, info = env.reset()

env.close()

In [16]:
import time
max_ep_len = 300

total_test_episodes = 5
test_running_reward = 0

env = gym.make("CartPole-v1", render_mode = "human")

for episode in range(1, total_test_episodes+1):
    state, _ = env.reset()
    ep_reward = 0

    for t in range(max_ep_len):
        action_probs = new_model.actor(torch.FloatTensor(state))
        dist = Categorical(action_probs)
        action = dist.sample()

        state, reward, done, truncated, _ = env.step(action.item())
        ep_reward += reward

        env.render()
        time.sleep(0.01)

        if done:
            state, info = env.reset()
    
    test_running_reward += ep_reward
    print(f"Episode : {episode} \t\t Reward : {round(ep_reward, 2)}")
    ep_reward = 0

env.close()

Episode : 1 		 Reward : 300.0
Episode : 2 		 Reward : 300.0
Episode : 3 		 Reward : 300.0
Episode : 4 		 Reward : 300.0
Episode : 5 		 Reward : 300.0
