In [1]:
import random
import torch, os
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import gymnasium as gym
# import gym
import imageio
import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'gymnasium'

In [2]:
# --- Reproducibility / environment -----------------------------------------
SEED = 42
ENV_NAME = "Pendulum-v1"
MAX_ENVS = 8            # number of environments stepped side by side

# --- Rollout sizing ----------------------------------------------------------
MAX_STEPS = 128         # steps collected per environment before each update
TOTAL_STEPS = 5_000_000
BATCH_SIZE = MAX_STEPS * MAX_ENVS
NUM_UPDATES = TOTAL_STEPS // BATCH_SIZE

# --- Optimisation ------------------------------------------------------------
GAMMA = 0.99
LEARNING_RATE = 3e-4
PPO_EPOCHS = 5          # passes over each collected batch
NUM_MINIBATCHES = 4
NUM_MINIBATCHES_SIZE = BATCH_SIZE // NUM_MINIBATCHES  # samples per minibatch

# --- Loss weighting ----------------------------------------------------------
CLIP_VALUE = 0.2        # clipping range applied to the probability ratio
VALUE_COEFF = 0.5
ENTROPY_COEFF = 0.01

# --- Logging -----------------------------------------------------------------
LOG_EVERY_N_STEPS = 50  # counted in updates, not environment steps

print("Num updates: ", NUM_UPDATES)
print("Batch size: ", BATCH_SIZE)
print("Num Minibatch size:", NUM_MINIBATCHES_SIZE)

Num updates:  4882
Batch size:  1024
Num Minibatch size: 256


In [3]:
# Choose the compute device, preferring CUDA when PyTorch can see one.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed each RNG source used by the notebook (NumPy, PyTorch, Python stdlib).
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)

print(device)

cuda


In [4]:
def make_env(idx, env_name, seed, gamma, eval_mode=None, render_mode=None):
    """Create one wrapped environment.

    Actions are always clipped via ``ClipAction``. Unless ``eval_mode`` is
    truthy, observations and rewards are additionally normalised and then
    clipped to [-10, 10].

    Args:
        idx: Index of this environment; added to ``seed`` for the action-space seed.
        env_name: Environment id passed to ``gym.make``.
        seed: Base seed.
        gamma: Discount factor handed to ``NormalizeReward``.
        eval_mode: When truthy, skip the normalisation/clipping wrappers.
        render_mode: Forwarded to ``gym.make``.

    Returns:
        The wrapped environment with its action space seeded to ``seed + idx``.
    """

    def _clip_obs(obs):
        return np.clip(obs, -10, 10)

    def _clip_reward(reward):
        return np.clip(reward, -10, 10)

    env = gym.wrappers.ClipAction(gym.make(env_name, render_mode=render_mode))

    training = not eval_mode
    if training:
        env = gym.wrappers.NormalizeObservation(env)
        # Advertise the clipped range as the observation space of the transform.
        clipped_space = gym.spaces.Box(
            low=-10,
            high=10,
            shape=env.observation_space.shape,
            dtype=np.float32,
        )
        env = gym.wrappers.TransformObservation(env, _clip_obs, observation_space=clipped_space)
        env = gym.wrappers.NormalizeReward(env, gamma=gamma)
        env = gym.wrappers.TransformReward(env, _clip_reward)

    env.action_space.seed(seed + idx)
    return env

In [5]:
# One zero-argument factory per environment; the default argument pins each
# factory's own index at definition time (avoids late binding in the lambda).
envs = gym.vector.SyncVectorEnv([
    (lambda env_idx=env_idx: make_env(env_idx, ENV_NAME, SEED, GAMMA))
    for env_idx in range(MAX_ENVS)
])

In [6]:
# Length of the first axis of the per-environment observation and action spaces.
observation_space, action_space = (
    space.shape[0]
    for space in (envs.single_observation_space, envs.single_action_space)
)
print(f"Action Space: {action_space}")
print(f"Observation Space: {observation_space}")

Action Space: 1
Observation Space: 3


In [7]:
class ActorNet(nn.Module):
    """Gaussian policy network.

    A three-layer tanh MLP produces the action mean. The standard deviation
    comes from a learned parameter that does not depend on the observation.
    Inputs may be batched ``(..., state_space)`` or a single unbatched
    observation ``(state_space,)``.
    """

    def __init__(self, state_space, action_space):
        """Build the layers.

        Args:
            state_space: Size of the observation vector.
            action_space: Size of the action vector.
        """
        super().__init__()
        print(f"State space: {state_space}, action_space: {action_space}")
        self.fc1 = nn.Linear(state_space, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 128)
        # Despite the name, this stores log(std): forward() exponentiates it.
        # The (1, action_space) shape is kept so existing state_dicts still load.
        self.sigma = nn.Parameter(torch.zeros(1, action_space))
        self.mu = nn.Linear(128, action_space)

    def forward(self, x):
        """Return ``(mean, std)`` of the action distribution, both shaped like the mean."""
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        mu = self.mu(x)
        # Flatten (1, A) -> (A,) before expanding: a 2-D tensor cannot be
        # expanded to a 1-D target, so unbatched inputs used to raise here.
        log_std = self.sigma.view(-1).expand_as(mu)
        return mu, log_std.exp()

    def _distribution(self, x):
        """Build the diagonal Normal distribution for observations ``x``."""
        mu, sigma = self.forward(x)
        return Normal(mu, sigma)

    def get_action(self, x):
        """Sample an action.

        Returns:
            ``(action, log_prob, entropy)``; ``log_prob`` and ``entropy`` are
            summed over the last (action) dimension.
        """
        dist = self._distribution(x)
        action = dist.rsample()
        log_prob = dist.log_prob(action).sum(-1)
        entropy = dist.entropy().sum(-1)
        return action, log_prob, entropy

    def evaluate_get_action(self, x, act):
        """Score given actions ``act`` under the current policy.

        Returns:
            ``(log_probs, entropy)``, each summed over the last dimension.
        """
        dist = self._distribution(x)
        log_probs = dist.log_prob(act).sum(-1)
        entropy = dist.entropy().sum(-1)
        return log_probs, entropy

In [8]:
class CriticNet(nn.Module):
    """State-value network: three tanh hidden layers and a single-output head."""

    def __init__(self, state_space, action_space):
        """Build the layers.

        Args:
            state_space: Size of the observation vector.
            action_space: Only reported in the startup message; no layer uses it.
        """
        super().__init__()
        print(f"State space: {state_space}, Action space: {action_space}")
        wide, narrow = 512, 256
        self.fc1 = nn.Linear(state_space, wide)
        self.fc2 = nn.Linear(wide, wide)
        self.fc3 = nn.Linear(wide, narrow)
        self.value = nn.Linear(narrow, 1)

    def forward(self, x):
        """Return the value estimate with a trailing dimension of size 1."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = torch.tanh(layer(hidden))
        return self.value(hidden)

In [9]:
# Instantiate the policy and value networks on the selected device.
actor_network = ActorNet(observation_space, action_space).to(device)
critic_network = CriticNet(observation_space, action_space).to(device)

# A single Adam optimizer updates the parameters of both networks.
optimizer = optim.Adam(
    [*actor_network.parameters(), *critic_network.parameters()],
    lr=LEARNING_RATE,
    eps=1e-5,
)

# Shape smoke test on random observations; gradients are not needed here.
with torch.no_grad():
    x = torch.randn(MAX_ENVS, observation_space, device=device, dtype=torch.float32)
    test_val = critic_network(x)
    test_act, test_log_probs, test_entropy = actor_network.get_action(x)
    print(
        test_act.shape,
        test_log_probs.shape,
        test_entropy.shape,
        test_val.shape,
        sep="\n",
    )

State space: 3, action_space: 1
State space: 3, Action space: 1
torch.Size([8, 1])
torch.Size([8])
torch.Size([8])
torch.Size([8, 1])


In [10]:
def evaluate(envs, model, device, gamma, num_eval_eps=10, record=False, render_mode=None):
    """Roll out ``model`` in a fresh evaluation environment.

    The evaluation environment is built with ``eval_mode=True``, so it has no
    observation/reward normalisation wrappers. Observations are normalised by
    hand with the running statistics of the first training environment, and
    the reported returns are sums of the environment's raw rewards. Actions
    are sampled through ``model.get_action``.

    Args:
        envs: Training vector env; ``get_attr("obs_rms")[0]`` supplies the
            observation mean/variance used for normalisation.
        model: Policy exposing ``get_action(obs_tensor)``.
        device: Device the model and observation tensors are placed on.
        gamma: Forwarded to ``make_env``.
        num_eval_eps: Number of episodes to run.
        record: When true, collect ``eval_env.render()`` output each step.
        render_mode: Forwarded to ``make_env``.

    Returns:
        ``(returns, frames)``: per-episode reward sums and the collected
        frames (empty when ``record`` is false).
    """
    eval_env = make_env(0, ENV_NAME, SEED, gamma, eval_mode=True, render_mode=render_mode)
    eval_env.action_space.seed(SEED)

    model = model.to(device)
    # Remember the caller's mode so it can be restored instead of forcing train().
    was_training = model.training
    model.eval()
    returns = []
    frames = []

    try:
        # The training envs are not stepped during evaluation, so these
        # statistics are constant for the whole call and can be computed once.
        obs_rms = envs.get_attr("obs_rms")[0]
        obs_mean = obs_rms.mean
        obs_std = np.sqrt(obs_rms.var + 1e-8)

        for _episode in range(num_eval_eps):
            obs, _ = eval_env.reset()
            done = False
            episode_reward = 0.0

            while not done:
                if record:
                    frames.append(eval_env.render())

                # Same normalise-then-clip transform the training envs apply.
                norm_obs = np.clip((obs - obs_mean) / obs_std, -10, 10)
                obs_tensor = torch.tensor(norm_obs, device=device, dtype=torch.float32).unsqueeze(0)
                with torch.no_grad():
                    act, _, _ = model.get_action(obs_tensor)

                obs, reward, terminated, truncated, _ = eval_env.step(act.cpu().numpy().flatten())
                done = terminated or truncated
                episode_reward += reward

            returns.append(episode_reward)
    finally:
        # Runs even if the rollout is interrupted or raises, so the env
        # is always released and the model never stays stuck in eval mode.
        eval_env.close()
        model.train(was_training)

    return returns, frames

In [11]:
# Rollout buffers, allocated directly on the training device and indexed
# as [step, env] (plus a trailing feature axis for observations and actions).
obs_storage = torch.zeros(MAX_STEPS, MAX_ENVS, observation_space, device=device)
actions_storage = torch.zeros(MAX_STEPS, MAX_ENVS, action_space, device=device)
logprobs_storage = torch.zeros(MAX_STEPS, MAX_ENVS, device=device)
# The remaining per-step scalars share the (MAX_STEPS, MAX_ENVS) layout.
rewards_storage = torch.zeros_like(logprobs_storage)
dones_storage = torch.zeros_like(logprobs_storage)
values_storage = torch.zeros_like(logprobs_storage)

In [12]:
# Reset every environment with the base seed and stage the first observations
# (as float32) and an all-zero "done" vector on the training device.
next_obs, _ = envs.reset(seed=SEED)
next_obs = torch.tensor(next_obs, dtype=torch.float32, device=device)
next_done = torch.zeros(MAX_ENVS, device=device)
next_obs.shape

torch.Size([8, 3])

In [None]:
# Directory that receives the periodic evaluation videos. A raw string keeps
# the Windows backslashes literal instead of relying on unrecognised escape
# sequences such as "\P" or "\e".
EVAL_VIDEO_DIR = r"B:\Pytorch\RL\eval_episodes"

for update in range(1, NUM_UPDATES + 1):
    # ------------------------------------------------------------------
    # 1) Collect MAX_STEPS transitions from every environment.
    # ------------------------------------------------------------------
    for step in range(0, MAX_STEPS):
        obs_storage[step] = next_obs
        # dones_storage[step] records whether obs_storage[step] is the first
        # observation after an episode ended on the previous step.
        dones_storage[step] = next_done

        with torch.no_grad():
            action, logprob, _ = actor_network.get_action(next_obs)
            value = critic_network(next_obs)

        values_storage[step] = value.flatten()
        actions_storage[step] = action
        logprobs_storage[step] = logprob

        next_obs, reward, terminated, truncated, info = envs.step(action.cpu().numpy())
        # NOTE(review): truncation is treated exactly like termination here, so
        # the value of a time-limit-truncated state is never bootstrapped.
        done = np.logical_or(terminated, truncated)

        rewards_storage[step] = torch.tensor(reward).to(device).view(-1)
        next_obs = torch.Tensor(next_obs).to(device)
        next_done = torch.Tensor(done).to(device)

    # ------------------------------------------------------------------
    # 2) Discounted returns, bootstrapped from the critic at the last state.
    # ------------------------------------------------------------------
    with torch.no_grad():
        returns = torch.zeros_like(rewards_storage).to(device)
        # squeeze(-1) drops only the trailing value dimension, so the result
        # stays 1-D even when there is a single environment.
        bootstrap_value = critic_network(next_obs).squeeze(-1)
        gt_next_state = bootstrap_value * (1.0 - next_done)
        for t in reversed(range(MAX_STEPS)):
            returns[t] = rewards_storage[t] + GAMMA * gt_next_state
            # Zero the carried return where step t started a new episode, so
            # step t-1 does not accumulate reward across the boundary.
            gt_next_state = returns[t] * (1.0 - dones_storage[t])

    # Advantage = return - value baseline, normalised over the whole batch.
    advantages = returns - values_storage
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # ------------------------------------------------------------------
    # 3) Flatten the [step, env] axes into one batch axis.
    # ------------------------------------------------------------------
    b_obs = obs_storage.reshape((-1,) + envs.single_observation_space.shape)
    b_logprobs = logprobs_storage.reshape(-1)
    b_actions = actions_storage.reshape((-1,) + envs.single_action_space.shape)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)

    # ------------------------------------------------------------------
    # 4) PPO update: several epochs of shuffled minibatches.
    # ------------------------------------------------------------------
    b_inds = np.arange(BATCH_SIZE)
    for epoch in range(PPO_EPOCHS):
        np.random.shuffle(b_inds)
        for start in range(0, BATCH_SIZE, NUM_MINIBATCHES_SIZE):
            end = start + NUM_MINIBATCHES_SIZE
            mb_inds = b_inds[start:end]

            # Clipped surrogate objective on the new/old probability ratio.
            new_log_probs, entropy = actor_network.evaluate_get_action(b_obs[mb_inds], b_actions[mb_inds])
            ratio = torch.exp(new_log_probs - b_logprobs[mb_inds])
            pg_loss1 = b_advantages[mb_inds] * ratio
            pg_loss2 = b_advantages[mb_inds] * torch.clamp(ratio, 1 - CLIP_VALUE, 1 + CLIP_VALUE)
            policy_loss = -torch.min(pg_loss1, pg_loss2).mean()

            # squeeze(-1) keeps the prediction 1-D to match b_returns[mb_inds]
            # for any minibatch size.
            current_values = critic_network(b_obs[mb_inds]).squeeze(-1)
            critic_loss = VALUE_COEFF * torch.nn.functional.smooth_l1_loss(current_values, b_returns[mb_inds])

            entropy_loss = entropy.mean()
            loss = policy_loss - ENTROPY_COEFF *  entropy_loss + critic_loss

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(list(actor_network.parameters()) + list(critic_network.parameters()), 1.0)
            optimizer.step()

    # ------------------------------------------------------------------
    # 5) Periodic logging and a recorded evaluation episode.
    #    The logged losses are those of the last minibatch processed.
    # ------------------------------------------------------------------
    if update % LOG_EVERY_N_STEPS == 0:
        print(f"[STEP]: {update}, [ACTOR_LOSS]: {policy_loss.item()}, [CRITIC_LOSS]: {critic_loss.item()}, [TOTAL_LOSS]: {loss.item()}, [REWARDS]: {rewards_storage.mean()}")
        # A separate name keeps the `returns` tensor above from being overwritten.
        eval_returns, frames = evaluate(envs, actor_network, device, GAMMA, record=True, num_eval_eps=1, render_mode='rgb_array')

        if frames:
            # Make sure the target directory exists before writing.
            os.makedirs(EVAL_VIDEO_DIR, exist_ok=True)
            train_video_path = os.path.join(EVAL_VIDEO_DIR, f"ppo_pendulum_{update}.mp4")
            imageio.mimsave(
                train_video_path,
                frames,
                fps=30,
                codec='libx264',
                macro_block_size=1
            )

[STEP]: 50, [ACTOR_LOSS]: -0.01650993525981903, [CRITIC_LOSS]: 0.14201116561889648, [TOTAL_LOSS]: 0.11158302426338196, [REWARDS]: -0.05665622651576996
[STEP]: 100, [ACTOR_LOSS]: -0.0015056682750582695, [CRITIC_LOSS]: 0.1322791874408722, [TOTAL_LOSS]: 0.11683674156665802, [REWARDS]: -0.053673453629016876
[STEP]: 150, [ACTOR_LOSS]: 0.009807728230953217, [CRITIC_LOSS]: 0.23466598987579346, [TOTAL_LOSS]: 0.23061688244342804, [REWARDS]: -0.05489141494035721
[STEP]: 200, [ACTOR_LOSS]: 0.04063758999109268, [CRITIC_LOSS]: 0.23723389208316803, [TOTAL_LOSS]: 0.26442521810531616, [REWARDS]: -0.05328790098428726
[STEP]: 250, [ACTOR_LOSS]: 0.12823787331581116, [CRITIC_LOSS]: 0.21798241138458252, [TOTAL_LOSS]: 0.3329823911190033, [REWARDS]: -0.0516984760761261
[STEP]: 300, [ACTOR_LOSS]: 0.011882031336426735, [CRITIC_LOSS]: 0.16546915471553802, [TOTAL_LOSS]: 0.1644914597272873, [REWARDS]: -0.04353591799736023
[STEP]: 350, [ACTOR_LOSS]: 0.11027967929840088, [CRITIC_LOSS]: 0.04031980782747269, [TOTAL_L

KeyboardInterrupt: 