In [62]:
import pybullet_envs
# Don't forget to install PyBullet!
from gym import make
import numpy as np
import torch
from torch import nn
from torch.distributions import Normal
from torch.nn import functional as F
from torch.optim import Adam
import random
from tqdm.notebook import tqdm

In [None]:
if __name__ == "__main__":
    env = make(ENV_NAME)
    ppo = PPO(state_dim=env.observation_space.shape[0], action_dim=env.action_space.shape[0])
    state = env.reset()
    episodes_sampled = 0
    steps_sampled = 0
    
    for i in range(ITERATIONS):
        trajectories = []
        steps_ctn = 0
        
        while len(trajectories) < MIN_EPISODES_PER_UPDATE or steps_ctn < MIN_TRANSITIONS_PER_UPDATE:
            traj = sample_episode(env, ppo)
            steps_ctn += len(traj)
            trajectories.append(traj)
        episodes_sampled += len(trajectories)
        steps_sampled += steps_ctn

        ppo.update(trajectories)        
        
        if (i + 1) % (ITERATIONS//100) == 0:
            rewards = evaluate_policy(env, ppo, 5)
            print(f"Step: {i+1}, Reward mean: {np.mean(rewards)}, Reward std: {np.std(rewards)}, Episodes: {episodes_sampled}, Steps: {steps_sampled}")
            ppo.save()

In [2]:
ENV_NAME = "Walker2DBulletEnv-v0"

LAMBDA = 0.95
GAMMA = 0.99

ACTOR_LR = 2e-4
CRITIC_LR = 1e-4

CLIP = 0.2
ENTROPY_COEF = 1e-2
BATCHES_PER_UPDATE = 64
BATCH_SIZE = 64

MIN_TRANSITIONS_PER_UPDATE = 2048
MIN_EPISODES_PER_UPDATE = 4

ITERATIONS = 1000

In [3]:
def compute_lambda_returns_and_gae(trajectory):
    lambda_returns = []
    gae = []
    last_lr = 0.
    last_v = 0.
    for _, _, r, _, v in reversed(trajectory):
        ret = r + GAMMA * (last_v * (1 - LAMBDA) + last_lr * LAMBDA)
        last_lr = ret
        last_v = v
        lambda_returns.append(last_lr)
        gae.append(last_lr - v)
    
    # Each transition contains state, action, old action probability, value estimation and advantage estimation
    return [(s, a, p, v, adv) for (s, a, _, p, _), v, adv in zip(trajectory, reversed(lambda_returns), reversed(gae))]

In [None]:


    def get_action_distribution(self, state):
        mu, log_sigma = torch.chunk(self.model(state), 2, dim=-1)
        sigma = torch.exp(log_sigma)
        return Normal(mu, sigma) # batch_size x action_size

    def get_logprob(self, state, action):
        distrib = self.get_action_distribution(state)
        return distrib.log_prob(action).sum(-1)

In [165]:
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_size=256):
        super().__init__()
        # Advice: use same log_sigma for all states to improve stability
        # You can do this by defining log_sigma as nn.Parameter(torch.zeros(...))
        self.model = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.ELU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ELU(),
            nn.Linear(hidden_size, 2 * action_dim)
        )
        #self.sigma = torch.zeros(action_dim)
        self.sigma = nn.Parameter(torch.zeros(action_dim))
        
    def compute_proba(self, state, action):
        # Returns probability of action according to current policy and distribution of actions
        #with torch.no_grad():
            #action = torch.Tensor([action]).to('cpu')
            #state = torch.tensor([state], dtype=torch.float32).to('cpu')
        mu, log_sigma = torch.chunk(self.model(state), 2, dim=-1)
        self.sigma = nn.Parameter(torch.exp(log_sigma))
        distrib = Normal(mu, self.sigma)        
        action_prob = distrib.log_prob(action).sum(-1)
        return action_prob, distrib
        
    def act(self, state):
        # Returns an action (with tanh), not-transformed action (without tanh) and distribution of non-transformed actions
        # Remember: agent is not deterministic, sample actions from distribution (e.g. Gaussian)
        with torch.no_grad():
            #state = torch.tensor([state], dtype=torch.float32).to('cpu')
            mu, log_sigma = torch.chunk(self.model(state), 2, dim=-1)
        #self.sigma = nn.Parameter(torch.exp(log_sigma))
        sigma = torch.exp(log_sigma)
        distrib = Normal(mu, sigma)
        action = distrib.sample().cpu()#.numpy()[0]
        return np.tanh(action), action, distrib
        
        
class Critic(nn.Module):
    def __init__(self, state_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ELU(),
            nn.Linear(256, 256),
            nn.ELU(),
            nn.Linear(256, 1)
        )
        
    def get_value(self, state):
        return self.model(state)


class PPO:
    def __init__(self, state_dim, action_dim):
        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim)
        self.actor_optim = Adam(self.actor.parameters(), ACTOR_LR)
        self.critic_optim = Adam(self.critic.parameters(), CRITIC_LR)

    def update(self, trajectories):
        transitions = [t for traj in trajectories for t in traj] # Turn a list of trajectories into list of transitions
        state, action, old_prob, target_value, advantage = zip(*transitions)
        state = np.array(state)
        action = np.array(action)
        old_prob = np.array(old_prob)
        target_value = np.array(target_value)
        advantage = np.array(advantage)
        advnatage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
        
        
        for idx_l in range(BATCHES_PER_UPDATE):
            idx = np.random.randint(0, len(transitions), BATCH_SIZE) # Choose random batch
        #for batch_ofs in range(0, len(transitions), BATCH_SIZE):
            #batch_l = batch_ofs + BATCH_SIZE
            s = torch.tensor(state[idx]).float()
            a = torch.tensor(action[idx]).float()
            op = torch.tensor(old_prob[idx]).float() # Probability of the action in state s.t. old policy
            targets = torch.tensor(target_value[idx]).float() # Estimated by lambda-returns 
            adv = torch.tensor(advantage[idx]).float() # Estimated by generalized advantage estimation 
                       
            # TODO: Update critic here
            
            self.critic_optim.zero_grad()
            values = self.critic.get_value(s)
            critic_loss = F.mse_loss(values.squeeze(), targets)
            critic_loss.backward()
            self.critic_optim.step()
            
             # TODO: Update actor here
            self.actor_optim.zero_grad()
            log_prob_pi, action_distrib = self.actor.compute_proba(s, a)
            r = torch.exp(log_prob_pi - op)
            r_clipped = torch.clamp(r, 1.0 - CLIP, 1.0 + CLIP)
            
            entropy_loss = -action_distrib.entropy()
            actor_loss = -(torch.min(r * adv, r_clipped * adv)).mean() + ENTROPY_COEF * entropy_loss
            actor_loss.sum().backward()
            self.actor_optim.step()
            
            
    def get_value(self, state):
        with torch.no_grad():
            state = torch.tensor(np.array([state])).float()
            value = self.critic.get_value(state)
        return value.cpu().item()

    def act(self, state):
        with torch.no_grad():
            state = torch.tensor(np.array([state])).float()
            action, pure_action, distr = self.actor.act(state)
            prob = torch.exp(distr.log_prob(pure_action).sum(-1))
        return action.cpu().numpy()[0], pure_action.cpu().numpy()[0], prob.cpu().item()

    def save(self):
        torch.save(self.actor, "agent.pkl")


def evaluate_policy(env, agent, episodes=5):
    returns = []
    for _ in range(episodes):
        done = False
        state = env.reset()
        total_reward = 0.
        
        while not done:
            state, reward, done, _ = env.step(agent.act(state)[0])
            total_reward += reward
        returns.append(total_reward)
    return returns
   

def sample_episode(env, agent):
    s = env.reset()
    d = False
    trajectory = []
    while not d:
        a, pa, p = agent.act(s)
        v = agent.get_value(s)
        ns, r, d, _ = env.step(a)
        trajectory.append((s, pa, r, p, v))
        s = ns
    return compute_lambda_returns_and_gae(trajectory)

In [166]:
env = make(ENV_NAME)
ppo = PPO(state_dim=env.observation_space.shape[0], action_dim=env.action_space.shape[0])
state = env.reset()
episodes_sampled = 0
steps_sampled = 0

for i in tqdm(range(ITERATIONS)):
    trajectories = []
    steps_ctn = 0

    while len(trajectories) < MIN_EPISODES_PER_UPDATE or steps_ctn < MIN_TRANSITIONS_PER_UPDATE:
        traj = sample_episode(env, ppo)
        steps_ctn += len(traj)
        trajectories.append(traj)
    episodes_sampled += len(trajectories)
    steps_sampled += steps_ctn

    ppo.update(trajectories)        

    if (i + 1) % (ITERATIONS//100) == 0:
        rewards = evaluate_policy(env, ppo, 5)
        print(f"Step: {i+1}, Reward mean: {np.mean(rewards)}, Reward std: {np.std(rewards)}, Episodes: {episodes_sampled}, Steps: {steps_sampled}")
        ppo.save()

  0%|          | 0/1000 [00:00<?, ?it/s]

Step: 10, Reward mean: 32.13324137181538, Reward std: 14.824891031817376, Episodes: 668, Steps: 20679
Step: 20, Reward mean: 53.08222199150507, Reward std: 11.870823899245186, Episodes: 926, Steps: 41739
Step: 30, Reward mean: 57.99270657505841, Reward std: 18.64929467187126, Episodes: 1214, Steps: 62632
Step: 40, Reward mean: 46.96324923671579, Reward std: 6.491880535246846, Episodes: 1531, Steps: 83393
Step: 50, Reward mean: 62.683664823365405, Reward std: 5.618815119914279, Episodes: 1847, Steps: 104195
Step: 60, Reward mean: 46.937245299557404, Reward std: 16.61256230690758, Episodes: 2160, Steps: 124999
Step: 70, Reward mean: 45.92411787030753, Reward std: 24.014944284207797, Episodes: 2447, Steps: 145871
Step: 80, Reward mean: 56.72632210869256, Reward std: 20.591791806520696, Episodes: 2771, Steps: 166696
Step: 90, Reward mean: 53.38119336348261, Reward std: 21.370727712765632, Episodes: 3062, Steps: 187388
Step: 100, Reward mean: 59.67915498116315, Reward std: 26.60305481878024

KeyboardInterrupt: 