In [68]:
import gymnasium as gym
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.distributions import Normal

import numpy as np
import random as rd
import math

# Profiler
import cProfile
import re

torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x32dfa73e0>

In [69]:
class ValueFunction(nn.Module):
    def __init__(self, in_features, out_features, hidden_features, critic_learning_rate):
        super().__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.hidden_features = hidden_features
        self.critic_learning_rate = critic_learning_rate
        
        self.critic = nn.Sequential(
            nn.Linear(self.in_features, self.hidden_features),
            nn.ReLU(),
            nn.Linear(self.hidden_features, self.hidden_features),
            nn.ReLU(),
            nn.Linear(self.hidden_features, 1),
            nn.ReLU()
        )

        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.critic_learning_rate)


    def forward(self, input):
        return self.critic(input)
    
class Policy(nn.Module):
    def __init__(self, in_features, out_features, hidden_features, actor_learning_rate, std, device):
        super().__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.hidden_features = hidden_features
        self.actor_learning_rate = actor_learning_rate
        self.std = std
        self.device = device

        self.actor = nn.Sequential(
            nn.Linear(self.in_features, self.hidden_features),
            nn.ReLU(),
            nn.Linear(self.hidden_features, self.hidden_features),
            nn.ReLU(),
            nn.Linear(self.hidden_features, self.out_features),
            nn.Tanh()
        )

        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.actor_learning_rate)

    def forward(self, in_):

        if in_.dim() == 1:
            in_ = in_.unsqueeze(0)
        h = self.actor(in_)

        epsilon = torch.randn(h.size(0), h.size(1)).to(self.device)
        z = h + self.std * epsilon
        return z, h, self.std
    
    def get_log_probability(self, z, mu, std):

        coeff =  1 / (std*math.sqrt(2*math.pi))
        normal_dist = coeff * torch.exp(-0.5 * (((z - mu) / std) ** 2) )
        assert(normal_dist.all() >= 0)

        return torch.log(normal_dist).sum(dim=-1)

In [70]:
class PPO(nn.Module):
    def __init__(self, epochs, training_iterations, batch_size, trajectory_length, n_actors, env, in_features, out_features, hidden_features, device, actor_learning_rate, critic_learning_rate, gamma, lambda_, epsilon, std, beta, d_targ, mode):
        super().__init__()
        
        self.epochs = epochs
        self.training_iterations = training_iterations
        self.batch_size = batch_size
        self.trajectory_length = trajectory_length
        self.n_actors = n_actors
        self.in_features = in_features
        self.out_features = out_features
        self.hidden_features = hidden_features
        self.gamma = gamma
        self.lambda_ = lambda_
        self.epsilon = epsilon
        self.beta = beta
        self.d_targ = d_targ
        self.env = env
        self.device = device
        self.std = std
        self.actor_learning_rate = actor_learning_rate
        self.mode = mode

        self.actor = Policy(in_features, out_features, hidden_features, actor_learning_rate, std, device)
        self.critic = ValueFunction(in_features, out_features, hidden_features, critic_learning_rate)

    
    def train_model(self):

        N = self.n_actors  #number of actors
        T = self.trajectory_length # trajectory length
        
        for i in range(self.training_iterations):
            dataset = []
            print(f"[train]: starting dataset creation at iteration n {i}")

            with torch.no_grad():
                adv_list = []
                cum_reward = 0
                for _ in range(N): #for each actor

                    # initialize first state
                    s_prime, _ = self.env.reset()
                    s_prime = torch.tensor(s_prime, dtype=torch.float32).to(self.device)

                    trajectory = []
                    done = False

                    for t in range(T):
                        action, mu, std = self.actor(s_prime)
                        log_policy = self.actor.get_log_probability(action, mu, std)

                        s, reward, terminated, truncated, _ = self.env.step(action.squeeze(0).cpu().detach().numpy())
                        s = torch.tensor(s, dtype=torch.float32).to(self.device)
                        reward = torch.tensor([[reward]], dtype=torch.float32).to(self.device)
                        s_prime = s_prime.unsqueeze(0)
                        trajectory.append([s_prime, action, reward, log_policy])
                        s_prime = s
                        cum_reward += reward

                        done = terminated or truncated
                        if done:
                            break


                    dynamic_target = 0 if done else self.critic(s)
                    for t in range(len(trajectory)-1, -1, -1): #I want the range from [T-1 to 0]
                        
                        dynamic_target = dynamic_target*self.gamma + trajectory[t][2] #taking the reward
                        advantage = dynamic_target - self.critic(trajectory[t][0])
                        trajectory[t] = tuple(trajectory[t] + [dynamic_target.unsqueeze(0), advantage.unsqueeze(0)])

                        dataset.append(trajectory[t])
                        adv_list.append(advantage)

                adv_std, adv_mean = torch.std_mean(torch.tensor(adv_list))
                print(f"[training]: cum reward {cum_reward}")
            
            print(f"[training]: ending dataset creation with dataset size {len(dataset)}")

            self.actor.zero_grad()
            self.critic.zero_grad()
            # Starts the training process
            for e in range(self.epochs):
                
                print(f"[train]: epoch n {e}")
                avg_loss_value = 0
                avg_loss_ppo = 0
                rd.shuffle(dataset) #shuffle in-place
                
                assert(self.batch_size <= len(dataset))

                for mini_idx in range(0, len(dataset), self.batch_size):
                    
                    # form mini_batch
                    mini_batch = dataset[mini_idx: mini_idx+self.batch_size]

                    state_mini = torch.stack(list(map(lambda elem: elem[0].squeeze(), mini_batch)))
                    action_mini = torch.stack(list(map(lambda elem: elem[1].squeeze(), mini_batch)))
                    log_policy_mini = torch.stack(list(map(lambda elem: elem[3].squeeze(), mini_batch)))
                    advantage_mini = torch.stack(list(map(lambda elem: elem[4].squeeze(), mini_batch)))
                    target_mini = torch.stack(list(map(lambda elem: elem[5].squeeze(), mini_batch)))
                    
                    # Normalize advantage_mini
                    advantage_mini = ((advantage_mini-adv_mean) / (adv_std+0.00001))

                    _, mu_mini, std_mini = self.actor(state_mini) # std is a scalar!
                    new_log_policy_mini = self.actor.get_log_probability(action_mini, mu_mini, std_mini)   

                    new_value_mini = self.critic(state_mini)
                    
                    self.actor.optimizer.zero_grad()
                    self.critic.optimizer.zero_grad()
                    
                    if self.mode == 'clip':
                        loss_ppo = self.loss_clip(new_log_policy_mini, log_policy_mini, advantage_mini)
                    elif (self.mode == 'kl_fixed') or (self.mode == 'kl_adaptive'):
                        loss_ppo = self.loss_kl(new_log_policy_mini, log_policy_mini, advantage_mini)

                    loss_value = self.loss_value(new_value_mini, target_mini)

                    avg_loss_ppo += loss_ppo
                    avg_loss_value += avg_loss_value

                    loss_ppo.backward()
                    loss_value.backward()
                    
                    self.actor.optimizer.step()
                    self.critic.optimizer.step()


                total_minibatch = math.floor(len(dataset) // self.batch_size)
                print(f"[avg actor loss]: {avg_loss_ppo / total_minibatch} \t[critic loss]: {loss_value / total_minibatch}")


            self.save_parameters("model"+str(i)+".pt")
                

    def loss_value(self, value, target):
        #MSE
        return torch.mean((value-target)**2)

    def loss_clip(self, new_log_policy_mini, log_policy_mini, advantage_mini):

        prob_mini = torch.exp(new_log_policy_mini - log_policy_mini)
        prob_adv = prob_mini*advantage_mini
        clip_ = torch.clip(prob_mini, 1-self.epsilon, 1+self.epsilon)*advantage_mini
        return -torch.min(prob_adv, clip_).mean()
    
    def loss_kl(self, new_log_policy_mini, log_policy_mini, advantage_mini):
        
        prob_mini = torch.exp(new_log_policy_mini - log_policy_mini)
        prob_adv = prob_mini * advantage_mini
        d = log_policy_mini - new_log_policy_mini

        if self.mode == 'kl_adaptive':
            if d.detach().mean() < (self.d_targ / 1.5):
                self.beta = self.beta / 2
            elif d.detach().mean() > (self.d_targ * 1.5):
                self.beta = self.beta * 2

        return -(prob_adv - self.beta*d).mean()
        

    def extract_states_prime(self, trajectory):
        return list(map(lambda x: x[0], trajectory))
    
    def save_parameters(self, path):
        torch.save(self.state_dict(), path)



In [None]:
env = gym.make('HalfCheetah-v5', ctrl_cost_weight=0.1)

epochs = 10
training_iterations = 20
batch_size = 64
trajectory_length = 500
n_actors = 10
in_features = env.observation_space.shape[0]
out_features = env.action_space.shape[0]
hidden_features = 64
actor_learning_rate = 1e-4
critic_learning_rate = 1e-4
gamma = 0.99
lambda_ = 0.95
epsilon = 0.2
beta = 1
d_targ = 0.01
std = 0.5
device = "mps"
mode = "kl_adaptive"
modes = ["kl_fixed", "kl_adaptive", "clip", "pc"]
assert(mode in modes)



In [72]:
ppo = PPO(epochs=epochs, 
          training_iterations=training_iterations,
          batch_size=batch_size,
          trajectory_length=trajectory_length, 
          n_actors=n_actors,
          env=env,
          in_features=in_features,
          out_features=out_features,
          hidden_features=hidden_features,
          device=device,
          actor_learning_rate=actor_learning_rate,
          critic_learning_rate=critic_learning_rate,
          gamma=gamma,
          lambda_=lambda_,
          epsilon=epsilon,
          beta = beta,
          d_targ=d_targ,
          std=std,
          mode = mode,
        )
ppo.to(device)

PPO(
  (actor): Policy(
    (actor): Sequential(
      (0): Linear(in_features=17, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=6, bias=True)
      (5): Tanh()
    )
  )
  (critic): ValueFunction(
    (critic): Sequential(
      (0): Linear(in_features=17, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=1, bias=True)
      (5): ReLU()
    )
  )
)

In [None]:
#ppo.load_state_dict(torch.load("final.pt"))
ppo.train_model()
ppo.save_parameters("final_adaptive.pt")

[train]: starting dataset creation at iteration n 0
[training]: cum reward tensor([[-1677.2737]], device='mps:0')
[training]: ending dataset creation with dataset size 5000
[train]: epoch n 0
[avg actor loss]: -0.008941580541431904 	[critic loss]: 15.614160537719727
[train]: epoch n 1
[avg actor loss]: 0.39657339453697205 	[critic loss]: 9.408950805664062
[train]: epoch n 2
[avg actor loss]: -0.009127650409936905 	[critic loss]: 20.85405731201172
[train]: epoch n 3
[avg actor loss]: -0.018223851919174194 	[critic loss]: 15.105571746826172
[train]: epoch n 4
[avg actor loss]: -0.022643450647592545 	[critic loss]: 11.20498275756836
[train]: epoch n 5
[avg actor loss]: -0.022687431424856186 	[critic loss]: 8.253515243530273
[train]: epoch n 6
[avg actor loss]: -0.013345897197723389 	[critic loss]: 25.495573043823242
[train]: epoch n 7
[avg actor loss]: -0.022337574511766434 	[critic loss]: 9.878246307373047
[train]: epoch n 8
[avg actor loss]: -0.021400855854153633 	[critic loss]: 14.4061

KeyboardInterrupt: 

: 

In [None]:
device = 'cpu'

ppo = PPO(epochs=epochs, 
          training_iterations=training_iterations,
          batch_size=batch_size,
          trajectory_length=trajectory_length, 
          n_actors=n_actors,
          env=env,
          in_features=in_features,
          out_features=out_features,
          hidden_features=hidden_features,
          device=device,
          actor_learning_rate=actor_learning_rate,
          critic_learning_rate=critic_learning_rate,
          gamma=gamma,
          lambda_=lambda_,
          epsilon=epsilon,
          beta = beta,
          std=std,
          mode = mode,
        )

ppo.load_state_dict(torch.load("final1.pt"))

env = gym.make('HalfCheetah-v5', ctrl_cost_weight=0.1, render_mode="human")
#env = gym.make('HalfCheetah-v5', ctrl_cost_weight=0.1)
rewards = []
for episode in range(10):
    print(f"ep n {episode}", "\r")
    total_reward = 0
    done = False
    s, _ = env.reset()
    while not done:
        s = torch.tensor(s, dtype=torch.float32)
        z, mu, std = ppo.actor(s)
        s, reward, terminated, truncated, info = env.step(z.squeeze().cpu().detach().numpy())
        s = torch.tensor(s, dtype=torch.float32)
        done = terminated or truncated
        total_reward += reward

ep n 0 


  s = torch.tensor(s, dtype=torch.float32)


ep n 1 
ep n 2 
ep n 3 
ep n 4 
ep n 5 
ep n 6 
ep n 7 
ep n 8 
ep n 9 
