# PPO and A2C

**Note** : this script is inspired from the 1st assignment (without correction) from the RL course of the MVA master by A. Lazaric and M. Pirotta, on finite MDP and function approximation, which required to complete a partial implementation of A2C for discrete action space. It has been extended to include a different critic and actor architecture, continuous action space, and the clipped and adaptative KL losses required for PPO.

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import os
import numpy as np
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint

try :
    import Box2D
except :
    !pip install Box2D
import pickle as pkl

In [3]:
try :
    from google.colab import drive
    drive.mount('/content/drive/')
    %cd /content/drive/My\ Drive/RL-PPO
except :
    print("Script running locally")
from config import reset_config, get_arguments
from utils import plot_sumup
# from ppo import PPOAgent

Script running locally


In [3]:
def reset_config(print_=False):
    config = {}
    config['env'] = "BipedalWalker-v3" ## to choose
#     config['env'] = 'MountainCarContinuous-v0'
    config['std'] = 0.5 # use constant standard deviation for continuous action space (for now)
    config['gamma'] = 0.99 #Discount rate
    config['lambda'] = 1 # parameter of the generalized advantage estimation
    config['lr'] = 0.0003
    config['eps_clipping'] = 0.2 #range : 0.1-0.3
    config['d_targ'] = 0.01
    config['beta_KL'] = 3
    config['c1'] = 1 #paramter of the value function loss
    config['c2'] = 1e-3 #entropy parameter --> 1e-4 to 1e-2
    config["reward_norm"]=False 
    config['epochs'] = 1
    config['max_episodes'] = 1000
    config['max_steps'] = 300
    config['optimize_every'] = 128
    config['batch_size'] = 128
    config["randomize_batch"]=False
    # config['buffer_size'] = 2048 #2048 - 409600 /!\ multiple of the batch size
    config['loss_name'] = ["A2C_loss","adaptative_KL_loss","clipped_loss"][2]
    config['color'] = {"A2C_loss":sns.color_palette("Set2")[0],"adaptative_KL_loss":sns.color_palette("Set2")[1],"clipped_loss":sns.color_palette("Set2")[2]}

    config['seed'] = 42
    config["reset_val"] = None # use to reset the environment with a custom value
    config["solved_reward"] = {'LunarLander-v2':230,
                              'MountainCarContinuous-v0':300,
                              'CartPole-v1':300,
                              'MountainCar-v0':300}
    
    if print_== True :
        print("Training config : \n")
        pprint(config)
    return config


config = reset_config(print_=True)

Training config : 

{'batch_size': 128,
 'beta_KL': 3,
 'c1': 1,
 'c2': 0.001,
 'color': {'A2C_loss': (0.4, 0.7607843137254902, 0.6470588235294118),
           'adaptative_KL_loss': (0.9882352941176471,
                                  0.5529411764705883,
                                  0.3843137254901961),
           'clipped_loss': (0.5529411764705883,
                            0.6274509803921569,
                            0.796078431372549)},
 'd_targ': 0.01,
 'env': 'BipedalWalker-v3',
 'epochs': 1,
 'eps_clipping': 0.2,
 'gamma': 0.99,
 'lambda': 1,
 'loss_name': 'clipped_loss',
 'lr': 0.0003,
 'max_episodes': 1000,
 'max_steps': 300,
 'optimize_every': 128,
 'randomize_batch': False,
 'reset_val': None,
 'reward_norm': False,
 'seed': 42,
 'solved_reward': {'CartPole-v1': 300,
                   'LunarLander-v2': 230,
                   'MountainCar-v0': 300,
                   'MountainCarContinuous-v0': 300},
 'std': 0.5}


In [123]:
import pandas as pd
import itertools
import numpy as np
import datetime
import gym
from gym.spaces import Discrete

import torch
from torch import optim
import torch.nn.functional as F
from torch.distributions import MultivariateNormal
from torch.distributions import Normal

from memory import Memory
# from networks import CustomValueNetwork, CustomDiscreteActorNetwork, ContinuousActorNetwork


class PPOAgent:

	def __init__(self, config):

		self.config = config
		self.memory = Memory()
		self.device = 'cpu'
		self.env = gym.make(config['env'])

		# boolean for discrete action space:
		self.discrete_action_bool = isinstance(self.env.action_space, Discrete)
		self.gamma = config['gamma']
		self.lambd = config['lambda']
		self.c1 = config['c1']
		self.c2 = config['c2']
		self.norm_reward = config["reward_norm"]
		self.loss_name = config['loss_name']
		self.beta_kl = config['beta_KL']

		self.batch_size = config["batch_size"]
		if not(self.discrete_action_bool):
			print("Low : ", self.env.action_space.low)
			print("High : ", self.env.action_space.high)

		# set random seeds
		np.random.seed(config['seed'])
		torch.manual_seed(config['seed'])
		self.env.seed(config['seed'])

		# Critic
		self.value_network = CustomValueNetwork(self.env.observation_space.shape[0], 64, 1).to(self.device)
		self.value_network_optimizer: optim.Optimizer = optim.Adam(
			self.value_network.parameters(), lr=config['lr'])

		# Actor
		if self.discrete_action_bool:
			self.actor_network = CustomDiscreteActorNetwork(self.env.observation_space.shape[0], 64, self.env.action_space.n).to(self.device)
		else:
			self.actor_network = ContinuousActorNetwork(self.env.observation_space.shape[0], 64, self.env.action_space.shape[0], self.config["std"], self.env).to(self.device)

		self.actor_network_optimizer: optim.Optimizer = optim.Adam(
			self.actor_network.parameters(), lr=config['lr'])

		# save in memory policy estimates
		self.probs_list = []    # probability of actions taken
		self.mean_list = []     # mean estimate (for continuous action)

	def _returns_advantages(self, values, next_value):
		"""Returns the cumulative discounted rewards with GAE

		Parameters
		----------
		rewards : array
			An array of shape (batch_size,) containing the rewards given by the env
		dones : array
			An array of shape (batch_size,) containing the done bool indicator given by the env
		values : array
			An array of shape (batch_size,) containing the values given by the value network
		next_value : float
			The value of the next state given by the value network

		Returns
		-------
		returns : array
			The cumulative discounted rewards
		advantages : array
			The advantages
		"""

		rewards = np.array(self.memory.rewards)
		if self.norm_reward:
			rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

		returns, advantages = [], []
		last = next_value
		gae = 0

		for i in reversed(range(len(self.memory))):
			# build the returns
			returns.insert(0, rewards[i] + self.gamma*last*(1-self.memory.dones[i]))

			# build the advantages
			delta = rewards[i] + self.gamma*next_value*(1-self.memory.dones[i]) - values[i]
			gae = delta + self.gamma*self.lambd*(1-self.memory.dones[i])*gae
			advantages.insert(0, gae)
			next_value = values[i]

		returns = torch.FloatTensor(returns).to(self.device)
		advantages = torch.FloatTensor(advantages).to(self.device)

		return returns, advantages

	def training(self, epochs, optimize_every, max_episodes, max_steps):
		t1 = datetime.datetime.now()
		"""Perform a training by batch
			Parameters
			----------
			epochs : int
				Number of epochs
			batch_size : int
				The size of a batch"""

		episode_count = 0
		timestep_count = 0
		rewards_test = []
		solved = False

		loss_evol = {'loss': [], 'dry_loss': [], 'entropy': []}
		if self.loss_name not in ["A2C_loss", "adaptative_KL_loss", "clipped_loss"]:
			print('Unknown loss function, using clipped loss as default loss')
		else:
			print('Loss : ', self.loss_name)

		for ep in range(max_episodes):
			print("episode number "+str(ep)+"/"+str(max_episodes))
			if not solved:
				episode_count += 1
				obs = self.env.reset()

				for i in range(max_steps):
					timestep_count += 1
					# just observed s_t
					self.memory.observations.append(obs)
					#print(self.memory.observations[0].shape)
					# tensor
					obs_t = torch.from_numpy(obs).float().to(self.device)
					# act on just observed, action a_t
					action = self.actor_network.select_action(obs_t.view(1,-1))
# 					print(obs_t.view(1,-1).size())
# 					print(action)

					if self.discrete_action_bool:
						action = int(action)
					self.memory.actions.append(action)

					# Run a step : get new state s_{t+1} and rewards r_t
					#print(action)
					obs, reward, done, _ = self.env.step(action.view(-1))

					# Store termination status reward
					self.memory.dones.append(done)
					self.memory.rewards.append(reward)



					if (timestep_count % optimize_every) == 0:
						# print("starting optim")

						for epoch in range(epochs):
							# print("epoch",epoch)
							loss_val, dry_loss_val, entrop_val = self.optimize_model(obs)
							if epoch == epochs-1:
								loss_evol["loss"].append(loss_val)
								loss_evol["dry_loss"].append(dry_loss_val)
								loss_evol["entropy"].append(entrop_val)

						self.memory.clear_memory()

					if done:
						break

			# Test every 25 episodes
			if ep == 1 or (ep > 0 and ep % 25 == 0) or (ep == max_episodes - 1):
				rewards_test.append(np.array([self.evaluate() for _ in range(50)]))
				print(f'Episode {ep}/{max_episodes}: Mean rewards: {round(rewards_test[-1].mean(), 2)}, Std: {round(rewards_test[-1].std(), 2)}')
				if round(rewards_test[-1].mean(), 2) == 500.:
					solved = True

		self.env.close()
		t2 = datetime.datetime.now()

		# save rewards
		r = pd.DataFrame((itertools.chain(*(itertools.product([i], rewards_test[i]) for i in range(len(rewards_test))))), columns=['Episode', 'Reward'])
		r["Episode"] = r["Episode"]*25
		r["loss_name"] = self.loss_name

		# Total time ellapsed
		time = t2-t1
		print(f'The training was done over a total of {episode_count} episodes')
		print('Total time ellapsed during training : ', time)
		r["time"] = time
		loss_evol = pd.DataFrame(loss_evol).astype(float)
		loss_evol["loss_name"] = self.loss_name
		loss_evol["Update"] = range(len(loss_evol))

		return r, loss_evol

	def compute_proba_ratio(self, prob, actions):
		if self.discrete_action_bool:
			# 1st iteration : initialize old policy to the current one to avoid clipping
			if len(self.probs_list) == 1:
				old_prob = self.probs_list[0]
			else:
				old_prob = self.probs_list[len(self.probs_list)-2]

		else:
			if len(self.mean_list) == 1:
				old_prob_mean = self.mean_list[0]
			else:
				old_prob_mean = self.mean_list[len(self.mean_list)-2]

			#m = normal.Normal(loc=old_prob_mean.float(), scale=torch.tensor(config["std"]*np.ones(actions.size())).float())
			#old_prob = m.log_prob(actions.float()).reshape(actions.size()).detach()
			#print(prob.size())
			#print(old_prob_mean.size())

			# build old probabilities 
			#cov_mat = torch.eye(old_prob_mean.size()[1])*self.config["std"]
			#dist = MultivariateNormal(old_prob_mean, cov_mat)

# 			print("old prob mean",old_prob_mean.size()[1])
# 			print("old prob mean",torch.tensor(self.config['std']*np.ones(old_prob_mean.size()[1])).float())
			diag = torch.tensor(self.config['std']*np.ones(old_prob_mean.size()[1])).float()
# 			print(diag)
			dist = Normal(old_prob_mean, scale = diag)
			#print("hey")
			#print(old_prob_mean.size())
			#print(cov_mat.size())
			#print(actions.size())
			old_prob = dist.log_prob(actions).detach()

			# build new ones
			#print(prob)
			#dist = MultivariateNormal(prob, cov_mat)
			dist = Normal(prob, scale = diag)
			prob = dist.log_prob(actions)

		if self.discrete_action_bool:
			# compute the ratio directly using gather function
			num = prob.gather(1, actions.long().view(-1, 1))
			denom = old_prob.detach().gather(1, actions.long().view(-1, 1))
			ratio_vect = num.view(-1)/denom.view(-1)

		else:
			if np.isnan(prob.cpu().detach().numpy()).any():
				print("NaN encountered in num ratio")

			if np.isnan(old_prob.cpu().detach().numpy()).any():
				print("NaN encountered in denom ratio")

			ratio_vect = prob/(old_prob+1e-6)

		if np.isnan(ratio_vect.cpu().detach().numpy()).any():
			print("NaN encountered in proba ratio")

		return ratio_vect, old_prob


	def clipped_loss(self, prob, actions, advantages):

		ratio_vect = self.compute_proba_ratio(prob, actions)[0]
		#print(ratio_vect.size())

		if len(actions.size()) > 1 and self.discrete_action_bool == False :
# 			print("more than one action")
			ratio_vect = torch.prod(ratio_vect, dim=1)
# 			print("ratio_vect",ratio_vect)

		# Compute the loss
# 		print("ratio vect dim",ratio_vect.size())
# 		print("adv dim",advantages.size())
		loss1 = ratio_vect * advantages
		loss2 = torch.clamp(ratio_vect, 1-self.config['eps_clipping'], 1+self.config['eps_clipping']) * advantages
		loss = - torch.sum(torch.min(loss1, loss2))
		return loss


	def adaptative_KL_loss(self, prob, actions, advantages, observations):

		if self.discrete_action_bool:
			ratio_vect, old_prob = self.compute_proba_ratio(prob, actions)
			kl = torch.zeros(1)
			for i in range(prob.size()[0]):
				kl += (old_prob[i] * (old_prob[i].log() - prob[i].log())).sum()

		else:
			ratio_vect = self.compute_proba_ratio(prob, actions)[0]
			if len(self.mean_list) == 1:
				kl = torch.tensor(0.)
			else:
				mu = prob.view(-1)
				mu_old = self.mean_list[len(self.mean_list)-2].view(-1).detach()
				kl = torch.dot((mu-mu_old)/torch.tensor(config["std"]*np.ones(len(actions))).float(), mu-mu_old)/2

		loss = - torch.sum((ratio_vect*advantages)) + self.beta_kl*kl

		# Update beta values
		if np.isnan(torch.mean(kl).cpu().detach().numpy()):
			print("Nan encountered in average KL divergence")
		if kl < self.config["d_targ"]/1.5:
			self.beta_kl = self.beta_kl / 2
		elif kl > self.config["d_targ"]*1.5:
			self.beta_kl = self.beta_kl * 2
		return loss


	def A2C_loss(self, prob, actions, advantages):
		loss = 0.
		if self.discrete_action_bool:
			for i in range(len(actions)):
				loss -= torch.log(prob[i, int(actions[i])]+1e-6)*advantages[i]
		else:
# 			print(prob.size(),"1")
			#cov_mat = torch.eye(prob.size()[1])*self.config["std"]
			#print(cov_mat.size())
			#dist = MultivariateNormal(prob, cov_mat)
			diag = torch.tensor(self.config["std"]*np.ones(prob.size()[1])).float()
# 			print(diag.size())
			dist = Normal(prob, scale = diag)
			prob = dist.log_prob(actions)
# 			print(actions.size())
# 			print(prob.size(),"2")
			if actions.size()[1]>1:
				prob = torch.prod(prob, dim=1)
# 				print(prob.size())


			loss = torch.dot(torch.log(prob.view(-1)+1e-6), advantages)

		return loss


	def optimize_model(self, next_obs):
# 		print("here")

		losses = {"loss": [], "dry_loss": [], "entropy": []}
		idx = torch.arange(len(self.memory))

		observations = torch.tensor(self.memory.observations).float().to(self.device)
		# print(observations.size())
		if np.isnan(observations.cpu().detach().numpy()).any():
			print("nan in observations")

		if self.discrete_action_bool:		
			actions = torch.tensor(self.memory.actions).float().to(self.device)
		else:
			actions = torch.squeeze(torch.stack(self.memory.actions),1).float().to(self.device)
# 			print(actions.size(), "bouya")


		next_obs = torch.from_numpy(next_obs).float().to(self.device)
		next_value = self.value_network.predict(next_obs)
		values = self.value_network(observations)
		returns, advantages = self._returns_advantages(values, next_value)
		returns = returns.float().to(self.device)
		advantages = advantages.float().to(self.device)

		for i in range(0, returns.size()[0], self.batch_size):

			indices = idx[i:i+self.batch_size]
			batch_observations = observations[i:i+self.batch_size]
			batch_actions = actions[i:i+self.batch_size]
			batch_returns = returns[i:i+self.batch_size]
			batch_advantages = advantages[i:i+self.batch_size]

			# Critic loss
			net_values: torch.Tensor = self.value_network(batch_observations)
			critic_loss = F.mse_loss(net_values.view(-1), batch_returns)
			critic_loss.backward()
			self.value_network_optimizer.step()

			# Actor & Entropy loss
# 			print(batch_observations.size())
			if np.isnan(batch_observations.cpu().detach().numpy()).any():
				print("nan in batch observations")


			prob: torch.Tensor = self.actor_network.forward(batch_observations)
			if np.isnan(prob.cpu().detach().numpy()).any():
# 				print(batch_observations.size(),"d")
				print("NAN HERE")


			if self.discrete_action_bool:
				self.probs_list.append(prob.detach())

			else:
				#cov_mat = torch.eye(prob.size()[1])*self.config["std"]
				#dist = MultivariateNormal(prob, cov_mat)
				#logprob = dist.log_prob(batch_actions)
				#print(logprob.size())
				#dist_entropy = dist.entropy()
				#state_value = self.critic(state)
				#m = normal.Normal(loc=prob.float(), scale=torch.tensor(config["std"]*np.ones(actions.size())).float())
				#logprob = m.log_prob(batch_actions.float()).reshape(batch_actions.size())
				#self.probs_list.append(torch.exp(logprob).detach()) # not very useful
				# append the gaussian mean (used to estimate old probability)
				self.mean_list.append(prob.detach())

			if self.loss_name == "clipped_loss":
				loss = self.clipped_loss(prob, batch_actions, batch_advantages)

			elif self.loss_name == "adaptative_KL_loss":
				loss = self.adaptative_KL_loss(prob, batch_actions, batch_advantages, batch_observations)

			elif self.loss_name == "A2C_loss":
				loss = self.A2C_loss(prob, batch_actions, batch_advantages)

			else:  # use clipped loss as default
				loss = self.clipped_loss(prob, batch_actions, batch_advantages)

			dry_loss = loss
			entropy_term = -torch.sum(prob * torch.log(prob+1e-6))
			# entropy_term = -torch.sum(prob * torch.log(prob+1e-6), dim=1)
			loss -= (self.c2 * entropy_term)
			# loss = loss.sum() - (self.c2 * entropy_term)
			# loss = loss / n_trajs

			loss.backward()
			# loss.sum().backward()
			# loss.mean().backward()
			self.actor_network_optimizer.step()
			self.value_network_optimizer.zero_grad()
			self.actor_network_optimizer.zero_grad()

			losses["loss"].append(loss.mean().item())
			losses["dry_loss"].append(dry_loss.mean().item())
			losses["entropy"].append(entropy_term.mean().item())

		return np.mean(losses["loss"]), np.mean(losses["dry_loss"]), np.mean(losses["entropy"])

	def evaluate(self, render=False):
		env = self.monitor_env if render else self.env
		observation = env.reset()

		observation = torch.from_numpy(observation).float().to(self.device)
		reward_episode = 0
		done = False
		with torch.no_grad():
			while not done:
				policy = self.actor_network(observation)

				if self.discrete_action_bool:
					action = int(torch.multinomial(policy, 1))
				else:
					action = self.actor_network.select_action(observation)
# 				print("evaluate : sampled action : ",action)
				observation, reward, done, info = env.step(action.view(-1))
				observation = torch.from_numpy(observation).float().to(self.device)
				reward_episode += reward

		env.close()
		if render:
			show_video("./gym-results")
			print(f'Reward: {reward_episode}')
		return reward_episode


In [124]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import MultivariateNormal
from torch.distributions import Normal

import sys


import numpy as np 


class CustomValueNetwork(nn.Module):
	"""
	Approximation of the value function V of a state given as input
	FC network with 1 hidden layer and ReLU activations
	Class used as 'critic'
	Inputs :
	input_size : dimension of each state
	hidden_size : number of hidden layers
	output_size : 1 (dimension of the value function estimate)
	"""

	def __init__(self, input_size, hidden_size, output_size=1):
		super(CustomValueNetwork, self).__init__()
		self.fc1 = nn.Linear(input_size, hidden_size)
		self.fc2 = nn.Linear(hidden_size, hidden_size)
		self.fc3 = nn.Linear(hidden_size, output_size)

	def forward(self, x):
		out = F.relu(self.fc1(x.float()))
		out = F.relu(self.fc2(out))
		out = self.fc3(out)
		return out

	def predict(self, x):
		return self(x).cpu().detach().numpy()[0]


class CustomDiscreteActorNetwork(nn.Module):
	"""
	Custom policy model network for discrete action space
	Inputs :
	input_size : state space dimension
	hidden_size : nb of hidden layers (64 in author's paper for continous action space)
	action_size : action space dimension
	"""
	def __init__(self, input_size, hidden_size, action_size):
		super(CustomDiscreteActorNetwork, self).__init__()
		self.fc1 = nn.Linear(input_size, hidden_size)
		self.fc2 = nn.Linear(hidden_size, hidden_size)
		self.fc3 = nn.Linear(hidden_size, action_size)

	def forward(self, x):
		out = torch.tanh(self.fc1(x))
		out = torch.tanh(self.fc2(out))
		out = torch.tanh(self.fc2(out))
		out = F.softmax(self.fc3(out), dim=-1)
		return out

	def select_action(self, x):
		return torch.multinomial(self(x), 1).cpu().detach().numpy()


class ContinuousActorNetwork(nn.Module):
	"""
	Policy model network for continuous action space (from the paper)
	Inputs :
	input_size : state space dimension
	hidden_size : nb of hidden layers used by the authors
	action_size : action space dimension
	"""
	def __init__(self, input_size, hidden_size, action_size, std, env):
		super(ContinuousActorNetwork, self).__init__()
		self.fc1 = nn.Linear(input_size, hidden_size)
		self.fc2 = nn.Linear(hidden_size, hidden_size)
		self.fc3 = nn.Linear(hidden_size, action_size)
		self.std = std
		self.env = env
		self.activ = nn.ReLU()

	def forward(self, x):
		if np.isnan(x.cpu().detach().numpy()).any():
			print("nan at the beginning")
# 		out = torch.tanh(self.fc1(x.float()))
		out = self.activ(self.fc1(x.float()))
		if np.isnan(out.cpu().detach().numpy()).any():
			print("nan after 1")
# 		print("weights",self.fc1.weight)
# 		print("weights",self.fc1.weight.size())
# 		print("min weights",torch.min(self.fc1.weight))
# 		print("max weights",torch.max(self.fc1.weight))
# 		print("minbias",torch.min(self.fc1.bias))
# 		print("maxbias",torch.max(self.fc1.bias))
# 		out = torch.tanh(self.fc2(out))
		out = self.activ(self.fc2(out))
		if np.isnan(out.cpu().detach().numpy()).any():
			print("nan after 2")
# 		out = torch.tanh(self.fc2(out))
		out = self.activ(self.fc2(out))
		if np.isnan(out.cpu().detach().numpy()).any():
			print("nan after 2 bis")
# 		out = torch.tanh(self.fc3(out))
		out = self.activ(self.fc3(out))
		if np.isnan(out.cpu().detach().numpy()).any():
			#print(batch_observations)
			print("Nan at the end of forward")
			print("input tensor x : ",x.float())
			sys.exit(0)
		return out


	def select_action(self, x):
		action_mean = self.forward(x)

		if np.isnan(action_mean.cpu().detach().numpy()).any():
			#print(batch_observations)
			print("Naaaaaaaaan")
		if len(action_mean.size()) == 1 :
# 			print("modify size")
			action_mean = action_mean.unsqueeze(0)
# 			print(action_mean)
		#cov_mat = torch.eye(action_mean.size()[1])*self.std
		#dist = MultivariateNormal(action_mean, cov_mat)

		dist = Normal(action_mean, scale = torch.tensor(self.std*np.ones(action_mean.size()[1])).float())
		action = dist.sample()
# 		print("sampled action",action)
		if np.isnan(action.cpu().detach().numpy()).any():
			print("NAAAAAAAAAAAAAN")
		#action_logprob = dist.log_prob(action)
		#sampled_a = max(self.env.action_space.low, sampled_a)
		#sampled_a = min(self.env.action_space.high, sampled_a)
	 
		return action.detach()



In [None]:
rewards_list = []
loss_list = []
config["epochs"]=1
for loss in ["clipped_loss","adaptative_KL_loss","A2C_loss"]:
# for loss in ["A2C_loss"]:
    print("-----------------"+loss+"-----------------")
    config["loss_name"]=loss
    print(config)
    agent = PPOAgent(config)
    
    rewards, loss = agent.training(config["epochs"], config["optimize_every"], config["max_episodes"], config["max_steps"])
    rewards_list.append(rewards)
    loss_list.append(loss)

-----------------clipped_loss-----------------
{'env': 'BipedalWalker-v3', 'std': 0.5, 'gamma': 0.99, 'lambda': 1, 'lr': 0.0003, 'eps_clipping': 0.2, 'd_targ': 0.01, 'beta_KL': 3, 'c1': 1, 'c2': 0.001, 'reward_norm': False, 'epochs': 1, 'max_episodes': 1000, 'max_steps': 300, 'optimize_every': 128, 'batch_size': 128, 'randomize_batch': False, 'loss_name': 'clipped_loss', 'color': {'A2C_loss': (0.4, 0.7607843137254902, 0.6470588235294118), 'adaptative_KL_loss': (0.9882352941176471, 0.5529411764705883, 0.3843137254901961), 'clipped_loss': (0.5529411764705883, 0.6274509803921569, 0.796078431372549)}, 'seed': 42, 'reset_val': None, 'solved_reward': {'LunarLander-v2': 230, 'MountainCarContinuous-v0': 300, 'CartPole-v1': 300, 'MountainCar-v0': 300}}
Low :  [-1. -1. -1. -1.]
High :  [1. 1. 1. 1.]
Loss :  clipped_loss
episode number 0/1000
episode number 1/1000
Episode 1/1000: Mean rewards: -97.45, Std: 20.98
episode number 2/1000
episode number 3/1000
episode number 4/1000
episode number 5/10

episode number 288/1000
episode number 289/1000
episode number 290/1000
episode number 291/1000
episode number 292/1000
episode number 293/1000
episode number 294/1000
episode number 295/1000
episode number 296/1000
episode number 297/1000
episode number 298/1000
episode number 299/1000
episode number 300/1000
Episode 300/1000: Mean rewards: -87.01, Std: 23.18
episode number 301/1000
episode number 302/1000
episode number 303/1000
episode number 304/1000
episode number 305/1000
episode number 306/1000
episode number 307/1000
episode number 308/1000
episode number 309/1000
episode number 310/1000
episode number 311/1000
episode number 312/1000
episode number 313/1000
episode number 314/1000
episode number 315/1000
episode number 316/1000
episode number 317/1000
episode number 318/1000
episode number 319/1000
episode number 320/1000
episode number 321/1000
episode number 322/1000
episode number 323/1000
episode number 324/1000
episode number 325/1000
Episode 325/1000: Mean rewards: -87.0

In [None]:
#plot_sumup(rewards_list,loss_list,config=config)