# PPO and A2C

**Note**: this notebook is inspired by the first assignment (provided without the solution) of the RL course of the MVA master's program by A. Lazaric and M. Pirotta, on finite MDPs and function approximation, which required completing a partial implementation of A2C for discrete action spaces. It has been extended to include a different critic and actor architecture, continuous action spaces, and the clipped and adaptive KL losses required for PPO.

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint

try :
    import Box2D
except :
    !pip install Box2D
import pickle as pkl

from config import reset_config, get_arguments
from utils import plot_sumup
from ppo import PPOAgent

In [3]:
try :
    from google.colab import drive
    drive.mount('/content/drive/')
    %cd /content/drive/My\ Drive/RL-PPO
except :
    print("Script running locally")

Script running locally


In [8]:
def reset_config(print_=False):
    """Build and return a fresh dictionary of default training settings.

    A new dictionary is created on every call, so the returned object can be
    mutated freely by the caller.

    Note: this definition shadows the ``reset_config`` name imported from the
    ``config`` module earlier in the notebook.

    NOTE(review): ``config['solved_reward']`` has no entry for the default
    environment ``"BipedalWalker-v3"`` - confirm how the agent handles an
    environment without a threshold.

    Parameters
    ----------
    print_ : bool, optional
        When truthy, pretty-print the configuration to stdout before
        returning it. Defaults to ``False``.

    Returns
    -------
    dict
        The configuration dictionary. Keys are inserted in a fixed order
        (``'env'`` first, ``'solved_reward'`` last).
    """
    loss_names = ["A2C_loss", "adaptative_KL_loss", "clipped_loss"]
    # Query the palette once; the i-th loss gets the i-th "Set2" colour.
    palette = sns.color_palette("Set2")

    config = {
        'env': "BipedalWalker-v3",

        'std': 0.5,  # constant standard deviation for continuous action space (for now)
        'gamma': 0.99,  # discount rate
        'lambda': 1,  # parameter of the generalized advantage estimation
        'lr': 0.0003,
        'eps_clipping': 0.2,  # range: 0.1-0.3
        'd_targ': 0.01,
        'beta_KL': 3,
        'c1': 1,  # parameter of the value function loss
        'c2': 1e-3,  # entropy parameter --> 1e-4 to 1e-2
        'reward_norm': False,
        'epochs': 1,
        'max_episodes': 1000,
        'max_steps': 300,
        'optimize_every': 128,
        'batch_size': 128,
        'randomize_batch': False,
        # 'buffer_size': 2048,  # 2048 - 409600 /!\ multiple of the batch size
        'loss_name': loss_names[2],
        'color': {name: palette[idx] for idx, name in enumerate(loss_names)},

        'seed': 42,
        'reset_val': None,  # used to reset the environment with a custom value
        'solved_reward': {'LunarLander-v2': 230,
                          'MountainCarContinuous-v0': 300,
                          'CartPole-v1': 300,
                          'MountainCar-v0': 300},
    }

    # Plain truthiness test instead of comparing against True with `==`.
    if print_:
        print("Training config : \n")
        pprint(config)
    return config


# Build the default configuration and pretty-print it so the settings used
# for this run are recorded in the notebook output.
config = reset_config(print_=True)

Training config : 

{'batch_size': 128,
 'beta_KL': 3,
 'c1': 1,
 'c2': 0.001,
 'color': {'A2C_loss': (0.4, 0.7607843137254902, 0.6470588235294118),
           'adaptative_KL_loss': (0.9882352941176471,
                                  0.5529411764705883,
                                  0.3843137254901961),
           'clipped_loss': (0.5529411764705883,
                            0.6274509803921569,
                            0.796078431372549)},
 'd_targ': 0.01,
 'env': 'BipedalWalker-v3',
 'epochs': 1,
 'eps_clipping': 0.2,
 'gamma': 0.99,
 'lambda': 1,
 'loss_name': 'clipped_loss',
 'lr': 0.0003,
 'max_episodes': 1000,
 'max_steps': 300,
 'optimize_every': 128,
 'randomize_batch': False,
 'reset_val': None,
 'reward_norm': False,
 'seed': 42,
 'solved_reward': {'CartPole-v1': 300,
                   'LunarLander-v2': 230,
                   'MountainCar-v0': 300,
                   'MountainCarContinuous-v0': 300},
 'std': 0.5}


# BipedalWalker

In [9]:
# Train one agent per loss function and collect what `agent.training` returns.
# rewards_list[i] / loss_list[i] hold the two values returned for the i-th loss.
rewards_list = []
loss_list = []
config["epochs"]=1
#for loss_name in ["clipped_loss","adaptative_KL_loss","A2C_loss"]:
for loss_name in ["A2C_loss"]:
    print("-----------------"+loss_name+"-----------------")
    # The agent reads the loss to optimise from the shared config dictionary.
    config["loss_name"]=loss_name
    print(config)
    agent = PPOAgent(config)

    # The loop variable is named `loss_name` so that unpacking the result into
    # `loss` no longer overwrites the variable driving the loop.
    # NOTE(review): presumably `rewards` and `loss` are training histories - confirm in ppo.py.
    rewards, loss = agent.training(config["epochs"], config["optimize_every"], config["max_episodes"], config["max_steps"])
    rewards_list.append(rewards)
    loss_list.append(loss)

-----------------A2C_loss-----------------
{'env': 'BipedalWalker-v3', 'std': 0.5, 'gamma': 0.99, 'lambda': 1, 'lr': 0.0003, 'eps_clipping': 0.2, 'd_targ': 0.01, 'beta_KL': 3, 'c1': 1, 'c2': 0.001, 'reward_norm': False, 'epochs': 1, 'max_episodes': 1000, 'max_steps': 300, 'optimize_every': 128, 'batch_size': 128, 'randomize_batch': False, 'loss_name': 'A2C_loss', 'color': {'A2C_loss': (0.4, 0.7607843137254902, 0.6470588235294118), 'adaptative_KL_loss': (0.9882352941176471, 0.5529411764705883, 0.3843137254901961), 'clipped_loss': (0.5529411764705883, 0.6274509803921569, 0.796078431372549)}, 'seed': 42, 'reset_val': None, 'solved_reward': {'LunarLander-v2': 230, 'MountainCarContinuous-v0': 300, 'CartPole-v1': 300, 'MountainCar-v0': 300}}
Low :  [-1. -1. -1. -1.]
High :  [1. 1. 1. 1.]
Loss :  A2C_loss
here
Nan
Naaaaaaaaan
NAAAAAAAAAAAAAN


AssertionError: r.LengthSquared() > 0.0f

In [None]:
#plot_sumup(rewards_list,loss_list,config=config)

In [None]:
import torch

In [None]:
# Scratch cell: display a 4x4 identity tensor.
torch.eye(4)

In [None]:
import gym

In [None]:
# Create a BipedalWalker-v3 environment for manual inspection.
env = gym.make("BipedalWalker-v3")

In [None]:
# Display the environment's action space.
env.action_space

In [None]:
# Display the environment's observation space.
env.observation_space