In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import seaborn as sns
from pprint import pprint
import datetime
import gym
import numpy as np 

import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch.distributions import normal

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("using GPU" if device.type=="cuda" else "using CPU" )
print("Device name: ", torch.cuda.get_device_name())

from ppo import PPO
from memory import Memory

using GPU
Device name:  GeForce RTX 2080 Ti


In [3]:
def reset_config(print_, env_name):
    config = {}
    
    config['env'] = env_name


    config['std'] = 0.001      # constant standard deviation for continuous action space 
    config['gamma'] = 0.99     # Discount rate
    config['lambda'] = 0.95    # GAE parameter
    config['critic'] = {'lr': 1e-3, "hidden":16}
    config['actor'] = {'lr': 1e-3, "hidden":64}
    
    
    config['eps_clipping'] = 0.2 #range : 0.1-0.3 
    config['c1'] = 1      #parameter of the value function loss
    config['c2'] = 1e-3   #entropy parameter --> 1e-4 to 1e-2 
    
    config['epochs'] = 4
    config['max_episodes'] = 1000
    config['max_steps'] = 300
    config['optimize_every'] = 2000
    config['batch_size'] = 10 

    config["solved_reward"] = {'LunarLander-v2':230,
                              'MountainCarContinuous-v0':300,
                              'CartPole-v1':300,
                              'MountainCar-v0':300}
    
    config['seed'] = 42
    
    if print_== True :
        print("Training config : \n")
        pprint(config)
    return config

In [4]:
envs = ['MountainCar-v0','CartPole-v1','MountainCarContinuous-v0','LunarLander-v2']
config = reset_config(True, envs[0])

Training config : 

{'actor': {'hidden': 64, 'lr': 0.001},
 'batch_size': 10,
 'c1': 1,
 'c2': 0.001,
 'critic': {'hidden': 16, 'lr': 0.001},
 'env': 'MountainCar-v0',
 'epochs': 4,
 'eps_clipping': 0.2,
 'gamma': 0.99,
 'lambda': 0.95,
 'max_episodes': 1000,
 'max_steps': 300,
 'optimize_every': 2000,
 'seed': 42,
 'solved_reward': {'CartPole-v1': 300,
                   'LunarLander-v2': 230,
                   'MountainCar-v0': 300,
                   'MountainCarContinuous-v0': 300},
 'std': 0.001}


In [5]:
def training(config):
    
    t1 = datetime.datetime.now()
    episode_count = 0
    timestep_count = 0
    loss_evol = {'loss':[],'dry_loss':[],'entropy':[]}
    rewards_test = []
    
    env = gym.make(config['env'])
    memory = Memory()
    ppo = PPO(config, env, device)

    for ep in range(config["max_episodes"]):
        episode_count +=1
        obs = env.reset()

        for i in range(config["max_steps"]):
            timestep_count +=1 

            memory.observations.append(obs)
            obs_t = torch.from_numpy(obs).float().to(device)  
            action = ppo.actorcritic.select_action(obs_t, memory)  
            memory.values.append(ppo.actorcritic.predict(obs_t))

            if ppo.discrete_action_bool: 
                #convert to integer if discrete action space
                action = int(action)

            next_obs, reward, done, _ = env.step(action) 
            memory.dones.append(done) 
            memory.rewards.append(reward) 

            if (timestep_count % config["optimize_every"]) == 0:
                loss_val, dry_loss_val, entrop_val = ppo.update(memory,next_obs)
                
                loss_evol["loss"].append(loss_val)
                loss_evol["dry_loss"].append(dry_loss_val)
                loss_evol["entropy"].append(entrop_val)
                
                memory.clear_memory()

            if done:
                break 
                
        if ep == 1 or (ep > 0 and ep % 25 == 0) or (ep == config["max_episodes"] - 1):
            rewards_test.append(np.array([ppo.test() for _ in range(50)]))
            print(f'Episode {ep}/{config["max_episodes"]}: Mean rewards: {round(rewards_test[-1].mean(), 2)}, Std: {round(rewards_test[-1].std(), 2)}')

    env.close()
    t2 = datetime.datetime.now()
        
        # save rewards
        #r = pd.DataFrame((itertools.chain(*(itertools.product([i], rewards_test[i]) for i in range(len(rewards_test))))), columns=['Episode', 'Reward'])
        #r["Episode"] = r["Episode"]*25
        #r["loss_name"] = self.loss_name # add loss name as label
        
        # Plot
        #sns.lineplot(x="Episode", y="Reward", data=r, ci='sd',color=config["color"][self.loss_name],label=self.loss_name)
        # Total time ellapsed
        #time = t2-t1
        #print(f'The training was done over a total of {episode_count} episodes')
        #print('Total time ellapsed during training : ',time)
        #r["time"]=time
        #loss_evol = pd.DataFrame(loss_evol).astype(float)
        #loss_evol["loss_name"] = self.loss_name
        #loss_evol["Update"] = range(len(loss_evol))
        #return r, loss_evol

# Training

In [6]:
training(config)

Episode 1/1000: Mean rewards: -200.0, Std: 0.0
Episode 25/1000: Mean rewards: -200.0, Std: 0.0
Episode 50/1000: Mean rewards: -200.0, Std: 0.0
Episode 75/1000: Mean rewards: -200.0, Std: 0.0
Episode 100/1000: Mean rewards: -200.0, Std: 0.0
Episode 125/1000: Mean rewards: -200.0, Std: 0.0
Episode 150/1000: Mean rewards: -200.0, Std: 0.0
Episode 175/1000: Mean rewards: -200.0, Std: 0.0
Episode 200/1000: Mean rewards: -200.0, Std: 0.0
Episode 225/1000: Mean rewards: -200.0, Std: 0.0
Episode 250/1000: Mean rewards: -200.0, Std: 0.0
Episode 275/1000: Mean rewards: -200.0, Std: 0.0
Episode 300/1000: Mean rewards: -200.0, Std: 0.0
Episode 325/1000: Mean rewards: -200.0, Std: 0.0


KeyboardInterrupt: 