In [1]:
import gym
import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

import matplotlib.pyplot as plt

from mlp import *
from feedback_env import *
from learning import *

In [None]:
from stable_baselines3.common.callbacks import BaseCallback


class CustomCallback(BaseCallback):
    """
    Skeleton training callback deriving from ``BaseCallback``.

    Every hook below is currently a no-op (``_on_step`` just returns ``True``),
    so attaching this callback does not alter training.

    :param data: caller-supplied object, kept on the instance as ``self.data``
        so the hooks can reach it. Its expected type is not fixed by this
        class -- TODO confirm against the intended use.
    :param verbose: (int) Verbosity level 0: no output 1: info 2: debug
    """
    def __init__(self, data, verbose=0):
        super().__init__(verbose)
        # Keep the constructor argument; previously it was accepted and then
        # dropped, leaving the hooks with no way to access it.
        self.data = data

    def _on_training_start(self) -> None:
        """
        This method is called before the first rollout starts.
        """
        pass

    def _on_rollout_start(self) -> None:
        """
        A rollout is the collection of environment interaction
        using the current policy.
        This event is triggered before collecting new samples.
        """
        pass

    def _on_step(self) -> bool:
        """
        This method will be called by the model after each call to `env.step()`.

        For child callback (of an `EventCallback`), this will be called
        when the event is triggered.

        :return: (bool) If the callback returns False, training is aborted early.
        """
        return True

    def _on_rollout_end(self) -> None:
        """
        This event is triggered before updating the policy.
        """
        pass

    def _on_training_end(self) -> None:
        """
        This event is triggered before exiting the `learn()` method.
        """
        pass

In [2]:
# Configuration dictionaries for reward learning and policy learning.
# Both are plain dicts; keys are listed in the order they are inserted.

reward_cfg = dict(
    n_sample=100,
    n_epoch=40,
    lr=1e-3,
    verbose=True,
    eval_freq=100,
    batch_size=64,
    split=0.8,
    log=False,
    action_dim=1,
    layers=[5, 5, 5],
)

policy_cfg = dict(
    timesteps=3000,
    verbose=False,
    algo='ppo',  # currently unused
    log=False,
    action_dim=1,
)

In [3]:
# Build the true reward, its noisy counterpart and the comparison function,
# then register the feedback environment and train a PPO policy on it.
def f(x):
    """Base signal: the first component of ``x``."""
    return x[0]


def reward_fn_true(x):
    """Noise-free reward; simply forwards to ``f``."""
    return f(x)


var_1, var_2 = 0.0, 10.0
seed = None
# With seed=None NumPy is not given a fixed seed, so runs are not repeatable.
np.random.seed(seed)


def noise_fn(x):
    """Delegate to ``step_noise`` on the first component of ``x``.

    ``var_1``, ``var_2`` and ``seed`` are read from the module globals at
    call time.
    """
    return step_noise(x[0], x_step=0.8, var_1=var_1, var_2=var_2, seed=seed)


reward_fn_true_noisy = create_reward_fn_1(f, noise_fn, seed=seed)
comparison_fn = create_comparison_fn_1(f, noise_fn, seed=seed)

action_dim = policy_cfg['action_dim']
register_fb_env(reward_fn_true_noisy, action_dim)

# Map the boolean config flag onto the integer verbosity passed to PPO.
verbose = 1 if policy_cfg["verbose"] else 0
timesteps = policy_cfg["timesteps"]
algo = policy_cfg["algo"]

if verbose:
    print("Learning with PPO")

env = make_vec_env("FeedbackEnv-v0", n_envs=1)

# Only pass tensorboard_log when logging is enabled; policy_cfg["log"] is then
# used as the run sub-directory name (it must be a string for the
# concatenation below to succeed).
ppo_kwargs = {"verbose": verbose}
if policy_cfg["log"]:
    ppo_kwargs["tensorboard_log"] = "./log/policy/" + policy_cfg["log"] + "/"
model = PPO("MlpPolicy", env, **ppo_kwargs)
model.learn(total_timesteps=timesteps)

<stable_baselines3.ppo.ppo.PPO at 0x10e944460>