In [4]:
%%file config.py
import numpy as np
import minirl.neural_nets.nn.init
import minirl.neural_nets.nn.optim
from minirl.neural_nets.nn.init import custom
from minirl.neural_nets.nn.optim import rmsprop

class Config(object):
    """Container for training and model hyperparameters.

    Every attribute assigned as a default in ``__init__`` can be overridden
    by an attribute of the same name on ``args``. Attributes on ``args`` that
    do not match a default are ignored.
    """

    def __init__(self, args):
        """Set the defaults, then apply overrides from ``args``.

        Args:
            args: Object carrying overrides as instance attributes (anything
                ``vars()`` accepts, e.g. an ``argparse.Namespace``), or
                ``None`` to keep every default.
        """
        # The model reads `config.ctx`, so it must always exist on the config.
        # NOTE(review): presumably a device context; confirm which values the
        # array constructor accepts and whether None selects a usable default.
        self.ctx = None

        # Default training settings
        self.init_func = custom
        self.init_config = {
            # Standard-normal weights scaled by 1/sqrt(shape[1]); indexes
            # shape[0] and shape[1], so it needs at least a 2-D shape.
            'function': lambda shape: np.random.randn(shape[0], shape[1]) / np.sqrt(shape[1])
        }
        self.learning_rate = 1e-3
        self.update_rule = rmsprop
        self.grad_clip = True
        self.clip_magnitude = 40.0

        # Default model settings
        self.hidden_size = 200
        self.gamma = 0.99
        self.lambda_ = 1.0
        self.vf_wt = 0.5        # Weight of value function term in the loss
        self.entropy_wt = 0.01  # Weight of entropy term in the loss

        # Override defaults with values from `args`. Only names that already
        # exist as defaults are copied, so unrelated arguments never leak in.
        overrides = {} if args is None else vars(args)
        for name in list(vars(self)):
            if name in overrides:
                setattr(self, name, overrides[name])

Writing config.py


In [5]:
%%file model.py
from itertools import chain
import numpy
import scipy.signal

import minirl.neural_nets.nn.init
import minirl.neural_nets.core as core
import minirl.neural_nets.numpy as np
from minirl.neural_nets.nn.model import ModelBase
import minirl.neural_nets.numpy as np
from minirl.neural_nets.nn.init import constant

class Agent(ModelBase):
    """Actor-critic model with a shared ReLU layer and two linear heads.

    The shared layer ``fc1`` feeds a softmax policy head (``policy_fc_last``)
    and a scalar value head (``vf_fc_last`` plus ``vf_fc_last_bias``).
    Parameters are registered with ``add_param`` and read back through
    ``self.params`` / ``self.param_configs``.
    NOTE(review): those attributes are presumably supplied by ``ModelBase``;
    confirm against its implementation.
    """

    def __init__(self, input_size, act_space, config):
        """Register the parameters, initialise them and set up optimiser state.

        Args:
            input_size: Length of one observation vector.
            act_space: Number of discrete actions.
            config: Settings object; this class reads ``ctx``,
                ``hidden_size``, ``learning_rate``, ``init_func``,
                ``init_config``, ``gamma``, ``lambda_``, ``vf_wt``,
                ``entropy_wt``, ``grad_clip``, ``clip_magnitude`` and
                ``update_rule`` from it.
        """
        super(Agent, self).__init__()
        self.ctx = config.ctx
        self.act_space = act_space
        self.config = config
        self.add_param('fc1', (config.hidden_size, input_size))
        self.add_param('policy_fc_last', (act_space, config.hidden_size))
        self.add_param('vf_fc_last', (1, config.hidden_size))
        self.add_param('vf_fc_last_bias', (1,))

        self._init_params()

        # One optimiser-state dict per parameter, seeded with the learning rate.
        self.optim_configs = {}
        for p in self.param_configs:
            self.optim_configs[p] = {'learning_rate': self.config.learning_rate}

    def forward(self, X):
        """Compute action probabilities and state values for a batch.

        Args:
            X: Observations, one row per sample (``(batch, input_size)``).

        Returns:
            Tuple ``(ps, vs)``: ``ps`` holds per-row softmax action
            probabilities (``(batch, act_space)``) and ``vs`` the value
            estimates (``(batch, 1)``).
        """
        a = np.dot(self.params['fc1'], X.T)
        h = np.maximum(0, a)
        logits = np.dot(h.T, self.params['policy_fc_last'].T)
        # Subtract the row maximum before exponentiating for numerical stability.
        ps = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        ps /= np.sum(ps, axis=1, keepdims=True)
        vs = np.dot(h.T, self.params['vf_fc_last'].T) + self.params['vf_fc_last_bias']
        return ps, vs

    def loss(self, ps, as_, vs, rs, advs):
        """Combine policy-gradient, value-function and entropy terms.

        Args:
            ps: Action probabilities from ``forward``.
            as_: One-hot encoded actions that were taken.
            vs: Value estimates from ``forward``.
            rs: Discounted returns used as value targets.
            advs: Advantage estimates weighting the log-probabilities.

        Returns:
            ``policy_grad_loss + vf_wt * vf_loss - entropy_wt * entropy``.
        """
        # Clamp probabilities away from 0 and 1 so the logarithms stay finite.
        ps = np.maximum(1.0e-5, np.minimum(1.0 - 1e-5, ps))
        policy_grad_loss = -np.sum(np.log(ps) * as_ * advs)
        vf_loss = 0.5*np.sum((vs - rs)**2)
        entropy = -np.sum(ps*np.log(ps))
        loss_ = policy_grad_loss + self.config.vf_wt*vf_loss - self.config.entropy_wt*entropy
        return loss_

    def act(self, ps):
        """Sample one action per row of ``ps``.

        Args:
            ps: Action probabilities; must provide ``asnumpy()``.

        Returns:
            NumPy array of sampled action indices, one per row.
        """
        us = numpy.random.uniform(size=ps.shape[0])[:, np.newaxis]
        # Inverse-CDF sampling: the first index whose cumulative probability
        # exceeds the uniform draw is the sampled action.
        as_ = (numpy.cumsum(ps.asnumpy(), axis=1) > us).argmax(axis=1)
        return as_

    def train_step(self, env_xs, env_as, env_rs, env_vs):
        """Run one gradient update from trajectories of several environments.

        Args:
            env_xs: Per-environment lists of observations.
            env_as: Per-environment lists of action indices.
            env_rs: Per-environment lists of rewards.
            env_vs: Per-environment lists of value estimates. The advantage
                computation pairs ``env_vs[i][1:]`` with ``env_rs[i]``, so
                each list carries one more entry than the matching reward
                list; the last entry is the bootstrap value.
        """
        # Stack all the observations and actions.
        xs = np.vstack(list(chain.from_iterable(env_xs)))
        as_ = numpy.array(list(chain.from_iterable(env_as)))[:, np.newaxis]
        # One-hot encode the actions.
        buf = numpy.zeros([xs.shape[0], self.act_space])
        as_ = np.onehot_encode(np.array(as_.ravel(), self.ctx), buf).asnumpy()

        # Compute discounted rewards and advantages.
        drs, advs = [], []
        gamma, lambda_ = self.config.gamma, self.config.lambda_
        for i in range(len(env_vs)):
            # Compute discounted rewards with a 'bootstrapped' final value.
            rs_bootstrap = [] if env_rs[i] == [] else env_rs[i] + [env_vs[i][-1]]
            drs.extend(self._discount(rs_bootstrap, gamma)[:-1])

            # Compute advantages using Generalized Advantage Estimation;
            # see eqn. (16) of [Schulman 2016].
            delta_t = env_rs[i] + gamma*numpy.array(env_vs[i][1:]) - numpy.array(env_vs[i][:-1])
            advs.extend(self._discount(delta_t, gamma * lambda_))

        drs = numpy.array(drs)[:, np.newaxis]
        advs = numpy.array(advs)[:, np.newaxis]

        def loss_func(*params):
            # `params` is ignored; the loss is evaluated through self.params.
            ps, vs = self.forward(xs)
            loss_ = self.loss(ps, as_, vs, drs, advs)
            return loss_

        grads = self._forward_backward(loss_func)
        self._update_params(grads)

    def _discount(self, x, gamma):
        """Return the discounted cumulative sum ``y[t] = x[t] + gamma * y[t+1]``."""
        # Filtering the reversed sequence turns the backward recursion into a
        # forward first-order IIR filter; the result is reversed back.
        return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

    def _forward_backward(self, loss_func):
        """Differentiate ``loss_func`` with respect to every parameter.

        Returns:
            Dict mapping parameter name to its gradient, clipped elementwise
            to ``[-clip_magnitude, clip_magnitude]`` when ``grad_clip`` is set.
        """
        param_arrays = list(self.params.values())
        param_keys = list(self.params.keys())
        # A concrete list of indices avoids handing a lazy Python 3 range to
        # the autograd wrapper.
        argnum = list(range(len(param_arrays)))
        grad_and_loss_func = core.grad_and_loss(loss_func, argnum=argnum)
        grad_arrays, _ = grad_and_loss_func(*param_arrays)
        grads = dict(zip(param_keys, grad_arrays))
        if self.config.grad_clip:
            # dict.items() exists on Python 2 and 3 (iteritems() is Python 2
            # only); iterating a snapshot keeps reassignment in the loop safe.
            for k, v in list(grads.items()):
                grads[k] = numpy.clip(v, -self.config.clip_magnitude, self.config.clip_magnitude)

        return grads

    def _update_params(self, grads):
        """Apply ``config.update_rule`` to each parameter with its gradient."""
        # Snapshot the items because self.params is reassigned inside the loop.
        for p, w in list(self.params.items()):
            dw = grads[p]
            config = self.optim_configs[p]
            next_w, next_config = self.config.update_rule(w, dw, config)
            self.params[p] = next_w
            self.optim_configs[p] = next_config

    def _init_params(self):
        """Initialise parameters: ``constant`` for biases, ``config.init_func`` otherwise."""
        for name, config in self.param_configs.items():
            init_func = constant if name.endswith('bias') else self.config.init_func
            self.params[name] = init_func(config['shape'], self.config.init_config)

Writing model.py


In [None]:
# NOTE(review): Agent.__init__ takes (input_size, act_space, config), so this
# zero-argument call raises TypeError as written. Agent is also only written
# to model.py by the cell above; no import of it is visible here — confirm.
agent = Agent()

In [None]:
import gym
from itertools import count
from minirl import uniAgent
# Number of episodes between average-reward reports.
log_interval = 100
# Render every N-th episode; -1 disables rendering.
render_interval = -1

# Alternative environment: gym.make("LunarLander-v2")
env = gym.make("CartPole-v1")
ob_n = env.observation_space.shape[0]  # length of one observation vector
ac_n = env.action_space.n              # number of discrete actions
agent = uniAgent(
    ob_n,
    ac_n,
    p_alpha=0.001,
    v_alpha=0.001,
    algo="ppo",
    clip=0.2,
    capacity=10000,
    batch_size=1000,
)
def main(max_episodes=None):
    """Train the module-level ``agent`` on the module-level ``env``.

    Runs episodes of at most 10000 steps. After every step the transition
    ``(obs, reward, next_obs)`` is passed to ``agent.learn``. Every
    ``log_interval`` episodes the mean episode reward since the last report
    is printed.

    Args:
        max_episodes: Number of episodes to run, or ``None`` (the default)
            to keep training until interrupted.
    """
    episodes = count(1) if max_episodes is None else range(1, max_episodes + 1)
    avg_reward = []
    for i_episode in episodes:
        ep_reward = 0
        obs, _ = env.reset()
        for _ in range(10000):  # Don't infinite loop while learning
            action, _ = agent.act(obs)
            next_obs, reward, terminated, truncated, _ = env.step(action)
            ep_reward += reward

            if render_interval != -1 and i_episode % render_interval == 0:
                env.render()

            agent.learn(obs, reward, next_obs)
            # An episode ends on a terminal state or on truncation (e.g. a
            # time limit); stepping past either is not meaningful.
            if terminated or truncated:
                break

            obs = next_obs

        # Record every episode, including the one that triggers the report,
        # so each average covers exactly `log_interval` episodes.
        avg_reward.append(ep_reward)
        if i_episode % log_interval == 0:
            print("Ave reward: {}".format(sum(avg_reward)/len(avg_reward)))
            avg_reward = []
            
# Start training. Called without arguments, the episode loop in main() never
# terminates on its own, so this runs until the kernel is interrupted.
main()


In [3]:

from minirl.neural_nets.core import grad

def foo(x):
    """Piecewise-linear map: ``x`` for ``x >= 0``, ``2 * x`` otherwise."""
    return x if x >= 0 else 2 * x

# Differentiate foo and evaluate the derivative on each linear piece.
# NOTE(review): the recorded output shows this cell raising AttributeError
# ('numpy.bool_' object has no attribute 'mark_for_bp'); the cause lies in
# library code not visible here — confirm before relying on the values below.
foo_grad = grad(foo)
# Expected slope on the non-negative piece: 1.0
print(foo_grad(3))
# Expected slope on the negative piece: 2.0
print(foo_grad(-1))

True


AttributeError: 'numpy.bool_' object has no attribute 'mark_for_bp'

In [None]:
# NOTE(review): the other cells import from `minirl`, while this cell targets
# the separate `minpy` package — confirm it is still needed.
# Presumably restricts minpy to its NumPy backend (policy name 'only_numpy') —
# TODO confirm against the minpy documentation.
import minpy
minpy.set_global_policy('only_numpy')