Today, we will continue working on the inverted pendulum environment. However, the goal of this lab will be to implement one of the RL agents discussed in this week's lecture. Don't forget to download the new version of the environment under course contents. In the rest of this notebook, a Chainer skeleton code (with missing functionality) for the reinforce agent is provided. You can either work on finalizing the implementation of the reinforce agent or implement a different model of your choice.

In [1]:
import matplotlib.pyplot as plt
import tqdm
import numpy as np
from chainer import Chain
import chainer.links as L
import chainer.functions as F
from chainer.optimizers import Adam
from chainer import Variable
import random
import socket
import struct

%matplotlib inline

In [3]:
# same environment as last week

class Environment:
    def __init__(self, ip = "127.0.0.1", port = 13000):
        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.ip     = ip
        self.port   = port

        self.client.connect((ip, port))

    def reset(self):
        self._send(0, 0)
        return self._receive()

    def step(self, action):
        self._send(action, 1)
        return self._receive()

    def _receive(self):
        data = self.client.recv(19)
        reward = data[0]
        state = [struct.unpack("@f", data[1 + i * 4: 5 + i * 4])[0] for i in range(4)]
        status = [data[17], data[18]]
        return reward, state, status

    def _send(self, action, command):
        self.client.send(bytes([action, command]))

In [4]:
# Let's define a baseline agent which just emits random actions.


class RandomAgent:
    def __init__(self):
        pass

    def step(self, reward, state):
        return random.randint(0, 1)

Let's run the agent within the environment. (don't forget to start the environment)
if you want to see the agent, also don't forget to enable the camera in the GUI

In [5]:
agent = RandomAgent()
environment = Environment()

In [6]:
"""episode_count = 10
R0 = np.zeros(episode_count)

for i in range(1000):
    reward, state, status = environment.reset()

    while (status[0] == 0):
        action = agent.step(reward, state)
        reward, state, status = environment.step(action)
        R0[i] += reward
        """

'episode_count = 10\nR0 = np.zeros(episode_count)\n\nfor i in range(1000):\n    reward, state, status = environment.reset()\n\n    while (status[0] == 0):\n        action = agent.step(reward, state)\n        reward, state, status = environment.step(action)\n        R0[i] += reward\n        '

In [7]:
# Let's create the REINFORCE agent. We assume that the policy is computed using an MLP with a softmax output.

class MLP(Chain):
    """Multilayer perceptron"""

    def __init__(self, n_output=1, n_hidden=5):
        super(MLP, self).__init__(l1=L.Linear(None, n_hidden), l2=L.Linear(n_hidden, n_output))

    def __call__(self, x):
        return self.l2(F.relu(self.l1(x)))

In [8]:
# A skeleton for the REINFORCEAgent is given. Implement the compute_loss and compute_score functions.

class REINFORCEAgent(object):
    """Agent trained using REINFORCE"""

    def __init__(self, model, optimizer=Adam()):
        self.model = model

        self.optimizer = optimizer
        self.optimizer.setup(self.model)

        # monitor score and reward
        self.rewards = []
        self.scores = []


    def step(self, reward, state):

        # linear outputs reflecting the log action probabilities and the value
        policy = self.model(Variable(np.atleast_2d(np.asarray(state, 'float32'))))

        # generate action according to policy
        p = F.softmax(policy).data

        # normalize p in case tiny floating precision problems occur
        row_sums = p.sum(axis=1)
        p /= row_sums[:, np.newaxis]

        action = np.asarray([np.random.choice(p.shape[1], None, True, p[0])])

        return action, policy


    def compute_loss(self):
        """
        Return loss for this episode based on computed scores and accumulated rewards
        """       
        
        return Variable(sum([sum(self.scores[:p1])*r1 for p1,r1 in zip(np.arange(1,len(self.rewards)+1),self.rewards)]))

    def compute_score(self, action, policy):
        """
        Computes score

        Args:
            action (int):
            policy:

        Returns:
            score
        """
        return F.log(policy[0][action[0]]) + sum(self.scores)


In [9]:
# Now we run the REINFORCE agent within the environment. Note that we update the agent after each episode for simplicity.
# First, we should restart the server from the GUI

environment = Environment()
network = MLP(n_output=2, n_hidden=3)
agent = REINFORCEAgent(network, optimizer=Adam())

episode_count = 1000

    
R = np.zeros(episode_count)


In [10]:
for i in tqdm.trange(episode_count):

    reward, state, status = environment.reset()

    loss = 0
    while True:

        action, policy = agent.step(reward, state)

        reward, state, status = environment.step(action[0])

        # get reward associated with taking the previous action in the previous state
        agent.rewards.append(reward)
        R[i] += reward

        # recompute score function: grad_theta log pi_theta (s_t, a_t)
        agent.scores.append(agent.compute_score(action, policy))

        # we learn at the end of each episode
        if status[0] == 1:
            
            loss += agent.compute_loss()
            
            agent.model.cleargrads()
            loss.backward()
            loss.unchain_backward()
            agent.optimizer.update()

            break

  return utils.force_array(numpy.log(x[0])),



TypeError: numpy.ndarray or cuda.ndarray are expected.
Actual: <class 'chainer.variable.Variable'>

In [None]:
# and we finally plot the accumulated reward per episode

plt.figure()
plt.plot(np.cumsum(R0))
plt.plot(np.cumsum(R))
plt.legend(['Random', 'REINFORCE'])
plt.show()

In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))