# REINFORCE

<img src="img/algos/REINFORCE.png" alt="REINFORCE" width="800"/>

In [None]:
import gym
import torch
import pyglet
import random
import minihack

import numpy as np
import torch.nn as nn

from nle import nethack
from collections import deque
from minihack import RewardManager
from torch.autograd import Variable
from torch.distributions import Categorical
from gym.envs.classic_control import rendering

In [None]:
# Notebook-wide constants
# -----------------------

# Random seed value used throughout the notebook.
SEED = 0

# Maximum number of steps per episode.
MAX_EPISODE_STEPS = 1000

# Actions for the environment: every member of `nethack.CompassDirection`,
# followed by the OPEN, KICK and SEARCH commands.
MOVE_ACTIONS = tuple(nethack.CompassDirection)
NAVIGATE_ACTIONS = (
    *MOVE_ACTIONS,
    nethack.Command.OPEN,
    nethack.Command.KICK,
    nethack.Command.SEARCH,
)

In [None]:
# Run on a CUDA GPU when torch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# wrapper for rendering the env as an image
class RenderingWrapper(gym.Wrapper):
    """
        Wrapper that shows the env's 'pixel' observation in a window and
        hands only the 'glyphs_crop' observation back to the caller.

        Parameters:
        -----------

        - env   :   class   :   environment whose observations are dicts
                                with the 'pixel' and 'glyphs_crop' keys

    """

    def __init__(self, env):
        super().__init__(env)
        self.env = env

        # latest 'pixel' frame; stays None until reset() or step() is called
        self.pixels = None

        # image viewer with an explicitly sized, resizable window
        self.viewer = rendering.SimpleImageViewer()
        self.viewer.width = 1280
        self.viewer.height = 520
        self.viewer.window = pyglet.window.Window(
            width=self.viewer.width,
            height=self.viewer.height,
            display=self.viewer.display,
            vsync=False,
            resizable=True
        )

    def step(self, action):
        """Step the env, remember the pixel frame, return the cropped glyphs."""
        obs, reward, done, info = self.env.step(action)
        self.pixels = obs['pixel']
        return obs['glyphs_crop'], reward, done, info

    def render(self, mode="human", **kwargs):
        """
            Draw the latest pixel frame ('human' mode) or defer to the wrapped
            env for any other mode.

            Raises RuntimeError in 'human' mode when there is no frame yet or
            the viewer has already been closed.
        """
        if mode != 'human':
            # forward the requested mode instead of silently falling back to
            # the wrapped env's default
            return self.env.render(mode=mode)

        if self.viewer is None:
            raise RuntimeError("render() called after close()")
        if self.pixels is None:
            raise RuntimeError("render() called before reset()")

        self.viewer.imshow(self.pixels)
        return self.viewer.isopen

    def reset(self, **kwargs):
        """Reset the env, remember the pixel frame, return the cropped glyphs."""
        obs = self.env.reset(**kwargs)
        self.pixels = obs['pixel']

        # TODO: make sure returning only the cropped glyphs is ok
        return obs['glyphs_crop']

    def close(self):
        """Close the viewer window and the wrapped env; safe to call twice."""
        if self.viewer is None:
            # already closed
            return

        self.viewer.window.close()
        self.viewer.close()
        self.viewer = None

        # overriding close() means the wrapped env is no longer closed for us
        self.env.close()

In [None]:
class PolicyValueNetwork(nn.Module):
    """
        Fully connected network with a shared trunk, a categorical policy
        head and a value head.

        Parameters:
        -----------

        - input_size          :   int   :   number of features per observation
        - hidden_size         :   int   :   width of every hidden layer
        - num_hidden_layers   :   int   :   number of hidden layers, counting
                                            the first one
        - policy_output_size  :   int   :   number of policy logits (actions)
        - value_output_size   :   int   :   size of the value head output

    """

    def __init__(
        self,
        input_size=81,
        hidden_size=128,
        num_hidden_layers=1,
        policy_output_size=4,
        value_output_size=1
    ):
        super().__init__()

        # first hidden layer
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()

        # remaining hidden layers (none when num_hidden_layers <= 1)
        self.hidden_layers = nn.ModuleList()
        for _ in range(num_hidden_layers - 1):
            self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))
            self.hidden_layers.append(nn.ReLU())

        # simple policy head
        self.policy = nn.Sequential(
            nn.Linear(hidden_size, policy_output_size)
        )

        # simple value head
        self.value = nn.Sequential(
            nn.Linear(hidden_size, value_output_size)
        )

    def forward(self, x):
        """
            Map one observation, or a batch of them, to an action
            distribution and a value estimate.

            `x` is reshaped to (-1, input_size), so a single observation of
            any shape with input_size elements gives a batch of one, and a
            stack of B observations gives a batch of B.

            Returns (Categorical over the policy logits, value tensor with one
            row per observation).
        """
        # one row per observation, keeping the batch dimension intact
        x = x.reshape(-1, self.fc1.in_features)

        # scale every observation to unit L2 norm
        x = torch.nn.functional.normalize(x, p=2.0, dim=1, eps=1e-12)

        # move forward through the network
        out = self.relu1(self.fc1(x))
        for layer in self.hidden_layers:
            out = layer(out)

        # return the distribution and value
        return Categorical(logits=self.policy(out)), self.value(out)

    def choose_action(self, state):
        """
            Sample an action for a single numpy observation.

            Returns (action as a Python number, log-probability tensor of that
            action, value tensor). Both tensors stay attached to the graph so
            they can be used in a loss.
        """
        # place the input on whatever device the model currently lives on
        model_device = next(self.parameters()).device
        state = torch.from_numpy(state).float().to(model_device).unsqueeze(0)

        # move forward through the network
        dist, value = self(state)

        # choose a random sample action
        action = dist.sample()

        return action.item(), dist.log_prob(action), value

In [None]:
class REINFORCE:
    """
        REINFORCE with learned baseline implementation.

        Parameters:
        -----------

        - env           :   class   :   online model of the environment
        - policy_model  :   class   :   network exposing `choose_action(state)`
                                        that returns (action, log-probability,
                                        value)
        - alpha         :   float   :   learning rate
        - gamma         :   float   :   discount rate
        - seed          :   int     :   random seed
        - verbose       :   bool    :   print the running average score after
                                        every episode

        References:
        -----------

        -   Sutton and Barto (2020), 
            Reinforcement Learning, An Introduction, 2nd Edition, Chapter 13

    """

    def __init__(
            self, 
            env, 
            policy_model,
            alpha,
            gamma, 
            seed,
            verbose = True
        ):

        # set the properties for the agent
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.policy_model = policy_model.to(device)
        self.verbose = verbose

        # set the random seeds
        self._seed(seed)

        # setup an optimizer over the model the agent actually trains
        self.optimizer = torch.optim.Adam(
            self.policy_model.parameters(), 
            lr = alpha
        )

    def train(self, num_episodes, max_steps):
        """
            Run `num_episodes` episodes of at most `max_steps` steps each,
            updating the model once at the end of every episode.

            Returns (policy model, list with the undiscounted total reward of
            every episode).
        """

        # all scores, plus a window of the last 100 for the running average
        scores = []
        queue_scores = deque(maxlen = 100)

        for episode in range(1, num_episodes + 1):

            # values collected along this episode's trajectory
            rewards = []
            log_probs = []
            values = []

            # reset the environment
            state = self.env.reset()

            # follow a trajectory
            for _ in range(max_steps):

                # select the action from the policy for the given state
                action, log_prob, value = self.policy_model.choose_action(state)
                log_probs.append(log_prob)
                values.append(value)

                # take the action in the environment
                state, reward, done, _info = self.env.step(action)
                rewards.append(reward)

                # exit if we have reached the end
                if done:
                    break

            # calculate the total rewards we achieved and save them for later
            total_rewards = sum(rewards)
            scores.append(total_rewards)
            queue_scores.append(total_rewards)

            # nothing to learn from when no step was taken (max_steps <= 0)
            if rewards:
                # normalized discounted returns: the targets for both heads
                G = self._compute_returns(rewards, self.gamma)
                G = torch.from_numpy(G).float()

                loss = self._compute_loss(log_probs, values, G)

                # propagate backwards and proceed to the next episode
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            if self.verbose:
                print('Episode {}\tAverage Score: {:.2f}'.format(
                    episode, np.mean(queue_scores)
                ))

        # return the data
        return self.policy_model, scores

    def _compute_loss(self, log_probs, values, returns):
        """
            Combine the policy-gradient loss and the value (baseline) loss for
            one episode.

            - log_probs :   list of per-step log-probability tensors
            - values    :   list of per-step value tensors
            - returns   :   tensor with one (normalized) return per step

            Returns a scalar tensor: policy loss + value loss.
        """
        # flatten everything to shape (T,). Without this, a (T,) return vector
        # minus a (T, 1) value column broadcasts to a (T, T) matrix and every
        # step gets paired with every other step's value.
        log_probs = torch.cat(log_probs).reshape(-1)
        values = torch.cat(values).reshape(-1)
        returns = returns.to(values.device).reshape(-1)

        # advantage of the observed return over the learned baseline
        delta = returns - values

        # the policy term must not push gradients into the value head
        policy_loss = -torch.sum(log_probs * delta.detach())
        value_loss = 0.5 * torch.sum(delta ** 2)

        return policy_loss + value_loss

    def _compute_returns(self, rewards, gamma):
        """
            Compute the discounted return of every step and standardize them
            (zero mean, unit standard deviation).

            Returns a numpy array with one entry per reward; empty input gives
            an empty array.
        """
        returns = np.zeros(len(rewards), dtype = np.float64)

        # discount over the trajectory, from the last step backwards
        running = 0.0
        for step in reversed(range(len(rewards))):
            running = rewards[step] + gamma * running
            returns[step] = running

        if returns.size == 0:
            return returns

        # the small epsilon keeps constant-return episodes (e.g. all-zero
        # rewards, or a single step) from dividing by zero and producing NaNs
        eps = np.finfo(np.float32).eps
        return (returns - returns.mean()) / (returns.std() + eps)


    def _seed(self, seed):
        """Seed torch (CPU and CUDA), numpy, the env and Python's `random`."""
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        self.env.seed(seed)
        random.seed(seed)

In [None]:
# Environment setup
# https://minihack.readthedocs.io/en/latest/envs/skills/quest.html

# Reward manager: register one kill event, with its reward, per monster.
# https://minihack.readthedocs.io/en/latest/getting-started/reward.html?highlight=RewardManager#reward-manager
reward_manager = RewardManager()
for _monster, _kill_reward in (
    ("minotaur", 10),
    ("goblin", 1),
    ("jackal", 1),
    ("giant rat", 1),
):
    reward_manager.add_kill_event(_monster, reward=_kill_reward)

# Build the environment with the navigation action set and the custom rewards,
# seed it, then wrap it so frames can be rendered as images.
env = gym.make(
    "MiniHack-Quest-Hard-v0",
    actions=NAVIGATE_ACTIONS,
    reward_manager=reward_manager,
    observation_keys=("glyphs", "pixel", "glyphs_crop"),
)
env.seed(SEED)
env = RenderingWrapper(env)

In [8]:
# Seed torch before building the network: the agent only seeds the RNGs inside
# its constructor, by which point the initial weights already exist.
torch.manual_seed(SEED)

# One policy logit per environment action. The default head size (4) could only
# ever sample the first four entries of NAVIGATE_ACTIONS.
policy_model = PolicyValueNetwork(policy_output_size=len(NAVIGATE_ACTIONS))

agent = REINFORCE(
    env, 
    policy_model=policy_model, 
    alpha=0.02, 
    gamma=0.99, 
    seed=SEED, 
    verbose=True
)

agent.train(
    num_episodes=1000,
    max_steps=100
)

Episode 51	Average Score: -0.50
