# REINFORCE

<img src="img/algos/REINFORCE.png" alt="REINFORCE" width="800"/>

In [None]:
import sys

# resolve path for notebook
sys.path.append('../')

In [None]:
import gym
import math
import torch
import pyglet
import random
import minihack

import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from nle import nethack
from collections import deque

from torch.autograd import Variable
from torch.distributions import Categorical


from environments.QuestEnvironment import QuestEnvironment

In [None]:
# prefer a CUDA GPU when PyTorch can see one, otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class PolicyValueNetwork_v1(nn.Module):
    """
        Fully connected actor-critic network with a shared trunk.

        The trunk is `fc1 -> ReLU` followed by `num_hidden_layers - 1`
        blocks of `Linear -> ReLU`. Two linear heads read the trunk output:
        `policy` produces action logits and `value` produces the baseline.

        Parameters:
        -----------

        - input_size         :   int   :   number of features after flattening
        - hidden_size        :   int   :   width of every trunk layer
        - num_hidden_layers  :   int   :   trunk depth (fc1 counts as the first)
        - policy_output_size :   int   :   number of discrete actions
        - value_output_size  :   int   :   width of the value head
    """

    def __init__(
        self,
        input_size=81,
        hidden_size=128,
        num_hidden_layers=2,
        policy_output_size=4,
        value_output_size=1
    ):
        super().__init__()

        # first trunk layer; relu1 is kept as an attribute for compatibility
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()

        # remaining trunk layers, each followed by its own activation
        self.hidden_layers = nn.ModuleList()
        for _ in range(num_hidden_layers - 1):
            self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))
            self.hidden_layers.append(nn.ReLU())

        # heads stay wrapped in nn.Sequential so parameter names
        # ("policy.0.weight", "value.0.weight", ...) do not change
        self.policy = nn.Sequential(nn.Linear(hidden_size, policy_output_size))
        self.value = nn.Sequential(nn.Linear(hidden_size, value_output_size))

    def forward(self, x):
        """
            Map one observation to an action distribution and a value.

            The input is flattened to a single row of shape (1, n), so this
            processes exactly one state per call, and is L2-normalised
            before entering the trunk.

            Returns a `Categorical` over the actions (batch shape (1,)) and
            the value tensor of shape (1, value_output_size).
        """

        # collapse whatever shape came in to a single (1, n) row
        x = torch.flatten(x).unsqueeze(0)
        x = torch.nn.functional.normalize(x, p=2.0, dim=1, eps=1e-12)

        # move forward through the trunk
        out = self.relu1(self.fc1(x))
        for layer in self.hidden_layers:
            out = layer(out)

        # the policy head emits raw logits; Categorical normalises them
        dist = Categorical(logits=self.policy(out))
        value = self.value(out)

        return dist, value

    def choose_action(self, state):
        """
            Sample an action for `state`.

            `state` may be a numpy array, a tensor or a nested sequence. It
            is cast to float32 and moved to the device that holds the
            network's parameters, so no module-level device is required.

            Returns `(action, log_prob, value)`: the sampled action as a
            Python int, its log-probability (shape (1,)) and the value
            estimate (shape (1, value_output_size)).
        """

        # follow the weights rather than a global device variable
        model_device = next(self.parameters()).device
        state = torch.as_tensor(state).float().to(model_device).unsqueeze(0)

        # tensors are autograd-aware on their own; no Variable wrapper needed
        dist, value = self(state)

        # sample stochastically from the policy
        action = dist.sample()

        return action.item(), dist.log_prob(action), value

In [None]:
class PolicyValueNetwork_v2(nn.Module):
    """
        Actor-critic network: shared input layer, hidden stack, two heads.

        `shared -> ReLU` feeds `num_hidden_layers - 1` blocks of
        `Linear -> ReLU`; the `policy` head emits action logits and the
        `value` head emits the baseline estimate.

        Parameters:
        -----------

        - input_size         :   int   :   number of features after flattening
        - hidden_size        :   int   :   width of every hidden layer
        - num_hidden_layers  :   int   :   depth (shared counts as the first)
        - policy_output_size :   int   :   number of discrete actions
        - value_output_size  :   int   :   width of the value head
    """

    def __init__(
        self,
        input_size=81,
        hidden_size=128,
        num_hidden_layers=3,
        policy_output_size=4,
        value_output_size=1
    ):
        super().__init__()

        self.shared = nn.Linear(input_size, hidden_size)

        # forward() walks this stack, so it has to exist even when empty
        self.hidden_layers = nn.ModuleList()
        for _ in range(num_hidden_layers - 1):
            self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))
            self.hidden_layers.append(nn.ReLU())

        self.policy = nn.Linear(hidden_size, policy_output_size)
        self.value = nn.Linear(hidden_size, value_output_size)

    def forward(self, x):
        """
            Map one observation to an action distribution and a value.

            The input is flattened to a single (1, n) row and L2-normalised,
            so exactly one state is processed per call.

            Returns a `Categorical` over the actions (batch shape (1,)) and
            the value tensor of shape (1, value_output_size).
        """

        # collapse whatever shape came in to a single (1, n) row
        x = torch.flatten(x).unsqueeze(0)
        x = torch.nn.functional.normalize(x, p=2.0, dim=1, eps=1e-12)

        out = F.relu(self.shared(x))
        for layer in self.hidden_layers:
            out = layer(out)

        logits = self.policy(out)
        value = self.value(out)

        # raw logits go straight in; Categorical applies the softmax itself
        dist = Categorical(logits=logits)

        return dist, value

    def choose_action(self, state):
        """
            Sample an action for `state`.

            `state` may be a numpy array, a tensor or a nested sequence. It
            is cast to float32 and moved to the device that holds the
            network's parameters.

            Returns `(action, log_prob, value)`: the sampled action as a
            Python int, its log-probability (shape (1,)) and the value
            estimate (shape (1, value_output_size)).
        """

        # follow the weights rather than a global device variable
        model_device = next(self.parameters()).device
        state = torch.as_tensor(state).float().to(model_device).unsqueeze(0)

        dist, value = self(state)

        # sample stochastically from the policy
        action = dist.sample()

        return action.item(), dist.log_prob(action), value

In [None]:
class REINFORCE:
    """
        REINFORCE with learned baseline implementation.

        Parameters:
        -----------

        - env          :   class   :   online model of the environment
        - policy_model :   module  :   network exposing choose_action(state)
                                       -> (action, log_prob, value)
        - alpha        :   float   :   learning rate
        - gamma        :   float   :   discount rate
        - seed         :   int     :   random seed, used only when the
                                       environment has no `seed_value`
        - verbose      :   bool    :   print a progress line per episode

        References:
        -----------

        -   Sutton and Barto (2020), 
            Reinforcement Learning, An Introduction, 2nd Edition, Chapter 13

    """

    def __init__(
            self, 
            env, 
            policy_model,
            alpha,
            gamma, 
            seed,
            verbose = True
        ):

        # set the properties for the agent
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.policy_model = policy_model.to(device)
        self.verbose = verbose

        # per-coordinate statistics used by _get_uct_reward
        self.visit_counts = dict()
        self.reward_totals = dict()

        # the environment's own seed wins when it has one; otherwise fall
        # back to the `seed` argument instead of failing on the attribute
        self._seed(getattr(self.env, "seed_value", seed))

        # one optimizer drives both the policy and the value head
        self.optimizer = torch.optim.Adam(
            self.policy_model.parameters(),
            lr = alpha
        )

    def _get_coordinates(self, state):
        """Return `(blstats[0], blstats[1])` of the observation as a tuple."""

        blstats = state['blstats']
        return (blstats[0], blstats[1])

    def _get_uct_reward(self, state, reward):
        """
            Shape `reward` with per-coordinate visit statistics.

            Records the visit, then returns the raw reward plus the mean
            reward seen at these coordinates plus sqrt(ln(n) / n), where n
            is the visit count. The last term is 0 on the first visit.
        """

        coords = self._get_coordinates(state)

        # record this visit
        self.visit_counts[coords] = self.visit_counts.get(coords, 0) + 1
        self.reward_totals[coords] = self.reward_totals.get(coords, 0) + reward

        visits = self.visit_counts[coords]
        mean_reward = self.reward_totals[coords] / visits
        bonus = math.sqrt(math.log(visits) / visits)

        return reward + mean_reward + bonus

    def train(self, num_episodes, max_steps, render=False):
        """
            Run `num_episodes` episodes, updating the network after each.

            Every episode follows the policy for at most `max_steps` steps,
            then takes one gradient step on the combined policy and value
            loss of the whole trajectory.

            Returns `(policy_model, scores)`, where `scores` holds the
            undiscounted reward total of every episode.
        """

        # all scores, plus a sliding window for the printed average
        scores = []
        queue_scores = deque(maxlen = 100)

        for episode in range(1, num_episodes + 1):

            # per-episode trajectory buffers
            rewards = []
            log_probs = []
            values = []

            # reset the environment
            state = self.env.reset()
            state = state['glyphs_crop']

            steps = 0

            # follow a trajectory
            for step in range(1, max_steps + 1):

                # select the action from the policy for the given state
                action, log_prob, value = self.policy_model.choose_action(state)
                log_probs.append(log_prob)
                values.append(value)

                # take the action in the environment
                state, reward, done, _ = self.env.step(action)
                #reward = self._get_uct_reward(state, reward)
                state = state['glyphs_crop']

                # render if required
                if render:
                    self.env.render()

                rewards.append(reward)

                # count every step, so episodes cut off by max_steps
                # report their real length rather than 0
                steps = step

                if done:
                    break

            # keep the undiscounted episode score
            total_rewards = sum(rewards)
            scores.append(total_rewards)
            queue_scores.append(total_rewards)

            # normalised discounted returns are the regression target
            G = self._compute_returns(rewards, self.gamma)
            loss = self._episode_loss(log_probs, values, G)

            # propagate backwards and take one optimizer step
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if self.verbose:# and episode % 50 == 0:
                print( f"Episode: {episode}, Steps: {steps}, Average Score: {np.mean(queue_scores)}")

        # return the data
        return self.policy_model, scores

    def _episode_loss(self, log_probs, values, returns):
        """
            Combined policy-gradient and value loss for one trajectory.

            `log_probs` and `values` are the per-step tensors collected from
            `choose_action`; `returns` is the array from `_compute_returns`.
            Everything is flattened to shape (T,) first: subtracting (T, 1)
            values from (T,) returns would broadcast to a (T, T) matrix.
        """

        value_estimates = torch.cat(values).reshape(-1)
        step_log_probs = torch.cat(log_probs).reshape(-1)

        # put the targets on the same device as the network outputs
        targets = torch.from_numpy(np.asarray(returns)).float()
        targets = targets.to(value_estimates.device)

        delta = targets - value_estimates

        # the advantage is detached so the policy term does not push
        # gradients into the value head
        policy_loss = -torch.sum(step_log_probs * delta.detach())
        value_loss = 0.5 * torch.sum(delta ** 2)

        return policy_loss + value_loss

    def _compute_returns(self, rewards, gamma):
        """
            Discounted returns of a trajectory, standardised.

            Returns a float array with one entry per reward, shifted to zero
            mean and scaled to unit standard deviation. When every return is
            equal (including single-step episodes) an all-zero array of the
            same length is returned, so the result is always an ndarray.
        """

        # accumulate back to front, then restore chronological order;
        # appending avoids the quadratic cost of insert(0, ...)
        running = 0.0
        returns = []
        for reward in reversed(rewards):
            running = reward + gamma * running
            returns.append(running)
        returns.reverse()

        returns = np.array(returns, dtype = np.float64)
        if returns.size == 0:
            return returns

        std = returns.std(axis = 0)

        # no spread to normalise by: everything is exactly at the mean
        if std == 0.0:
            return np.zeros_like(returns)

        return (returns - returns.mean(axis = 0)) / std

    def _seed(self, seed):
        """Seed torch (CPU and all CUDA devices), numpy and `random`."""

        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)

In [None]:
from numpy import argmax


class PolicyValueNetwork(nn.Module):
    """
        Actor-critic network: shared input layer, hidden stack, two heads.

        `shared -> ReLU` feeds `num_hidden_layers - 1` blocks of
        `Linear -> ReLU`; the `policy` head emits action logits and the
        `value` head emits the baseline estimate.

        Parameters:
        -----------

        - input_size         :   int   :   number of features after flattening
        - hidden_size        :   int   :   width of every hidden layer
        - num_hidden_layers  :   int   :   depth (shared counts as the first)
        - policy_output_size :   int   :   number of discrete actions
        - value_output_size  :   int   :   width of the value head
    """

    def __init__(
        self,
        input_size=81,
        hidden_size=162,
        num_hidden_layers=3,
        policy_output_size=4,
        value_output_size=1
    ):
        super().__init__()

        self.shared = nn.Linear(input_size, hidden_size)

        # hidden stack applied between the shared layer and the heads
        self.hidden_layers = nn.ModuleList()
        for _ in range(num_hidden_layers - 1):
            self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))
            self.hidden_layers.append(nn.ReLU())

        self.policy = nn.Linear(hidden_size, policy_output_size)
        self.value = nn.Linear(hidden_size, value_output_size)

    def forward(self, x):
        """
            Map one observation to an action distribution and a value.

            The input is flattened to a single (1, n) row and L2-normalised,
            so exactly one state is processed per call.

            Returns a `Categorical` over the actions (batch shape (1,)) and
            the value tensor of shape (1, value_output_size).
        """

        # collapse whatever shape came in to a single (1, n) row
        x = torch.flatten(x).unsqueeze(0)
        x = torch.nn.functional.normalize(x, p=2.0, dim=1, eps=1e-12)

        out = F.relu(self.shared(x))

        # run the hidden stack; without this its weights never get gradients
        for layer in self.hidden_layers:
            out = layer(out)

        logits = self.policy(out)
        value = self.value(out)

        # pass raw logits: Categorical(logits=...) applies the softmax itself.
        # Feeding it softmax output squashes the policy towards uniform.
        dist = Categorical(logits=logits)

        return dist, value

    def choose_action(self, state):
        """
            Sample an action for `state`.

            `state` may be a numpy array, a tensor or a nested sequence. It
            is cast to float32 and moved to the device that holds the
            network's parameters.

            Returns `(action, log_prob, value)`: the sampled action as a
            Python int, its log-probability (shape (1,)) and the value
            estimate (shape (1, value_output_size)).
        """

        # follow the weights rather than a global device variable
        model_device = next(self.parameters()).device
        state = torch.as_tensor(state).float().to(model_device).unsqueeze(0)

        dist, value = self(state)

        # sample stochastically from the policy
        action = dist.sample()

        return action.item(), dist.log_prob(action), value


# Step cap per episode, shared by the environment and the training loop.
MAX_STEPS = 5000

# Build the quest environment with its reward shaping.
env = QuestEnvironment().create(
    reward_lose=-10,
    reward_win=10,
    penalty_step=-0.002,
    penalty_time=0.005,
    max_episode_steps=MAX_STEPS,
)

# One policy logit per environment action.
# NOTE(review): input_size=441 presumably matches a flattened 21x21
# 'glyphs_crop' observation; confirm against the environment.
policy_model = PolicyValueNetwork(input_size=441, policy_output_size=env.action_space.n)

# NOTE(review): alpha=0.1 is a large step size for Adam; confirm it is intended.
agent = REINFORCE(env, policy_model=policy_model, alpha=0.1, gamma=0.90, seed=0, verbose=True)

# Train with rendering on; the (model, scores) result is left as the cell output.
agent.train(num_episodes=10000, max_steps=MAX_STEPS, render=True)