### REINFORCE v.2

![](img/algos/REINFORCE_v2.png)

In [1]:
import sys

# resolve path for notebook
sys.path.append('../')

In [2]:
import gym
import math
import torch
import pyglet
import random
import minihack

import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from nle import nethack
from torch import optim
from collections import deque

from torch.autograd import Variable
from torch.distributions import Categorical


from environments.QuestEnvironment import QuestEnvironment

In [3]:
class PolicyNetwork:

    def __init__(self, env, input_size, hidden_size, output_size):
        
        self.env = env
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.Softmax(dim = -1)
        )

    def prepare_state(self, state):

        state = torch.from_numpy(state)
        state = state.float().unsqueeze(0)

        state = torch.flatten(state)
        state = torch.reshape(state, (1, state.shape[0]))
        state = state.detach().numpy()[0]
        #state = torch.nn.functional.normalize(state, p=2.0, dim=1, eps=1e-12, out=None)

        return state

    def forward(self, state):

        state = self.prepare_state(state)
        return self.net(torch.FloatTensor(state))

    def choose_action(self, state):

        probs = self.forward(state)
        probs = probs.detach().numpy()
        action_space = np.arange(self.env.action_space.n)
        action = np.random.choice(action_space, p = probs)
        return action

def discount_rewards(rewards, gamma=0.99):
    r = np.array([gamma**i * rewards[i] 
        for i in range(len(rewards))])
    # Reverse the array direction for cumsum and then
    # revert back to the original order
    r = r[::-1].cumsum()[::-1]
    return r - r.mean()

#OBS_SPACE = 'blstats'
OBS_SPACE = 'glyphs_crop'

def REINFORCE(env, policy_model, num_episodes, max_steps, batch_size, alpha, gamma, render=False):

    optimizer = optim.Adam(policy_model.net.parameters(), lr = alpha)

    total_rewards = []
    batch_rewards = []
    batch_actions = []
    batch_states = []
    batch_counter = 1

    for k in range(1, num_episodes + 1):

        states = []
        rewards = []
        actions = []

        state = env.reset()
        state = state[OBS_SPACE]

        for _ in range(max_steps):

            action = policy_model.choose_action(state)
            
            next_state, reward, done, info = env.step(action)
            next_state = next_state[OBS_SPACE]

            # render if required
            if render:
                env.render()

            states.append(policy_model.prepare_state(state))
            rewards.append(reward)
            actions.append(action)

            state = next_state

            if done:
                break



        
        
        
        
        
        
        # batch
        batch_rewards.extend(discount_rewards(rewards, gamma))
        batch_states.extend(states)
        batch_actions.extend(actions)
        batch_counter += 1
        total_rewards.append(sum(rewards))

        # If batch is complete, update network
        if batch_counter == batch_size:
            optimizer.zero_grad()
            state_tensor = torch.FloatTensor(batch_states)
            reward_tensor = torch.FloatTensor(batch_rewards)
            # Actions are used as indices, must be 
            # LongTensor
            action_tensor = torch.LongTensor(batch_actions)

            # Calculate loss
            pred = policy_model.net(state_tensor)
            logprob = torch.log(pred)
            selected_logprobs = reward_tensor * \
                torch.gather(logprob, 1, action_tensor.unsqueeze(1)).squeeze()
            loss = -selected_logprobs.mean()

            # Calculate gradients
            loss.backward()

            # Apply gradients
            optimizer.step()

            batch_rewards = []
            batch_actions = []
            batch_states = []
            batch_counter = 1

        if k % 1 == 0:
            avg_rewards = np.mean(total_rewards[-100:])
            print( f"Episode {k}.  Avg Rewards: {avg_rewards}" )

    return total_rewards

MAX_STEPS = 5000

env = QuestEnvironment().create(
    reward_lose = -10,
    reward_win = 10,
    penalty_step = -0.002,
    penalty_time = -0.001,
    max_episode_steps = MAX_STEPS
)

if len(env.observation_space.spaces[OBS_SPACE].shape) == 1:
    input_size = env.observation_space.spaces[OBS_SPACE].shape[0]
else:
    input_size = env.observation_space.spaces[OBS_SPACE].shape[0] * \
                env.observation_space.spaces[OBS_SPACE].shape[1]

policy_model = PolicyNetwork(
    env = env,
    input_size = input_size,
    hidden_size = env.observation_space.spaces[OBS_SPACE].shape[0] * 2,
    output_size = env.action_space.n
)

REINFORCE(env, policy_model, num_episodes=1000, max_steps=5000, batch_size=10, alpha=0.01, gamma=0.99, render=True)

Episode 1.  Avg Rewards: -9.998000000000008
Episode 2.  Avg Rewards: -9.998000000000008
Episode 3.  Avg Rewards: -9.998000000000008
Episode 4.  Avg Rewards: -9.998000000000008
Episode 5.  Avg Rewards: -9.998000000000008
Episode 6.  Avg Rewards: -9.998000000000006
Episode 7.  Avg Rewards: -9.998000000000006
Episode 8.  Avg Rewards: -9.998000000000008


  state_tensor = torch.FloatTensor(batch_states)


Episode 9.  Avg Rewards: -9.998000000000008


ValueError: probabilities contain NaN

: 