In [None]:
import sys

# Resolve imports for the notebook: add the parent directory to the module
# search path so project-local packages (e.g. `environments`, imported below)
# can be found.
# NOTE(review): assumes the kernel's working directory is one level below the
# project root -- confirm.
sys.path.append('../')

In [None]:
import gym
import math
import torch
import pyglet
import random
import minihack

import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

from nle import nethack
from collections import deque

from torch.autograd import Variable
from torch.distributions import Categorical


from environments.QuestEnvironment import QuestEnvironment

In [None]:
# if there is a Cuda GPU, then we want to use it
# NOTE(review): none of the cells shown in this notebook move the networks or
# the observation tensors to `device`, so the value is currently unused.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# --- Hyper-parameters / configuration ----------------------------------------
num_episodes = 1000   # number of training episodes
max_steps = 5000      # per-episode step cap (also passed to the env as max_episode_steps)
alpha = 0.001         # learning rate for both the policy and the value optimiser
gamma = 0.9999        # discount factor applied when accumulating returns

# `eps` passed to torch.nn.functional.normalize in convert_observation.
# A preceding `epsilon = 1e-12` assignment was a dead store (it was overwritten
# on the very next line), so only the value that was actually in effect is kept.
epsilon = 1e-1

# Key of the observation dict that is fed to the networks.
OBS_SPACE = 'glyphs_crop'
#OBS_SPACE = 'blstats'

seed = 42             # shared seed for torch, numpy, random and the environment

In [None]:
# Build the quest environment. Episode length is capped at `max_steps` and the
# environment receives the notebook-wide `seed`.
env = QuestEnvironment().create(
    reward_lose=-10,
    reward_win=10,
    penalty_step=-0.002,
    penalty_time=-0.002,
    max_episode_steps=max_steps,
    seed=seed,
)

In [None]:
# Seed every random number generator the notebook relies on: Python's `random`,
# numpy, torch (CPU and all CUDA devices) and finally the environment itself.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
env.seed(seed)

In [None]:
class PolicyValueNetwork:
    """Container for a policy MLP and a state-value MLP, each with its own Adam optimiser.

    Attributes:
        obs_space: number of input features, i.e. the flattened size of the
            selected observation.
        policy_net: maps a ``(batch, obs_space)`` tensor to a softmax
            distribution over ``env.action_space.n`` actions.
        policy_optimizer: Adam optimiser over ``policy_net`` parameters.
        value_net: maps a ``(batch, obs_space)`` tensor to one scalar per row.
        value_optimizer: Adam optimiser over ``value_net`` parameters.
    """

    def __init__(self, env, alpha, obs_key=None):
        """Build both networks sized from the environment's spaces.

        Args:
            env: environment exposing ``observation_space.spaces[<key>].shape``
                and ``action_space.n``.
            alpha: learning rate used for both Adam optimisers.
            obs_key: key of the observation to size the networks for. Defaults
                to the module-level ``OBS_SPACE`` when omitted.
        """
        if obs_key is None:
            obs_key = OBS_SPACE

        obs_shape = env.observation_space.spaces[obs_key].shape

        # The networks consume a flattened observation, so the input width is
        # the product of *all* dimensions (previously only the first two were
        # multiplied, which mis-sized observations of rank 3 or more).
        n_features = 1
        for dim in obs_shape:
            n_features *= int(dim)
        self.obs_space = n_features

        n_actions = env.action_space.n

        self.policy_net = torch.nn.Sequential(
            torch.nn.Linear(self.obs_space, self.obs_space*2),
            torch.nn.ReLU(),
            torch.nn.Linear(self.obs_space*2, n_actions),
            torch.nn.Softmax(dim = -1)
        )

        self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr = alpha)

        self.value_net = torch.nn.Sequential(
            torch.nn.Linear(self.obs_space, self.obs_space*2),
            torch.nn.ReLU(),
            torch.nn.Linear(self.obs_space*2, self.obs_space),
            torch.nn.ReLU(),
            torch.nn.Linear(self.obs_space, 1)
        )

        self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr = alpha)


# NOTE: this rebinds the name `nn`, which the import cell bound to `torch.nn`
# (`import torch.nn as nn`). From here on `nn` is the PolicyValueNetwork
# instance; any code that needs torch's module must spell out `torch.nn`.
nn = PolicyValueNetwork(env, alpha)

In [None]:
# if len(env.observation_space.spaces[OBS_SPACE].shape) == 1:
#     obs_space = env.observation_space.spaces[OBS_SPACE].shape[0]
# else:
#     obs_space = env.observation_space.spaces[OBS_SPACE].shape[0] * \
#                 env.observation_space.spaces[OBS_SPACE].shape[1]

# nn = torch.nn.Sequential(
#     torch.nn.Linear(obs_space, obs_space*2),
#     torch.nn.ReLU(),

#     # hidden layers
#     #torch.nn.Linear(obs_space*2, obs_space*2),
#     #torch.nn.ReLU(),
#     # torch.nn.Linear(obs_space*3, obs_space*2),
#     # torch.nn.ReLU(),

#     torch.nn.Linear(obs_space*2, env.action_space.n),
#     torch.nn.Softmax(dim = -1)
# )
# optim = torch.optim.Adam(nn.parameters(), lr=alpha)

In [None]:
# Per-coordinate statistics shared by every call to get_exploration_reward:
# how often each coordinate pair was seen and the sum of rewards observed there.
visit_counts = dict()
coord_rewards = dict()

def get_exploration_reward(state, reward, bonus_weight=0.01):
    """Return a count-based shaping term for the coordinates found in ``state``.

    The first two entries of ``state['blstats']`` are used as the coordinate
    key (presumably the agent's x/y position -- confirm against the
    environment's blstats layout). The module-level ``visit_counts`` and
    ``coord_rewards`` dicts are updated in place on every call.

    Args:
        state: observation dict containing a ``'blstats'`` sequence.
        reward: reward received for reaching this state; accumulated per
            coordinate.
        bonus_weight: scale of the visit-count term (default ``0.01``).

    Returns:
        The mean reward recorded at these coordinates plus
        ``bonus_weight * sqrt(log(n) / n)``, where ``n`` is the number of
        visits so far (including this one).
    """
    stats = state['blstats']
    coords = (int(stats[0]), int(stats[1]))

    if coords in visit_counts:
        visit_counts[coords] += 1
        coord_rewards[coords] += reward
    else:
        visit_counts[coords] = 1
        coord_rewards[coords] = reward

    visits = visit_counts[coords]
    mean_reward = coord_rewards[coords] / visits

    # log(1) == 0, so the count-based term is zero on the very first visit.
    bonus = bonus_weight * math.sqrt(math.log(visits) / visits)

    return mean_reward + bonus

In [None]:
def convert_observation(obs):
    """Convert an observation dict into a normalised ``(1, N)`` float tensor.

    The entry selected by the module-level ``OBS_SPACE`` key is turned into a
    float tensor, flattened into a single row, and L2-normalised along that
    row using the module-level ``epsilon`` as the normaliser's ``eps``.
    """
    features = torch.tensor(obs[OBS_SPACE], dtype=torch.float)
    # Flatten to 1-D, then add a leading batch dimension of size 1.
    row = features.flatten().unsqueeze(0)
    return torch.nn.functional.normalize(row, p=2.0, dim=1, eps=epsilon)

In [None]:
# training
#
# Monte-Carlo policy gradient with a learned state-value baseline. For every
# episode: roll out the current policy, compute normalised discounted returns,
# then take one policy step and one value step per visited state, and finally
# plot the per-step rewards and losses of that episode.

for k in range(num_episodes):

    obs = env.reset()
    obs = convert_observation(obs)

    done = False
    Actions, States, Rewards = [], [], []

    # ---- roll out one episode -------------------------------------------
    for h in range(max_steps):

        # Sampling needs no autograd graph; the probabilities are recomputed
        # with gradients enabled in the update phase below.
        with torch.no_grad():
            probs = nn.policy_net(obs)
        dist = torch.distributions.Categorical(probs = probs)
        action = dist.sample().item()

        print( f"\rEpisode: {k+1}, Step: {h+1}, Action: {action}", end="")

        obs_, rew, done, _ = env.step(action)
        # Shape the environment reward with the count-based exploration term.
        rew = rew + get_exploration_reward(obs_, rew)
        obs_ = convert_observation(obs_)
        env.render()

        Actions.append(torch.tensor(action, dtype=torch.int))
        States.append(obs)
        Rewards.append(rew)

        # Advance to the new observation. Without this the policy would keep
        # acting on (and being trained on) the initial observation only.
        obs = obs_

        if done:
            break

    # ---- discounted returns ---------------------------------------------
    # Accumulate backwards over the trajectory; append + reverse keeps this
    # linear in the episode length (insert(0, ...) is quadratic).
    DiscountedReturns = []
    running_return = 0
    for reward in reversed(Rewards):
        running_return = reward + gamma * running_return
        DiscountedReturns.append(running_return)
    DiscountedReturns.reverse()
    DiscountedReturns = np.array(DiscountedReturns)

    # Standardise the returns. When they are all identical (e.g. a one-step
    # episode) the std is 0; use an array of zeros so the update loop below can
    # still iterate over it (a bare 0.0 would make zip() raise TypeError).
    mean = DiscountedReturns.mean(axis = 0)
    std = DiscountedReturns.std(axis = 0)

    if std == 0.0:
        DiscountedReturns = np.zeros_like(DiscountedReturns)
    else:
        DiscountedReturns = (DiscountedReturns - mean) / std

    # ---- per-step policy / value updates --------------------------------
    Loss = []

    for State, Action, G in zip(States, Actions, DiscountedReturns):

        value = nn.value_net(State)

        # Advantage estimate: normalised return minus the value baseline.
        delta = float(G) - value

        probs = nn.policy_net(State)
        policy_dist = torch.distributions.Categorical(probs = probs)
        log_probs = policy_dist.log_prob(Action)

        # delta is detached so the policy loss does not back-propagate into
        # the value network.
        policy_loss = -( delta.detach() * log_probs ).sum()
        nn.policy_optimizer.zero_grad()
        policy_loss.backward()
        nn.policy_optimizer.step()

        value_loss = torch.sum(delta**2) / 2
        nn.value_optimizer.zero_grad()
        value_loss.backward()
        nn.value_optimizer.step()

        total_loss = policy_loss + value_loss
        Loss.append(total_loss.item())

    print( f", Tot. Rewards: {np.sum(Rewards):0.4f}, Avg. Reward: {np.mean(Rewards):0.4f}, Tot. Loss: {np.sum(Loss):0.4f}, Avg Loss: {np.mean(Loss):0.4f}")

    Loss = np.reshape(Loss, (np.shape(Loss)[0], 1))

    # ---- per-episode plots ----------------------------------------------
    # Both series hold one value per environment step of this episode, so the
    # x-axis is labelled in steps. The rolling window is kept at >= 1.
    window = max(1, int(max_steps * 0.01))
    fig, (reward_ax, loss_ax) = plt.subplots(1, 2, figsize=(10, 4))
    for ax, title, ylabel, series in (
        (reward_ax, 'Reward', 'reward', pd.DataFrame(Rewards)),
        (loss_ax, 'Loss', 'loss', pd.DataFrame(Loss)),
    ):
        ax.set_title(title)
        ax.set_xlabel('step')
        ax.set_ylabel(ylabel)
        ax.plot(series)
        ax.plot(series.rolling(window).mean())
        ax.grid(True)
    plt.show()
    # Release the figure so they do not pile up over many episodes.
    plt.close(fig)

In [None]:
# Evaluation: play 5 episodes by sampling from the trained policy and print
# the total (unshaped) environment reward of each one.
for _ in range(5):

    Rewards = []

    obs = env.reset()
    obs = convert_observation(obs)

    done = False
    env.render()

    steps = 0

    # Strict '<' so at most `max_steps` actions are taken, matching training.
    while not done and steps < max_steps:

        steps += 1

        # `nn` is the PolicyValueNetwork container, which is not callable;
        # the action distribution comes from its policy network.
        with torch.no_grad():
            probs = nn.policy_net(obs)

        c = torch.distributions.Categorical(probs=probs)
        action = c.sample().item()

        obs_, rew, done, _info = env.step(action)
        # Feed the new observation into the next policy call.
        obs = convert_observation(obs_)
        env.render()

        Rewards.append(rew)

    print(f'Reward: {sum(Rewards)}')

env.close()