In [1]:
import marlo
import sys
import logging
# Fetch the root logger and flag it as disabled.
# NOTE(review): INFO records from marlo.base_env_builder still appear in the
# outputs of later cells, so this flag evidently does not silence the
# library's own loggers -- confirm whether that is the intended effect.
logger = logging.getLogger()
logger.disabled = True

# Put the project-local ../classes/ directory on the import path so the
# imports that follow in this cell can be resolved.
sys.path.append("../classes/")

from MinecraftEnvironmentManager import MinecraftEnvironmentManager
from Agent import Agent
from DQN import DQN
from EpsilonGreedyStrategy import EpsilonGreedyStrategy
from QValueCalculator import QValueCalculator
from ReplayMemory import ReplayMemory
from Utils import Utils
from PlotHelper import PlotHelper

from collections import namedtuple
from itertools import count

import torch
import torch.optim as optim
import torch.nn.functional as F

import matplotlib
import matplotlib.pyplot as plt

In [2]:
# Check whether matplotlib is rendering inline; only in that case load
# IPython's display module, which plot() uses to clear the cell output.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

In [3]:
# One stored transition: the state the agent was in, the action it took,
# the state it ended up in, and the reward it received.
Experience = namedtuple(
    'Experience',
    ['state', 'action', 'next_state', 'reward'],
)

# Hyperparameters

In [7]:
# --- Training schedule ---
num_episodes = 1_000     # number of episodes the training loop runs
target_update = 10       # copy policy weights into the target net every N episodes

# --- Optimisation ---
batch_size = 150         # experiences sampled from replay memory per update
lr = 1e-3                # learning rate handed to the Adam optimizer
gamma = 0.999            # factor applied to next-state Q-values in the target

# --- Exploration (forwarded to EpsilonGreedyStrategy) ---
eps_start = 1            # starting epsilon
eps_end = 0.01           # final epsilon
eps_decay = 1e-3         # decay setting; exact schedule lives in EpsilonGreedyStrategy

# --- Replay memory ---
memory_size = 100_000    # capacity argument handed to ReplayMemory

In [8]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Environment manager for the 'MarLo-FindTheGoal-v0' mission.
# NOTE(review): the name `mem` is easy to confuse with the replay buffer
# `memory` created a few lines below.
mem = MinecraftEnvironmentManager(marlo, device, 'MarLo-FindTheGoal-v0')
# Exploration strategy configured from the eps_* hyperparameters.
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)
# The agent receives the strategy, the number of actions the environment
# reports as available, and the compute device.
agent = Agent(strategy, mem.num_actions_available(), device)
# Replay buffer constructed with memory_size (presumably its capacity --
# confirm against ReplayMemory).
memory = ReplayMemory(memory_size)
# Helper used by the training loop to turn sampled experiences into tensors.
utils = Utils()
# NOTE(review): plotHelper is not referenced by any cell visible here -- confirm it is needed.
plotHelper = PlotHelper()
# NOTE(review): the training cell calls QValueCalculator.get_current/get_next
# on the class itself, so this instance looks unused -- confirm.
QValues = QValueCalculator()

INFO:marlo.base_env_builder:params.gameMode : Cannot force survival mode.
INFO:marlo.base_env_builder:params.gameMode : Cannot force survival mode.


In [6]:
# Ask the environment manager to launch the environment; the captured output
# shows a Minecraft client being started because nothing was listening on
# port 10000.
# NOTE(review): this cell's execution count (6) is lower than that of the
# cell defining `mem` (8), so the notebook was run out of order -- re-run
# top to bottom on a fresh kernel to confirm it still works.
mem.launch_environment()

Nothing is listening on port 10000 - will attempt to launch Minecraft from a new terminal.
Giving Minecraft some time to launch... 
. . . . . . . . . . . . . . ok


In [9]:
model_type = "linear"

# Policy network: the model whose parameters the optimizer updates.
policy_net = DQN(model_type, mem.get_screen_height(), mem.get_screen_width())
policy_net = policy_net.to(device)

# Target network: same architecture, initialised with the policy network's
# weights and switched to evaluation mode.
target_net = DQN(model_type, mem.get_screen_height(), mem.get_screen_width())
target_net = target_net.to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Adam optimises only the policy network's parameters.
optimizer = optim.Adam(policy_net.parameters(), lr=lr)

In [10]:
def plot(values, moving_avg_period, episode):
    """Redraw the training chart and report the latest moving average.

    Args:
        values: Per-episode values to chart (the training loop passes the
            list of episode durations).
        moving_avg_period: Window length used for the moving-average curve.
        episode: Current episode index; accepted but not used in the body.
    """
    # Always reuse figure 2 and wipe it, so each call replaces the old chart.
    plt.figure(2)
    plt.clf()

    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')

    # Raw series first, then the smoothed curve on the same axes.
    plt.plot(values)
    smoothed = get_moving_average(moving_avg_period, values)
    plt.plot(smoothed)

    # A very short pause gives the backend a chance to draw.
    plt.pause(0.0001)

    print("Episode", len(values), "\n", moving_avg_period, "episode moving avg:", smoothed[-1])

    # wait=True postpones the clear until the next output arrives.
    if is_ipython:
        display.clear_output(wait=True)
def get_moving_average(period, values):
    """Return the trailing moving average of ``values`` as a NumPy array.

    Args:
        period: Window length of the average.
        values: Sequence of numbers to smooth.

    Returns:
        A float array with one entry per input value. The first
        ``period - 1`` entries are zero; when fewer than ``period`` values
        are supplied, every entry is zero.
    """
    series = torch.tensor(values, dtype=torch.float)

    # Not enough data for a single full window yet.
    if len(series) < period:
        return torch.zeros(len(series)).numpy()

    # Each row of `windows` is one sliding window of length `period`.
    windows = series.unfold(dimension=0, size=period, step=1)
    averages = windows.mean(dim=1).flatten(start_dim=0)

    # Left-pad with zeros so the output lines up with the input.
    padded = torch.cat((torch.zeros(period - 1), averages))
    return padded.numpy()

In [11]:
# Transition record pushed into replay memory and sampled for training.
# Mirrors the definition given in an earlier cell of this notebook.
Experience = namedtuple('Experience', ['state', 'action', 'next_state', 'reward'])

In [12]:
# Training loop: one environment episode per outer iteration.
episode_durations = []
for episode in range(num_episodes):
    # num_episodes is an int, so joining it to a str with "+" raises
    # TypeError; an f-string formats both numbers safely.
    print(f"Episode {episode}/{num_episodes}")
    mem.reset()
    state = mem.get_state()

    for timestep in count():
        action = agent.select_action(state, policy_net)
        # Keep re-selecting until the action index is within the number of
        # actions the environment reports as available.
        while action.item() >= mem.num_actions_available():
            action = agent.select_action(state, policy_net)
        reward = mem.take_action(action)
        next_state = mem.get_state()
        memory.push(Experience(state, action, next_state, reward))
        state = next_state

        # Only learn once the replay memory can supply a full batch.
        if memory.can_provide_sample(batch_size):
            experiences = memory.sample(batch_size)
            # NOTE(review): the unpack order (states, actions, rewards,
            # next_states) must match what Utils.extract_tensors returns;
            # it differs from the Experience field order -- confirm.
            states, actions, rewards, next_states = utils.extract_tensors(Experience, experiences)
            current_q_values = QValueCalculator.get_current(policy_net, states, actions)
            next_q_values = QValueCalculator.get_next(batch_size, target_net, next_states)
            # Target: discounted next-state value plus the observed reward.
            target_q_values = (next_q_values * gamma) + rewards

            loss = F.mse_loss(current_q_values, target_q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if mem.done:
            # Record how many timesteps the episode lasted and redraw the chart.
            episode_durations.append(timestep)
            plot(episode_durations, 100, episode)
            break

    # Periodically sync the target network with the policy network.
    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())

mem.close()

Episode 0
INFO:marlo.base_env_builder:Waiting for mission to start...
INFO:marlo.base_env_builder:Mission Running
INFO:marlo.base_env_builder:Agent missed 5 observation(s).
INFO:marlo.base_env_builder:Agent missed 2 observation(s).
INFO:marlo.base_env_builder:Agent missed 21 observation(s).
INFO:marlo.base_env_builder:Agent missed 1 observation(s).
INFO:marlo.base_env_builder:Agent missed 1 observation(s).
Traceback (most recent call last):
  File "C:\Users\danie\anaconda3\envs\dissertation\lib\site-packages\IPython\core\interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-b0cdb1827720>", line 11, in <module>
    reward = mem.take_action(action)
  File "../classes\MinecraftEnvironmentManager.py", line 57, in take_action
    _, reward, self.done, _ = self.env.step(action.item())
  File "C:\Users\danie\anaconda3\envs\dissertation\lib\site-packages\marlo\base_env_builder.py", line 906, in step
    return self.step_wra

TypeError: object of type 'NoneType' has no len()