In [1]:
import os
import shutil
import pommerman
import numpy as np
import tensorflow as tf

from pommerman.agents import BaseAgent, SimpleAgent
from pommerman.configs import ffa_v0_env
from pommerman.constants import BOARD_SIZE
from pommerman.envs.v0 import Pomme
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from keras.utils import to_categorical
# Immitation Learning: learn a mapping from observations to actions.
from pomm_network import PommNetwork

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
batch_size = 16384
epochs = 25
initial_rollouts = 120
num_rollouts = 50
iterations = 100
early_stopping = 6

In [3]:
class Agent:
    def __init__(self, actions, seed=0, save_path="./model/model.cptk", log_path='./logs/', save_best_model=True, learning_rate=1e-3):
        self.save_path = save_path
        self.actions = actions
        self.save_best_model = save_best_model
        self.seed = seed
        self.sess = tf.InteractiveSession()
        self.rewards = []
        self.cur_epoch = 0
        if os.path.isdir(log_path):
            try:
                os.removedirs(log_path)
            except:
                print("Cant delete log folder")

        # TODO hardcoded
        self.conv_ph = tf.placeholder(shape=[None, BOARD_SIZE, BOARD_SIZE, 3], name='conv_ph', dtype=tf.float32)
        self.state_ph = tf.placeholder(shape=[None, 3], name='state_ph', dtype=tf.float32)
        self.logits_ph = tf.placeholder(shape=[None, actions], name='logits_ph', dtype=tf.int32)

        network = PommNetwork.create_network({'board': self.conv_ph, 'state': self.state_ph})
        logits = tf.layers.dense(network, actions)
        self.sampled_action = tf.squeeze(tf.multinomial(logits, 1), axis=[1])
        sy_logprob_n = tf.nn.softmax_cross_entropy_with_logits(labels=self.logits_ph, logits=logits)
        self.loss = tf.reduce_mean(sy_logprob_n)  # Loss function that we'll differentiate to get the policy gradient.
        self.train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)
        self.train_writer = tf.summary.FileWriter(log_path + "train", self.sess.graph)
        self.test_writer = tf.summary.FileWriter(log_path + "test")
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        if os.path.isdir(os.path.dirname(self.save_path)):
            try:
                print("Trying to load model")
                self.saver.restore(self.sess, self.save_path)
                print("Model was loaded successful")
            except:
                print("Model load failed")

    def __run_batch(self, X, y, batch_size, training=True):
        accuracies = []
        losses = []
        size = X.shape[-1]
        batches = int(np.ceil(size / batch_size))
        for i in range(batches):
            X_batch, y_batch = None, None
            if i == batches - 1:
                X_batch = X[i * batch_size:]
                y_batch = y[i * batch_size:]
            else:
                X_batch = X[i * batch_size: (i + 1) * batch_size]
                y_batch = y[i * batch_size: (i + 1) * batch_size]
            board = []
            state = []
            for i in range(X_batch.shape[-1]):
                val = self.featurize(X_batch[i])
                board.append(val['board'])
                state.append(val['state'])
            y_batch = np.array(y_batch).reshape(-1)
            feed_dict = {self.conv_ph: board, self.state_ph: state, self.logits_ph: to_categorical(y_batch, self.actions)}
            if training:
                _, loss, actions = self.sess.run([self.train_step, self.loss, self.sampled_action], feed_dict=feed_dict)
            else:
                loss, actions = self.sess.run([self.loss, self.sampled_action], feed_dict=feed_dict)
            accuracies.append(accuracy_score(actions, y_batch))
            losses.append(loss)
        return np.mean(accuracies), np.mean(losses)

    def train(self, obs, labels, batch_size, epochs, early_stopping = 6):
        print("Train the agent with %i training data, batch_size %i, epochs %i" % (obs.shape[0], batch_size, epochs))
        train_obs, val_obs, train_labels, val_labels = train_test_split(obs, labels, test_size=0.2, random_state=self.seed)
        best_loss = np.inf
        not_improved = 0
        for i in range(epochs):
            train_acc, train_loss = self.__run_batch(train_obs, train_labels, batch_size)
            val_acc, val_loss = self.__run_batch(val_obs, val_labels, batch_size, training=False)
            print("Epoch %d: train_acc %f, train_loss %f, test_acc %f, test_loss: %f" % (i + 1, train_acc, train_loss, val_acc, val_loss))

            summary = tf.summary.Summary()
            summary.value.add(tag="Train acc", simple_value=train_acc)
            summary.value.add(tag="Train loss", simple_value=train_loss)
            self.train_writer.add_summary(summary, self.cur_epoch)
            summary.value.add(tag="Val acc", simple_value=val_acc)
            summary.value.add(tag="Val loss", simple_value=val_loss)
            self.test_writer.add_summary(summary, self.cur_epoch)
            self.cur_epoch += 1
            if not_improved > early_stopping:
                print("Early stopping")
                break
            try:
                if self.save_best_model:
                    if val_loss < best_loss:
                        not_improved = 0
                        print("Saving model")
                        self.saver.save(self.sess, self.save_path)
                        print("Model was saved successfully")
                        best_loss = val_loss
                    else:
                        not_improved += 1
                else:
                    print("Saving model")
                    self.saver.save(self.sess, self.save_path)
                    print("Model was saved successfully")
            except:
                print("Failed save model")

    @staticmethod
    def featurize(obs):
        def get_matrix(dict, key):
            res = dict[key]
            return res.reshape(res.shape[0], res.shape[1], 1).astype(np.float32)

        board = get_matrix(obs, 'board')
        teammate_position = None
        teammate = obs["teammate"]
        if teammate is not None:
            teammate = teammate.value
            if teammate > 10 and teammate < 15:
                teammate_position = np.argwhere(board == teammate)[0]
        else:
            teammate = None
        # My self - 11
        # Team mate - 12
        # Enemy - 13

        # Everyone enemy
        board[(board > 10) & (board < 15)] = 13
        # I'm not enemy
        my_position = obs['position']
        board[my_position[0], my_position[1], 0] = 11
        # Set teammate
        if teammate_position is not None:
            board[teammate_position[0], teammate_position[1], teammate_position[2]] = 12

        bomb_blast_strength = get_matrix(obs, 'bomb_blast_strength')
        bomb_life = get_matrix(obs, 'bomb_life')
        conv_inp = np.concatenate([board, bomb_blast_strength, bomb_life], axis=2)
        state = np.array([obs["ammo"], obs["blast_strength"], obs["can_kick"]]).astype(np.float32)
        return dict(board=conv_inp, state=state)

    def clear_training_history(self):
        self.history = []

    def act(self, obs):
        obs = self.featurize(obs)
        board = np.expand_dims(obs['board'], axis=0)
        state = np.expand_dims(obs['state'], axis=0)
        res = self.sess.run(self.sampled_action, feed_dict={self.conv_ph: board, self.state_ph: state})
        return res

    def record_reward(self, reward):
        self.rewards.append(np.mean(reward))

    def close(self):
        self.sess.close()

In [4]:
# Simple wrapper around policy function to have an act function
class Expert:
    def __init__(self, config):
        self.__agent = SimpleAgent(config)

    def act(self, obs):
        return self.__agent.act(obs, None)

    def record_reward(self, reward):
        pass


class TensorforceAgent(BaseAgent):
    def act(self, obs, action_space):
        pass


# Environment wrapper
class Stimulator:
    def __init__(self, env, config):
        self.env = env
        self.init(config)

    def init(self, config):
        self.env.seed(0)
        # Add 3 random agents
        agents = []
        for agent_id in range(3):
            agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

        # Add TensorforceAgent
        agent_id += 1
        agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
        self.env.set_agents(agents)
        self.env.set_training_agent(agents[-1].agent_id)
        self.env.set_init_game_state(None)

    def stimulate(self, agent, num_rollouts, render):
        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            obs = self.env.reset()[self.env.training_agent]
            done = False
            totalr = 0.
            steps = 0
            while not done:
                if render:
                    self.env.render()

                action = agent.act(obs)
                obs = self.env.get_observations()
                all_actions = self.env.act(obs)
                all_actions.insert(self.env.training_agent, action)
                state, reward, done, _ = self.env.step(all_actions)

                obs = state[self.env.training_agent]
                r = reward[self.env.training_agent]
                totalr += r
                steps += 1

                observations.append(obs)
                actions.append(action)

            print('rollout %i/%i return=%f' % (i + 1, num_rollouts, totalr))
            returns.append(totalr)
        print('Return summary: mean=%f, std=%f' % (np.mean(returns), np.std(returns)))
        agent.record_reward(returns)
        return (np.array(observations), np.array(actions))

    def label_obs(self, expert, obs):
        actions = []
        for o in obs:
            actions.append(expert.act(o))
        return np.array(actions)

In [5]:
# Instantiate the environment
config = ffa_v0_env()
env = Pomme(**config["env_kwargs"])
states = {
    "board": dict(shape=(BOARD_SIZE, BOARD_SIZE, 3,), type='float'),
    "state": dict(shape=(3,), type='float')
}
agent_dagger = Agent(env.action_space.n)
# Load Expert
expert = Expert(config["agent"](0, config["game_type"]))

# Generate training data
stimulator = Stimulator(env, config)
training_data = stimulator.stimulate(expert, num_rollouts=initial_rollouts, render=False)
# Train DAgger Agent
obs = training_data[0]
labls = training_data[1]
for i in range(2, iterations):
    print("Train with DAgger, iter %i" % i)
    (stimulated_env, _) = stimulator.stimulate(agent_dagger, num_rollouts=num_rollouts, render=False)
    labels = stimulator.label_obs(expert, stimulated_env)
    obs = np.append(obs, stimulated_env, axis=0)
    labls = np.append(labls, labels, axis=0)
    agent_dagger.train(obs, labls, batch_size=batch_size, epochs=epochs, early_stopping=early_stopping)
agent_dagger.close()
env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Cant delete log folder
Trying to load model
INFO:tensorflow:Restoring parameters from ./model/model.cptk
Model was loaded successful
rollout 1/120 return=-1.000000
rollout 2/120 return=-1.000000
rollout 3/120 return=-1.000000
rollout 4/120 return=-1.000000
rollout 5/120 return=-1.000000
rollout 6/120 return=-1.000000
rollout 7/120 return=-1.000000
rollout 8/120 return=-1.000000
rollout 9/120 return=1.000000
rollout 10/120 return=-1.000000
rollout 11/120 return=1.000000
rollout 12/120 return=-1.000000
rollout 13/120 return=-1.000000
rollout 14/120 return=-1.000000
rollout 15/120 return=-1.000000
rollout 16/120 return=-1.000000
rollout 17/120 return=-1.000000
rollout 18/120 return=-1.000000
rollout 19/120 return=-1.000000
rollout 20/120 return=1.000000
rollout 21/120 return=-1.000000
rollout 22/120 return=-1.000000
rollout 23/120 return=-1.000000
rollout 24/120 return=-1.000000
rol

rollout 2/50 return=-1.000000
rollout 3/50 return=-1.000000
rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 re

rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 return=-1.000000
rollout 35/50 return=-1.000000
rollout 36/50 

rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 return=-1.000000
rollout 35/50 return=-1.000000
rollout 36/50 return=-1.000000
rollout 37/50 return=-1.000000
rollout 38/50 return=-1.000000
rollout 39/50 return=-1.000000
rollout 40/50 return=-1.000000
rollout 41/50 return=-1.000000
rollout 42/50 return=-1.000000
rollout 43/50 return=-1.000000
rollout 44/50 return=-1.000000
rollout 45/50 return=-1.000000
rollout 46/50 return=-1.000000
rollout 

Train the agent with 109733 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.822177, train_loss 0.389930, test_acc 0.774444, test_loss: 0.556122
Saving model
Model was saved successfully
Epoch 2: train_acc 0.826417, train_loss 0.314726, test_acc 0.772136, test_loss: 0.552621
Saving model
Model was saved successfully
Epoch 3: train_acc 0.833752, train_loss 0.291308, test_acc 0.781459, test_loss: 0.552148
Saving model
Model was saved successfully
Epoch 4: train_acc 0.849002, train_loss 0.264888, test_acc 0.776272, test_loss: 0.558774
Epoch 5: train_acc 0.855233, train_loss 0.250406, test_acc 0.781209, test_loss: 0.588486
Epoch 6: train_acc 0.858694, train_loss 0.245655, test_acc 0.784304, test_loss: 0.591841
Epoch 7: train_acc 0.863253, train_loss 0.235710, test_acc 0.774827, test_loss: 0.631118
Epoch 8: train_acc 0.862542, train_loss 0.237524, test_acc 0.776620, test_loss: 0.643035
Epoch 9: train_acc 0.862136, train_loss 0.240962, test_acc 0.777635, test_loss: 0.645302
Ep

rollout 50/50 return=-1.000000
Return summary: mean=-1.000000, std=0.000000
{(0, 4): None, (0, 7): None, (0, 10): None, (0, 11): None, (0, 12): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (1, 12): None, (2, 10): None, (2, 11): None, (3, 7): None, (3, 9): None, (3, 10): None, (3, 11): None, (4, 8): None, (4, 10): None, (5, 9): None, (6, 10): None, (6, 11): None, (7, 11): None} None (0, 12)
{(0, 4): None, (0, 7): None, (0, 10): None, (0, 11): None, (0, 12): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (1, 12): None, (2, 10): None, (2, 11): None, (3, 7): None, (3, 9): None, (3, 10): None, (3, 11): None, (4, 8): None, (4, 10): None, (5, 9): None, (6, 10): None, (6, 11): None, (7, 11): None} None (0, 12)
{(0, 4): None, (0, 7): None, (0, 10): None, (0, 11): None, (0, 12): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): 

Train the agent with 123021 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.866520, train_loss 0.271013, test_acc 0.809266, test_loss: 0.516386
Saving model
Model was saved successfully
Epoch 2: train_acc 0.868013, train_loss 0.276689, test_acc 0.793543, test_loss: 0.525112
Epoch 3: train_acc 0.864636, train_loss 0.260365, test_acc 0.798385, test_loss: 0.498176
Saving model
Model was saved successfully
Epoch 4: train_acc 0.876090, train_loss 0.226766, test_acc 0.802562, test_loss: 0.504665
Epoch 5: train_acc 0.882799, train_loss 0.216306, test_acc 0.804628, test_loss: 0.511692
Epoch 6: train_acc 0.887251, train_loss 0.202827, test_acc 0.801800, test_loss: 0.526292
Epoch 7: train_acc 0.890268, train_loss 0.196087, test_acc 0.804788, test_loss: 0.533522
Epoch 8: train_acc 0.894721, train_loss 0.181543, test_acc 0.802747, test_loss: 0.537990
Epoch 9: train_acc 0.900225, train_loss 0.172549, test_acc 0.809019, test_loss: 0.553458
Epoch 10: train_acc 0.902980, train_loss 0.1

Model was saved successfully
Epoch 3: train_acc 0.889774, train_loss 0.189043, test_acc 0.838640, test_loss: 0.411794
Saving model
Model was saved successfully
Epoch 4: train_acc 0.893152, train_loss 0.182321, test_acc 0.839294, test_loss: 0.454961
Epoch 5: train_acc 0.892388, train_loss 0.184004, test_acc 0.834524, test_loss: 0.464615
Epoch 6: train_acc 0.892207, train_loss 0.187672, test_acc 0.836746, test_loss: 0.431425
Epoch 7: train_acc 0.892731, train_loss 0.186465, test_acc 0.828946, test_loss: 0.483049
Epoch 8: train_acc 0.889675, train_loss 0.193953, test_acc 0.833862, test_loss: 0.453918
Epoch 9: train_acc 0.891625, train_loss 0.190304, test_acc 0.829937, test_loss: 0.478344
Epoch 10: train_acc 0.893155, train_loss 0.186270, test_acc 0.836118, test_loss: 0.466644
Epoch 11: train_acc 0.897447, train_loss 0.177737, test_acc 0.831802, test_loss: 0.468879
Early stopping
Train with DAgger, iter 16
rollout 1/50 return=-1.000000
rollout 2/50 return=-1.000000
rollout 3/50 return=-1.0

Epoch 5: train_acc 0.913918, train_loss 0.149713, test_acc 0.859099, test_loss: 0.374955
Epoch 6: train_acc 0.915330, train_loss 0.148665, test_acc 0.856111, test_loss: 0.414974
Epoch 7: train_acc 0.910218, train_loss 0.156446, test_acc 0.855022, test_loss: 0.393359
Epoch 8: train_acc 0.911981, train_loss 0.153001, test_acc 0.851627, test_loss: 0.409398
Epoch 9: train_acc 0.910106, train_loss 0.160607, test_acc 0.854346, test_loss: 0.388653
Early stopping
Train with DAgger, iter 19
rollout 1/50 return=-1.000000
rollout 2/50 return=-1.000000
rollout 3/50 return=-1.000000
rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.00

Epoch 8: train_acc 0.919675, train_loss 0.133090, test_acc 0.871348, test_loss: 0.351870
Epoch 9: train_acc 0.921086, train_loss 0.132563, test_acc 0.872012, test_loss: 0.365437
Epoch 10: train_acc 0.921702, train_loss 0.131704, test_acc 0.870879, test_loss: 0.373146
Epoch 11: train_acc 0.920148, train_loss 0.133277, test_acc 0.871485, test_loss: 0.367631
Early stopping
Train with DAgger, iter 22
rollout 1/50 return=-1.000000
rollout 2/50 return=-1.000000
rollout 3/50 return=-1.000000
rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return

rollout 1/50 return=-1.000000
rollout 2/50 return=-1.000000
rollout 3/50 return=-1.000000
rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 ret

rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 return=-1.000000
rollout 35/50 return=-1.000000
rollout 36/50 return=-1.000000
rollout 37/50 return=-1.000000
rollout 38/50 return=-1.000000
rollout 39/

Train the agent with 184029 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.918830, train_loss 0.178464, test_acc 0.887633, test_loss: 0.295737
Saving model
Model was saved successfully
Epoch 2: train_acc 0.920825, train_loss 0.137692, test_acc 0.884517, test_loss: 0.299446
Epoch 3: train_acc 0.923900, train_loss 0.128137, test_acc 0.888055, test_loss: 0.298128
Epoch 4: train_acc 0.926558, train_loss 0.123560, test_acc 0.888346, test_loss: 0.305721
Epoch 5: train_acc 0.928892, train_loss 0.119501, test_acc 0.888452, test_loss: 0.308121
Epoch 6: train_acc 0.930198, train_loss 0.118580, test_acc 0.890232, test_loss: 0.303077
Epoch 7: train_acc 0.931187, train_loss 0.115226, test_acc 0.888053, test_loss: 0.315575
Epoch 8: train_acc 0.931468, train_loss 0.115052, test_acc 0.889108, test_loss: 0.321463
Epoch 9: train_acc 0.930853, train_loss 0.114040, test_acc 0.886926, test_loss: 0.317705
Early stopping
Train with DAgger, iter 30
rollout 1/50 return=-1.000000
rollout 2/50 r

rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 return=-1.000000
rollout 35/50 return=-1.000000
rollout 36/50 

rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 return=-1.000000
rollout 35/50 return=-1.000000
rollout 36/50 return=-1.000000
rollout 37/50 return=-1.000000
rollout 38/50 return=-1.000000
rollout 39/50 return=-1.000000
rollout 40/50 return=-1.000000
rollout 41/50 return=-1.000000
rollout 42/50 return=-1.000000
rollout 43/50 return=-1.000000
rollout 44/50 return=-1.000000
rollout 45/50 return=-1.000000
rollout 46/50 return=-1.000000
rollout 47/50 return=-1.000000
rollout 48/50 return=-1.000000
rollout 49/50 return=-1.000000
rollout 50/50 return=-1.000000
Return summary: mean=-1.000000, std=0.000000
Train the agent with 203938 training data, batch_size 16384, epochs 25
Epoch 1: train_a

rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 return=-1.000000
rollout 35/50 return=-1.000000
rollout 36/50 return=-1.000000
rollout 37/50 return=-1.000000
rollout 38/50 return=-1.000000
rollout 39/50 return=-1.000000
rollout 40/50 return=-1.000000
rollout 41/50 return=-1.000000
rollout 42/50 return=-1.000000
rollout 43/50 return=-1.000000
rollout 44/50 return=-1.000000
rollout 45/50 return=-1.000000
rollout 46/50 return=-1.000000
rollout 47/50 return=-1.000000
rollout 48/50 return=-1.000000
rollout 49/50 return=-1.000000
rollout 50/50 return=-1.000000
Return summary: mean=-1.000000, std=0.000000
Train the agent with 215960 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.928235, train_loss 0.146823, test_acc 0.897956, test_loss: 0.269519
Saving model
Model was saved successfully
Epoch 2: train_acc 0.930813, train_loss 0.121554, test_acc 0.898552, test_loss: 0.274669
Epoch 3: trai

Train the agent with 222103 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.934642, train_loss 0.126553, test_acc 0.904534, test_loss: 0.273353
Saving model
Model was saved successfully
Epoch 2: train_acc 0.936625, train_loss 0.110148, test_acc 0.903479, test_loss: 0.272935
Saving model
Model was saved successfully
Epoch 3: train_acc 0.937626, train_loss 0.106818, test_acc 0.903185, test_loss: 0.274049
Epoch 4: train_acc 0.938386, train_loss 0.105753, test_acc 0.900930, test_loss: 0.276644
Epoch 5: train_acc 0.937062, train_loss 0.106670, test_acc 0.904782, test_loss: 0.278631
Epoch 6: train_acc 0.939024, train_loss 0.103580, test_acc 0.901643, test_loss: 0.293028
Epoch 7: train_acc 0.939004, train_loss 0.104968, test_acc 0.904542, test_loss: 0.277886
Epoch 8: train_acc 0.940419, train_loss 0.101520, test_acc 0.902422, test_loss: 0.279649
Epoch 9: train_acc 0.940996, train_loss 0.099876, test_acc 0.902548, test_loss: 0.281310
Epoch 10: train_acc 0.942656, train_loss 0.0

rollout 41/50 return=-1.000000
rollout 42/50 return=-1.000000
rollout 43/50 return=-1.000000
rollout 44/50 return=-1.000000
rollout 45/50 return=-1.000000
rollout 46/50 return=-1.000000
rollout 47/50 return=-1.000000
rollout 48/50 return=-1.000000
rollout 49/50 return=-1.000000
rollout 50/50 return=-1.000000
Return summary: mean=-1.000000, std=0.000000
Train the agent with 228822 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.939309, train_loss 0.119204, test_acc 0.905642, test_loss: 0.267280
Saving model
Model was saved successfully
Epoch 2: train_acc 0.941860, train_loss 0.099673, test_acc 0.908100, test_loss: 0.264983
Saving model
Model was saved successfully
Epoch 3: train_acc 0.942684, train_loss 0.097013, test_acc 0.906828, test_loss: 0.273284
Epoch 4: train_acc 0.943626, train_loss 0.096318, test_acc 0.906020, test_loss: 0.280204
Epoch 5: train_acc 0.942536, train_loss 0.096881, test_acc 0.903619, test_loss: 0.282701
Epoch 6: train_acc 0.943035, train_loss 0.097

Train the agent with 238282 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.937358, train_loss 0.119167, test_acc 0.907129, test_loss: 0.265638
Saving model
Model was saved successfully
Epoch 2: train_acc 0.939213, train_loss 0.104454, test_acc 0.907221, test_loss: 0.281116
Epoch 3: train_acc 0.941844, train_loss 0.099178, test_acc 0.905750, test_loss: 0.277674
Epoch 4: train_acc 0.943554, train_loss 0.097962, test_acc 0.904796, test_loss: 0.271834
Epoch 5: train_acc 0.942624, train_loss 0.097470, test_acc 0.908556, test_loss: 0.277919
Epoch 6: train_acc 0.943570, train_loss 0.095593, test_acc 0.906796, test_loss: 0.289363
Epoch 7: train_acc 0.944431, train_loss 0.097000, test_acc 0.905526, test_loss: 0.279685
Epoch 8: train_acc 0.943237, train_loss 0.097199, test_acc 0.905664, test_loss: 0.285905
Epoch 9: train_acc 0.943598, train_loss 0.095616, test_acc 0.906594, test_loss: 0.298945
Early stopping
Train with DAgger, iter 46
rollout 1/50 return=-1.000000
rollout 2/50 r

Model was saved successfully
Epoch 2: train_acc 0.942705, train_loss 0.098766, test_acc 0.907923, test_loss: 0.267955
Epoch 3: train_acc 0.943521, train_loss 0.097590, test_acc 0.909918, test_loss: 0.258429
Epoch 4: train_acc 0.944359, train_loss 0.095150, test_acc 0.915034, test_loss: 0.248071
Saving model
Model was saved successfully
Epoch 5: train_acc 0.945724, train_loss 0.092160, test_acc 0.911738, test_loss: 0.268630
Epoch 6: train_acc 0.945574, train_loss 0.093828, test_acc 0.912310, test_loss: 0.274740
Epoch 7: train_acc 0.945080, train_loss 0.094069, test_acc 0.908709, test_loss: 0.274236
Epoch 8: train_acc 0.944818, train_loss 0.094392, test_acc 0.912589, test_loss: 0.255212
Epoch 9: train_acc 0.946761, train_loss 0.092580, test_acc 0.908217, test_loss: 0.269642
Epoch 10: train_acc 0.946303, train_loss 0.092443, test_acc 0.909156, test_loss: 0.278088
Epoch 11: train_acc 0.946928, train_loss 0.091780, test_acc 0.909068, test_loss: 0.277774
Epoch 12: train_acc 0.946812, train_l

Train the agent with 260986 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.929575, train_loss 0.153897, test_acc 0.905991, test_loss: 0.259744
Saving model
Model was saved successfully
Epoch 2: train_acc 0.936786, train_loss 0.110196, test_acc 0.908009, test_loss: 0.260110
Epoch 3: train_acc 0.943195, train_loss 0.096659, test_acc 0.907503, test_loss: 0.265701
Epoch 4: train_acc 0.946296, train_loss 0.091073, test_acc 0.911602, test_loss: 0.262338
Epoch 5: train_acc 0.948378, train_loss 0.087435, test_acc 0.911409, test_loss: 0.277683
Epoch 6: train_acc 0.948772, train_loss 0.087062, test_acc 0.910955, test_loss: 0.286952
Epoch 7: train_acc 0.948614, train_loss 0.086591, test_acc 0.913219, test_loss: 0.279276
Epoch 8: train_acc 0.949926, train_loss 0.085382, test_acc 0.911163, test_loss: 0.274200
Epoch 9: train_acc 0.950135, train_loss 0.083475, test_acc 0.912797, test_loss: 0.272649
Early stopping
Train with DAgger, iter 52
rollout 1/50 return=-1.000000
rollout 2/50 r

Epoch 4: train_acc 0.949942, train_loss 0.082407, test_acc 0.916850, test_loss: 0.252050
Epoch 5: train_acc 0.949608, train_loss 0.081895, test_acc 0.917519, test_loss: 0.253490
Epoch 6: train_acc 0.950887, train_loss 0.082059, test_acc 0.917171, test_loss: 0.253025
Epoch 7: train_acc 0.950380, train_loss 0.081912, test_acc 0.917371, test_loss: 0.266051
Epoch 8: train_acc 0.950372, train_loss 0.081970, test_acc 0.917109, test_loss: 0.268053
Epoch 9: train_acc 0.950364, train_loss 0.082232, test_acc 0.915166, test_loss: 0.264896
Early stopping
Train with DAgger, iter 55
rollout 1/50 return=-1.000000
rollout 2/50 return=-1.000000
rollout 3/50 return=-1.000000
rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000

rollout 1/50 return=-1.000000
rollout 2/50 return=-1.000000
rollout 3/50 return=-1.000000
rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 ret

Epoch 8: train_acc 0.953143, train_loss 0.076435, test_acc 0.918823, test_loss: 0.267109
Epoch 9: train_acc 0.953676, train_loss 0.076134, test_acc 0.917476, test_loss: 0.252091
Early stopping
Train with DAgger, iter 60
rollout 1/50 return=-1.000000
rollout 2/50 return=-1.000000
rollout 3/50 return=-1.000000
rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 

rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 return=-1.000000
rollout 35/50 return=-1.000000
rollout 36/50 return=-1.000000
rollout 37/50 return=-1.000000
rollout 38/50 return=-1.000000
rollout 39/50 return=-1.000000
rollout 40/50 return=-1.000000
rollout 41/50 return=-1.000000
rollout 42/50 return=-1.000000
rollout 43/50 return=-1.000000
rollout 44/50 return=-1.000000
rollout 

rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 return=-1.000000
rollout 35/50 return=-1.000000
rollout 36/50 return=-1.000000
rollout 37/50 return=-1.000000
rollout 38/50 return=-1.000000
rollout 39/50 return=-1.000000
rollout 40/50 return=-1.000000
rollout 41/50 return=-1.000000
rollout 42/50 return=-1.000000
rollout 43/50 return=-1.000000
rollout 44/50 return=-1.000000
rollout 45/50 return=-1.000000
rollout 46/50 return=-1.000000
rollout 47/50 return=-1.000000
rollout 48/50 return=-1.000000
rollout 49/50 return=-1.000000
rollout 50/50 return=-1.000000
Return summary: mean=-1.000000, std=0.000000
Train the agent with 311926 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.949879, train_loss 0.094340, test_acc 0.920567, test_loss: 0.239770
Saving model
Model w

rollout 41/50 return=-1.000000
rollout 42/50 return=-1.000000
rollout 43/50 return=-1.000000
rollout 44/50 return=-1.000000
rollout 45/50 return=-1.000000
rollout 46/50 return=-1.000000
rollout 47/50 return=-1.000000
rollout 48/50 return=-1.000000
rollout 49/50 return=-1.000000
rollout 50/50 return=-1.000000
Return summary: mean=-1.000000, std=0.000000
Train the agent with 322924 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.945261, train_loss 0.105921, test_acc 0.919539, test_loss: 0.245544
Saving model
Model was saved successfully
Epoch 2: train_acc 0.949400, train_loss 0.084883, test_acc 0.919042, test_loss: 0.239609
Saving model
Model was saved successfully
Epoch 3: train_acc 0.952332, train_loss 0.080114, test_acc 0.916651, test_loss: 0.257271
Epoch 4: train_acc 0.951689, train_loss 0.081486, test_acc 0.916618, test_loss: 0.265421
Epoch 5: train_acc 0.950077, train_loss 0.084232, test_acc 0.911988, test_loss: 0.303364
Epoch 6: train_acc 0.949295, train_loss 0.086

{(0, 0): None, (0, 1): None, (0, 3): None, (0, 4): None, (0, 5): None, (0, 6): None, (0, 7): None, (0, 8): None, (0, 9): None, (0, 10): None, (1, 0): None, (1, 1): None, (1, 2): None, (1, 3): None, (1, 4): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (2, 1): None, (2, 2): None, (2, 3): None, (2, 6): None, (2, 8): None, (2, 9): None, (3, 3): None, (3, 4): None, (3, 6): None, (4, 1): None, (4, 3): None, (4, 4): None, (4, 6): None, (5, 5): None, (6, 1): None, (6, 2): None, (6, 3): None, (6, 4): None} None (0, 3)
{(0, 0): None, (0, 1): None, (0, 3): None, (0, 4): None, (0, 5): None, (0, 6): None, (0, 7): None, (0, 8): None, (0, 9): None, (0, 10): None, (1, 0): None, (1, 1): None, (1, 2): None, (1, 3): None, (1, 4): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (2, 1): None, (2, 2): None, (2, 3): None, (2, 6): None, (2, 8): None, (2, 9): None, (3, 3): None, (3, 4): None, (3, 6): None, (4, 1): None, (4

Train the agent with 327558 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.930292, train_loss 0.134754, test_acc 0.910017, test_loss: 0.254108
Saving model
Model was saved successfully
Epoch 2: train_acc 0.942402, train_loss 0.098068, test_acc 0.914688, test_loss: 0.244364
Saving model
Model was saved successfully
Epoch 3: train_acc 0.950160, train_loss 0.082880, test_acc 0.918352, test_loss: 0.252107
Epoch 10: train_acc 0.958381, train_loss 0.067191, test_acc 0.921252, test_loss: 0.291704
Early stopping
Train with DAgger, iter 70
rollout 1/50 return=-1.000000
rollout 2/50 return=-1.000000
rollout 3/50 return=-1.000000
rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=

{(0, 4): None, (0, 7): None, (1, 2): None, (1, 3): None, (1, 4): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (2, 3): None, (2, 4): None, (2, 5): None, (2, 7): None, (2, 8): None, (2, 9): None, (2, 10): None, (2, 12): None, (3, 4): None, (3, 5): None, (3, 7): None, (3, 8): None, (3, 9): None, (3, 10): None, (3, 11): None, (3, 12): None, (4, 6): None, (4, 7): None, (4, 9): None, (4, 11): None, (5, 9): None, (5, 11): None, (6, 7): None, (6, 9): None, (6, 10): None, (6, 11): None, (7, 8): None, (7, 9): None, (7, 10): None, (7, 11): None, (8, 9): None, (8, 11): None} None (1, 10)
{(0, 4): None, (0, 7): None, (1, 2): None, (1, 3): None, (1, 4): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (2, 3): None, (2, 4): None, (2, 5): None, (2, 7): None, (2, 8): None, (2, 9): None, (2, 10): None, (2, 12): None, (3, 4): None, (3, 5): None, (3, 7): None, (3, 8): None, (3, 9): None, (3, 10): None, (3, 11): None, (

Epoch 7: train_acc 0.957710, train_loss 0.069846, test_acc 0.924038, test_loss: 0.261281
Epoch 8: train_acc 0.957340, train_loss 0.070438, test_acc 0.923612, test_loss: 0.252732
Epoch 9: train_acc 0.957434, train_loss 0.071193, test_acc 0.922985, test_loss: 0.251790
Early stopping
Train with DAgger, iter 73
rollout 1/50 return=-1.000000
rollout 2/50 return=-1.000000
rollout 3/50 return=-1.000000
rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout 8/50 return=-1.000000
rollout 9/50 return=-1.000000
rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 retu

rollout 10/50 return=-1.000000
rollout 11/50 return=-1.000000
rollout 12/50 return=-1.000000
rollout 13/50 return=-1.000000
rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 return=-1.000000
rollout 35/50 return=-1.000000
rollout 36/50 return=-1.000000
rollout 37/50 return=-1.000000
rollout 38/50 return=-1.000000
rollout 39/50 return=-1.000000
rollout 40/50 return=-1.000000
rollout 41/50 return=-1.000000
rollout 

{(0, 7): None, (0, 8): None, (0, 9): None, (0, 10): None, (0, 12): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (2, 5): None, (2, 8): None, (2, 12): None, (3, 5): None, (3, 6): None, (3, 8): None, (3, 9): None, (3, 10): None, (3, 11): None, (3, 12): None, (4, 5): None, (4, 6): None, (4, 7): None, (4, 8): None, (4, 11): None, (4, 12): None, (5, 11): None, (5, 12): None, (6, 7): None, (6, 8): None, (6, 9): None, (6, 10): None, (6, 11): None, (7, 9): None, (7, 11): None, (7, 12): None, (8, 9): None, (8, 10): None, (8, 11): None, (8, 12): None, (9, 10): None, (9, 11): None, (10, 11): None} None (3, 12)
{(0, 7): None, (0, 8): None, (0, 9): None, (0, 10): None, (0, 12): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (2, 5): None, (2, 8): None, (2, 12): None, (3, 5): None, (3, 6): None, (3, 8): None, (3, 9): None, (3, 10): None, (3, 11): None, (3, 12): None, (4, 5): None, (4, 6): None, (4, 7): None, (4, 8): None, (4, 11): None, (4, 12): None, (5, 11): None, (5,

Train the agent with 353559 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.951994, train_loss 0.089670, test_acc 0.923853, test_loss: 0.241954
Saving model
Model was saved successfully
Epoch 2: train_acc 0.956970, train_loss 0.072068, test_acc 0.924181, test_loss: 0.241610
Saving model
Model was saved successfully
Epoch 3: train_acc 0.958717, train_loss 0.067894, test_acc 0.925127, test_loss: 0.247571
Epoch 4: train_acc 0.959747, train_loss 0.066059, test_acc 0.926396, test_loss: 0.262410
Epoch 5: train_acc 0.960654, train_loss 0.065199, test_acc 0.926024, test_loss: 0.276237
Epoch 6: train_acc 0.960220, train_loss 0.065561, test_acc 0.924801, test_loss: 0.275167
Epoch 7: train_acc 0.960617, train_loss 0.065505, test_acc 0.924616, test_loss: 0.261997
Epoch 8: train_acc 0.959788, train_loss 0.066342, test_acc 0.923865, test_loss: 0.264386
Epoch 9: train_acc 0.959575, train_loss 0.068349, test_acc 0.923888, test_loss: 0.277822
Epoch 10: train_acc 0.958598, train_loss 0.0

{(0, 2): None, (0, 4): None, (0, 5): None, (0, 7): None, (0, 8): None, (0, 9): None, (1, 2): None, (1, 3): None, (1, 4): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (1, 12): None, (2, 3): None, (2, 6): None, (2, 7): None, (2, 9): None, (2, 10): None, (2, 11): None, (3, 5): None, (3, 6): None, (3, 7): None, (3, 11): None, (4, 5): None, (4, 8): None, (4, 9): None, (4, 10): None, (5, 6): None, (5, 7): None, (5, 9): None, (5, 11): None, (6, 11): None, (7, 8): None, (7, 9): None, (7, 10): None} None (0, 9)
{(0, 2): None, (0, 4): None, (0, 5): None, (0, 7): None, (0, 8): None, (0, 9): None, (1, 2): None, (1, 3): None, (1, 4): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (1, 12): None, (2, 3): None, (2, 6): None, (2, 7): None, (2, 9): None, (2, 10): None, (2, 11): None, (3, 5): None, (3, 6): None, (3, 7): None, (3, 11): None, (4, 5): None, (4, 8): None, (4, 9): None, (4, 

Train the agent with 357311 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.953504, train_loss 0.088108, test_acc 0.924503, test_loss: 0.250768
Saving model
Model was saved successfully
Epoch 2: train_acc 0.955918, train_loss 0.071457, test_acc 0.926514, test_loss: 0.244256
Saving model
Model was saved successfully
Epoch 3: train_acc 0.959200, train_loss 0.067244, test_acc 0.926737, test_loss: 0.244129
Saving model
Model was saved successfully
Epoch 4: train_acc 0.960067, train_loss 0.065801, test_acc 0.926394, test_loss: 0.257521
Epoch 5: train_acc 0.960173, train_loss 0.065235, test_acc 0.927326, test_loss: 0.274258
Epoch 6: train_acc 0.960427, train_loss 0.065292, test_acc 0.926851, test_loss: 0.276541
Epoch 7: train_acc 0.959768, train_loss 0.065252, test_acc 0.926437, test_loss: 0.272848
Epoch 8: train_acc 0.960369, train_loss 0.065673, test_acc 0.926604, test_loss: 0.261151
Epoch 9: train_acc 0.960715, train_loss 0.064946, test_acc 0.926279, test_loss: 0.267352
Ep

Model was saved successfully
Epoch 2: train_acc 0.960027, train_loss 0.065485, test_acc 0.930864, test_loss: 0.251342
Epoch 3: train_acc 0.961598, train_loss 0.062625, test_acc 0.930165, test_loss: 0.260608
Epoch 4: train_acc 0.961772, train_loss 0.062131, test_acc 0.930427, test_loss: 0.263929
Epoch 5: train_acc 0.961642, train_loss 0.062177, test_acc 0.929939, test_loss: 0.271203
Epoch 6: train_acc 0.962042, train_loss 0.062844, test_acc 0.930048, test_loss: 0.271636
Epoch 7: train_acc 0.961312, train_loss 0.062943, test_acc 0.929026, test_loss: 0.266261
Epoch 8: train_acc 0.962000, train_loss 0.062711, test_acc 0.929767, test_loss: 0.267814
Epoch 9: train_acc 0.962070, train_loss 0.062329, test_acc 0.929385, test_loss: 0.270428
Early stopping
Train with DAgger, iter 81
rollout 1/50 return=-1.000000
rollout 2/50 return=-1.000000
rollout 3/50 return=-1.000000
rollout 4/50 return=-1.000000
rollout 5/50 return=-1.000000
rollout 6/50 return=-1.000000
rollout 7/50 return=-1.000000
rollout

rollout 14/50 return=-1.000000
rollout 15/50 return=-1.000000
rollout 16/50 return=-1.000000
rollout 17/50 return=-1.000000
rollout 18/50 return=-1.000000
rollout 19/50 return=-1.000000
rollout 20/50 return=-1.000000
rollout 21/50 return=-1.000000
rollout 22/50 return=-1.000000
rollout 23/50 return=-1.000000
rollout 24/50 return=-1.000000
rollout 25/50 return=-1.000000
rollout 26/50 return=-1.000000
rollout 27/50 return=-1.000000
rollout 28/50 return=-1.000000
rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 return=-1.000000
rollout 35/50 return=-1.000000
rollout 36/50 return=-1.000000
rollout 37/50 return=-1.000000
rollout 38/50 return=-1.000000
rollout 39/50 return=-1.000000
rollout 40/50 return=-1.000000
rollout 41/50 return=-1.000000
rollout 42/50 return=-1.000000
rollout 43/50 return=-1.000000
rollout 44/50 return=-1.000000
rollout 45/50 return=-1.000000
rollout 

rollout 29/50 return=-1.000000
rollout 30/50 return=-1.000000
rollout 31/50 return=-1.000000
rollout 32/50 return=-1.000000
rollout 33/50 return=-1.000000
rollout 34/50 return=-1.000000
rollout 35/50 return=-1.000000
rollout 36/50 return=-1.000000
rollout 37/50 return=-1.000000
rollout 38/50 return=-1.000000
rollout 39/50 return=-1.000000
rollout 40/50 return=-1.000000
rollout 41/50 return=-1.000000
rollout 42/50 return=-1.000000
rollout 43/50 return=-1.000000
rollout 44/50 return=-1.000000
rollout 45/50 return=-1.000000
rollout 46/50 return=-1.000000
rollout 47/50 return=-1.000000
rollout 48/50 return=-1.000000
rollout 49/50 return=-1.000000
rollout 50/50 return=-1.000000
Return summary: mean=-1.000000, std=0.000000
Train the agent with 386924 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.960770, train_loss 0.071246, test_acc 0.931732, test_loss: 0.254049
Saving model
Model was saved successfully
Epoch 2: train_acc 0.962594, train_loss 0.060974, test_acc 0.931742, te

rollout 48/50 return=-1.000000
rollout 49/50 return=-1.000000
rollout 50/50 return=-1.000000
Return summary: mean=-1.000000, std=0.000000
{(0, 5): None, (0, 8): None, (0, 9): None, (0, 11): None, (1, 4): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (1, 12): None, (2, 5): None, (2, 6): None, (2, 7): None, (2, 11): None, (2, 12): None, (3, 6): None, (3, 8): None, (3, 10): None, (3, 11): None, (4, 7): None, (4, 9): None, (5, 8): None, (5, 9): None, (5, 10): None, (5, 11): None, (6, 10): None, (6, 11): None, (7, 10): None, (7, 11): None, (7, 12): None, (8, 11): None} None (1, 12)
{(0, 5): None, (0, 8): None, (0, 9): None, (0, 11): None, (1, 4): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (1, 12): None, (2, 5): None, (2, 6): None, (2, 7): None, (2, 11): None, (2, 12): None, (3, 6): None, (3, 8): None, (3, 10): None, (3, 11): None, (4, 7): None, (4, 9): None, (5, 8): Non

{(0, 5): None, (0, 8): None, (0, 9): None, (0, 11): None, (1, 4): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (1, 12): None, (2, 5): None, (2, 6): None, (2, 7): None, (2, 11): None, (2, 12): None, (3, 6): None, (3, 8): None, (3, 10): None, (3, 11): None, (4, 7): None, (4, 9): None, (5, 8): None, (5, 9): None, (5, 10): None, (5, 11): None, (6, 10): None, (6, 11): None, (7, 10): None, (7, 11): None, (7, 12): None, (8, 11): None} None (1, 12)
{(0, 5): None, (0, 8): None, (0, 9): None, (0, 11): None, (1, 4): None, (1, 5): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (1, 12): None, (2, 5): None, (2, 6): None, (2, 7): None, (2, 11): None, (2, 12): None, (3, 6): None, (3, 8): None, (3, 10): None, (3, 11): None, (4, 7): None, (4, 9): None, (5, 8): None, (5, 9): None, (5, 10): None, (5, 11): None, (6, 10): None, (6, 11): None, (7, 10): None, (7, 11): None, (7, 12): None, (8, 11): None} N

rollout 39/50 return=-1.000000
rollout 40/50 return=-1.000000
rollout 41/50 return=-1.000000
rollout 42/50 return=-1.000000
rollout 43/50 return=-1.000000
rollout 44/50 return=-1.000000
rollout 45/50 return=-1.000000
rollout 46/50 return=-1.000000
rollout 47/50 return=-1.000000
rollout 48/50 return=-1.000000
rollout 49/50 return=-1.000000
rollout 50/50 return=-1.000000
Return summary: mean=-1.000000, std=0.000000
Train the agent with 408051 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.959508, train_loss 0.076433, test_acc 0.932398, test_loss: 0.263712
Saving model
Model was saved successfully
Epoch 2: train_acc 0.962832, train_loss 0.061293, test_acc 0.933000, test_loss: 0.260278
Saving model
Model was saved successfully
Epoch 3: train_acc 0.964836, train_loss 0.057105, test_acc 0.932716, test_loss: 0.276915
Epoch 4: train_acc 0.965131, train_loss 0.056499, test_acc 0.932785, test_loss: 0.278816
Epoch 5: train_acc 0.965859, train_loss 0.055586, test_acc 0.932795, tes

{(0, 2): None, (0, 5): None, (0, 7): None, (0, 9): None, (0, 10): None, (0, 11): None, (0, 12): None, (1, 2): None, (1, 3): None, (1, 4): None, (1, 5): None, (1, 12): None, (2, 4): None, (2, 5): None, (2, 6): None, (2, 7): None, (2, 11): None, (2, 12): None, (3, 4): None, (3, 6): None, (3, 10): None, (3, 11): None, (3, 12): None, (4, 6): None, (4, 8): None, (4, 9): None, (4, 10): None, (4, 11): None, (4, 12): None, (5, 8): None, (5, 10): None, (5, 11): None, (6, 9): None, (6, 11): None, (7, 9): None, (7, 10): None} None (0, 9)
Train the agent with 414778 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.962125, train_loss 0.068788, test_acc 0.933029, test_loss: 0.256902
Saving model
Model was saved successfully
Epoch 2: train_acc 0.963906, train_loss 0.058576, test_acc 0.933017, test_loss: 0.263906
Epoch 3: train_acc 0.965452, train_loss 0.057608, test_acc 0.932355, test_loss: 0.262248
Epoch 4: train_acc 0.965628, train_loss 0.055785, test_acc 0.932752, test_loss: 0.27131

rollout 46/50 return=-1.000000
rollout 47/50 return=-1.000000
rollout 48/50 return=-1.000000
rollout 49/50 return=-1.000000
rollout 50/50 return=-1.000000
Return summary: mean=-1.000000, std=0.000000
Train the agent with 425180 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.960941, train_loss 0.073698, test_acc 0.931291, test_loss: 0.252828
Saving model
Model was saved successfully
Epoch 2: train_acc 0.963949, train_loss 0.059308, test_acc 0.933449, test_loss: 0.258159
Epoch 3: train_acc 0.964818, train_loss 0.056762, test_acc 0.934892, test_loss: 0.261044
Epoch 4: train_acc 0.965829, train_loss 0.055735, test_acc 0.934797, test_loss: 0.285277
Epoch 5: train_acc 0.965820, train_loss 0.056150, test_acc 0.933907, test_loss: 0.309413
Epoch 6: train_acc 0.965860, train_loss 0.056028, test_acc 0.933920, test_loss: 0.307857
Epoch 7: train_acc 0.965560, train_loss 0.056609, test_acc 0.933971, test_loss: 0.294894
Epoch 8: train_acc 0.966399, train_loss 0.055852, test_acc 0.933

Train the agent with 429139 training data, batch_size 16384, epochs 25
Epoch 1: train_acc 0.960105, train_loss 0.077261, test_acc 0.933663, test_loss: 0.252931
Saving model
Model was saved successfully
Epoch 2: train_acc 0.964049, train_loss 0.059286, test_acc 0.935266, test_loss: 0.261972
Epoch 3: train_acc 0.965888, train_loss 0.055792, test_acc 0.935830, test_loss: 0.276925
Epoch 4: train_acc 0.966213, train_loss 0.054622, test_acc 0.934887, test_loss: 0.280509
Epoch 5: train_acc 0.966973, train_loss 0.054519, test_acc 0.935109, test_loss: 0.285005
Epoch 6: train_acc 0.966527, train_loss 0.054878, test_acc 0.934838, test_loss: 0.285556
Epoch 7: train_acc 0.966327, train_loss 0.055554, test_acc 0.932793, test_loss: 0.297169
Epoch 8: train_acc 0.965183, train_loss 0.057797, test_acc 0.933611, test_loss: 0.278429
Epoch 9: train_acc 0.964511, train_loss 0.058924, test_acc 0.932595, test_loss: 0.279724
Early stopping
Train with DAgger, iter 99
rollout 1/50 return=-1.000000
rollout 2/50 r