# platform

In [None]:
# Install the packages this notebook uses: TextWorld (plus its "vis" extras),
# gym and tensorboardX. Versions are not pinned here.
!pip install textworld
!pip install textworld[vis]
!pip install gym
!pip install tensorboardX

# LSTM-DRQN code

## config

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the game-generation commands further
# down write their output under "/content/drive/My Drive/capstone/".
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
from os.path import join as pjoin
import yaml
# Load the experiment configuration from ./capstone/config_drqn.yaml into the
# module-level `config` dict that later cells read (e.g. config['general']).
config_file = pjoin('./capstone/', 'config_drqn.yaml')
with open(config_file) as reader:
    config = yaml.safe_load(reader)

## Make Games

In [None]:

from google.colab import _shell
_shell.Shell.run_code
def makegamescmd(num,level):
    """Return the file names of the coin-collector training games.

    For every seed in ``1 .. num`` a ``tw-make`` command line is formatted and
    the matching ``.ulx`` file name is collected. The shell invocation itself
    is left commented out, so calling this function only builds the names.

    Args:
        num: number of games (seeds) to enumerate.
        level: coin-collector level inserted into the command and file name.

    Returns:
        list of ``num`` game file names (base names, without directory).
    """
    # Raw string: the backslash in "My\ Drive" escapes the space for the shell
    # and has to be kept verbatim. In a normal literal "\ " is an invalid
    # escape sequence (a warning today, an error in future Python versions).
    pattern=r'tw-make tw-coin_collector --level {} --seed {} --save-overview  --output /content/drive/My\ Drive/capstone/tw_games/twcc_easy_level{}_gamesize100_step50_seed{}_train-v0.ulx'
    games_files=[]
    start=1
    for i in range(start,num+start):
        # Full command line for this seed; uncomment the magic below in a
        # notebook cell to actually generate the game.
        gamename=pattern.format(level,i,level,i)
        # ! $gamename
        games_files.append('twcc_easy_level{}_gamesize100_step50_seed{}_train-v0.ulx'.format(level,i))
    return games_files
games_files=makegamescmd(100,10)


In [None]:
from google.colab import _shell
# from  google.colab import invokeFunction
_shell.Shell.run_code
def makegamescmd_TEST(num,level):
    """Return the file names of the coin-collector evaluation games.

    Same scheme as the training-game helper, but seeds start at 1000 and the
    command targets the ``tw_games_test`` directory. The shell invocation is
    left commented out, so calling this function only builds the names.

    Args:
        num: number of games (seeds) to enumerate.
        level: coin-collector level inserted into the command and file name.

    Returns:
        list of ``num`` game file names (base names, without directory).
    """
    # Raw string: the backslash in "My\ Drive" escapes the space for the shell
    # and has to be kept verbatim. In a normal literal "\ " is an invalid
    # escape sequence (a warning today, an error in future Python versions).
    pattern=r'tw-make tw-coin_collector --level {} --seed {} --save-overview  --output /content/drive/My\ Drive/capstone/tw_games_test/twcc_easy_level{}_gamesize100_step50_seed{}_train-v0.ulx'
    games_files=[]
    start=1000
    for i in range(start,num+start):
        # Full command line for this seed; uncomment the magic below in a
        # notebook cell to actually generate the game.
        gamename=pattern.format(level,i,level,i)
        # ! $gamename
        games_files.append('twcc_easy_level{}_gamesize100_step50_seed{}_train-v0.ulx'.format(level,i))
    return games_files
TEST_games_files=makegamescmd_TEST(20,10)

In [None]:

def testing(env_test,batch_size_test,agent):
    """Play one batch of evaluation games greedily and report mean scores.

    The agent acts with ``epsilon=0.0`` until every game in the batch reports
    done, then ``agent.finish()`` is called and the batch means are printed.

    Args:
        env_test: batched environment exposing ``reset()`` and ``step(commands)``.
        batch_size_test: number of games played in parallel by ``env_test``.
        agent: agent exposing ``reset``, ``get_game_step_info``,
            ``generate_one_command``, ``finish`` and the ``rewards`` /
            ``dones`` / ``intermediate_rewards`` lists.

    Returns:
        tuple ``(R, IR, S)``: means of ``agent.final_rewards``,
        ``agent.final_intermediate_rewards`` and ``agent.step_used_before_done``.
    """

    obs, infos = env_test.reset()
    obs=list(obs)
    agent.reset(infos)

    # Per-game traces of commands and rewards; only collected, never returned.
    print_command_string, print_rewards = [[] for _ in range(batch_size_test)], [[] for _ in range(batch_size_test)]
    print_interm_rewards = [[] for _ in range(batch_size_test)]

    # Reads the notebook-level `config` loaded in the config cell.
    provide_prev_action = config['general']['provide_prev_action']

    dones = [False] * batch_size_test
    rewards = None
    prev_actions = ["" for _ in range(batch_size_test)] if provide_prev_action else None

    input_description, _ = agent.get_game_step_info(obs, infos, prev_actions)

    # Recurrent state of the action scorer, threaded through the episode.
    curr_ras_hidden, curr_ras_cell = None, None

    while not all(dones):

        v_idx, n_idx, chosen_strings, curr_ras_hidden, curr_ras_cell = agent.generate_one_command(input_description, curr_ras_hidden, curr_ras_cell, epsilon=0.0)
        # print(chosen_strings)
        obs, rewards, dones, infos = env_test.step(chosen_strings)
        # A scalar done flag must be broadcast BEFORE the list() conversion
        # below (list(True) raises), and to the evaluation batch size.
        if type(dones) is bool:
            dones = [dones] * batch_size_test
        obs=list(obs)
        rewards=list(rewards)
        dones=list(dones)

        if provide_prev_action:
            prev_actions = chosen_strings

        for i in range(batch_size_test):
            print_command_string[i].append(chosen_strings[i])
            print_rewards[i].append(rewards[i])
            print_interm_rewards[i].append(infos["intermediate_reward"][i])
        agent.rewards.append(rewards)
        agent.dones.append(dones)
        agent.intermediate_rewards.append(infos["intermediate_reward"])

        input_description, _ = agent.get_game_step_info(obs, infos, prev_actions)

    agent.finish()
    R = agent.final_rewards.mean()
    S = agent.step_used_before_done.mean()
    IR = agent.final_intermediate_rewards.mean()

    msg = '====EVAL==== R={:.3f}, IR={:.3f}, S={:.3f}'
    msg = msg.format(R, IR, S)
    print(msg)
    print("\n")
    return (R, IR, S)




## dict2vect

In [None]:
import os
import numpy as np
from os.path import join as pjoin
from nltk.tokenize import word_tokenize as wt

import torch
from torch.autograd import Variable

from textworld.utils import maybe_mkdir
import time


class SlidingAverage(object):
    """Mean over the most recent ``steps`` values, with periodic snapshots.

    ``ns`` holds the current window, ``t`` counts every value ever added and
    ``avgs`` receives the window mean each time ``t`` is a multiple of ``steps``.
    """

    def __init__(self, name, steps=100):
        self.name, self.steps = name, steps
        self.t = 0
        self.ns, self.avgs = [], []

    def add(self, n):
        """Append ``n`` to the window, dropping the oldest value when full."""
        window = self.ns
        window.append(n)
        if len(window) > self.steps:
            del window[0]
        self.t += 1
        snapshot_due = self.t % self.steps == 0
        if snapshot_due:
            self.avgs.append(self.value)

    @property
    def value(self):
        """Mean of the current window, or 0 when the window is empty."""
        count = len(self.ns)
        return sum(self.ns) / count if count != 0 else 0

    def __str__(self):
        return "%s=%.4f" % (self.name, self.value)

    # Comparisons are made against the current window mean.
    def __gt__(self, value):
        return self.value > value

    def __lt__(self, value):
        return self.value < value

    def state_dict(self):
        """Return the counters as a dict with immutable (tuple) sequences."""
        return dict(t=self.t, ns=tuple(self.ns), avgs=tuple(self.avgs))

    def load_state_dict(self, state):
        """Restore the counters from a dict produced by ``state_dict``."""
        self.t = state["t"]
        self.ns, self.avgs = list(state["ns"]), list(state["avgs"])


def to_np(x):
    """Return ``x`` unchanged if it is already a numpy array, else its CPU numpy copy."""
    already_numpy = isinstance(x, np.ndarray)
    return x if already_numpy else x.data.cpu().numpy()


def to_pt(np_matrix, enable_cuda=False, type='long'):
    """Convert a numpy array into a torch tensor of the requested kind.

    ``type`` selects ``'long'`` (LongTensor) or ``'float'`` (FloatTensor); any
    other value yields ``None``. With ``enable_cuda`` the tensor is moved to
    the GPU before being wrapped.
    """
    if type == 'long':
        target_type = torch.LongTensor
    elif type == 'float':
        target_type = torch.FloatTensor
    else:
        # Unknown kind: nothing is converted.
        return None
    tensor = torch.from_numpy(np_matrix).type(target_type)
    if enable_cuda:
        tensor = tensor.cuda()
    return torch.autograd.Variable(tensor)


def get_experiment_dir(config,env_id='twcc_easy_level5_gamesize100',info='*'):
    """Build the experiment directory path and hand it to ``maybe_mkdir``.

    The directory name is ``<env_id>_<experiment_tag>_<info>_<timestamp>``
    placed under ``config['general']['experiments_dir']``; the result of
    ``maybe_mkdir`` is returned.
    """
    # env_id = config['general']['env_id']
    general = config['general']
    exps_dir = general['experiments_dir']
    tag_prefix = general['experiment_tag']+'_'+ info + '_'
    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())
    exp_tag = tag_prefix + timestamp
    return maybe_mkdir(pjoin(exps_dir, env_id + "_" + exp_tag))


def dict2list(id2w_dict):
    """Collect the items yielded by iterating ``id2w_dict`` into a new list."""
    return [item for item in id2w_dict]


def _words_to_ids(words, word2id):
    ids = []
    for word in words:
        try:
            ids.append(word2id[word])
        except KeyError:
            ids.append(1)
    return ids


def preproc(s, str_type='None', lower_case=False):
    """Strip game-text boilerplate from ``s`` and tokenize it.

    Args:
        s: raw text from the game.
        str_type: ``'description'`` keeps the text after the first "=-"
            marker; ``'inventory'`` keeps the text after the first "carrying"
            (minus a leading ':'); ``'feedback'`` drops everything up to
            "Welcome to Textworld" and everything from the first "-=".
            Any other value leaves the text as is.
        lower_case: lower-case every token when True.

    Returns:
        list of tokens, or ``["nothing"]`` when no text is left.
    """
    s = s.replace("\n", ' ')
    if s.strip() == "":
        return ["nothing"]
    if str_type == 'description':
        # Keep the whole text when the "=-" marker is absent instead of
        # raising IndexError on the missing second piece.
        pieces = s.split("=-")
        if len(pieces) > 1:
            s = pieces[1]
    elif str_type == 'inventory':
        pieces = s.split("carrying")
        if len(pieces) > 1:
            s = pieces[1]
            # startswith() is safe when nothing follows "carrying";
            # indexing s[0] raised IndexError on the empty remainder.
            if s.startswith(':'):
                s = s[1:]
    elif str_type == 'feedback':
        if "Welcome to Textworld" in s:
            s = s.split("Welcome to Textworld")[1]
        if "-=" in s:
            s = s.split("-=")[0]
    s = s.strip()
    if len(s) == 0:
        return ["nothing"]
    tokens = wt(s)
    if lower_case:
        tokens = [t.lower() for t in tokens]
    return tokens


def max_len(list_of_list):
    """Return the length of the longest element of ``list_of_list``."""
    return max(len(element) for element in list_of_list)


def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.):
    '''
    FROM KERAS
    Bring every sequence to a common length and stack them into one array.

    Without `maxlen` the common length is that of the longest sequence.
    With `maxlen`, longer sequences are cut down to it, dropping items from
    the front (`truncating='pre'`, default) or the back (`'post'`).
    Shorter sequences are filled with `value` in front (`padding='pre'`,
    default) or behind (`'post'`).
    # Arguments
        sequences: list of lists where each element is a sequence
        maxlen: int, maximum length
        dtype: type to cast the resulting sequence.
        padding: 'pre' or 'post', pad either before or after each sequence.
        truncating: 'pre' or 'post', remove values from sequences larger than
            maxlen either in the beginning or in the end of the sequence
        value: float, value to pad the sequences to the desired value.
    # Returns
        x: numpy array with dimensions (number_of_sequences, maxlen)
    '''
    seq_lengths = list(map(len, sequences))
    n_rows = len(sequences)
    if maxlen is None:
        maxlen = np.max(seq_lengths)

    # The per-item shape comes from the first non-empty sequence; every other
    # sequence is checked against it inside the loop.
    first_filled = next((seq for seq in sequences if len(seq) > 0), None)
    item_shape = tuple() if first_filled is None else np.asarray(first_filled).shape[1:]

    padded = (np.ones((n_rows, maxlen) + item_shape) * value).astype(dtype)
    for row, seq in enumerate(sequences):
        if len(seq) == 0:
            continue  # empty sequence: the row keeps the fill value
        if truncating == 'pre':
            kept = seq[-maxlen:]
        elif truncating == 'post':
            kept = seq[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        # make sure the kept part has the expected per-item shape
        kept = np.asarray(kept, dtype=dtype)
        if kept.shape[1:] != item_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (kept.shape[1:], row, item_shape))

        if padding == 'post':
            padded[row, :len(kept)] = kept
        elif padding == 'pre':
            padded[row, -len(kept):] = kept
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return padded


## FastUniGRU

In [None]:

class FastUniGRU(torch.nn.Module):
    """
    Adapted from https://github.com/facebookresearch/DrQA/
    now supports:   different rnn size for each layer
                    all zero rows in batch (from time distributed layer, by reshaping certain dimension)

    Stack of single-layer, unidirectional GRUs run over padded batches.
    """

    def __init__(self, ninp, nhids, dropout_between_rnn_layers=0.):
        """
        ninp:  input feature size of the first layer
        nhids: list with the hidden size of each stacked GRU layer
        dropout_between_rnn_layers: dropout probability applied to the input
            of every layer (0 disables it)
        """
        super(FastUniGRU, self).__init__()
        self.ninp = ninp
        self.nhids = nhids
        self.nlayers = len(self.nhids)
        self.dropout_between_rnn_layers = dropout_between_rnn_layers
        self.stack_rnns()

    def stack_rnns(self):
        """Create one GRU per entry of ``nhids``, chaining their sizes."""
        rnns = [torch.nn.GRU(self.ninp if i == 0 else self.nhids[i - 1],
                              self.nhids[i],
                              num_layers=1,
                              bidirectional=False) for i in range(self.nlayers)]
        self.rnns = torch.nn.ModuleList(rnns)

    def forward(self, x, mask):
        """
        inputs: x:          batch x time x inp
                mask:       batch x time
        output: encoding:   batch x time x hidden[-1]
                last_state: batch x hidden[-1], final hidden state of the last
                            layer (all zeros for rows whose mask is empty)
                mask:       the mask that was passed in
        """

        def pad_(tensor, n):
            # Append n all-zero rows along dim 0, matching dtype and device.
            if n > 0:
                zero_pad = torch.zeros((n,) + tuple(tensor.size()[1:]),
                                       dtype=tensor.dtype, device=tensor.device)
                tensor = torch.cat([tensor, zero_pad])
            return tensor

        # Compute sorted sequence lengths
        batch_size = x.size(0)
        lengths = mask.data.eq(1).long().sum(1)
        _, idx_sort = torch.sort(lengths, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)

        # Plain Python ints: what pack_padded_sequence expects, and cheap to count.
        lengths = lengths[idx_sort].tolist()

        # Rows are sorted by length, so the empty ones sit at the end.
        n_nonzero = sum(1 for length in lengths if length > 0)
        n_zero = batch_size - n_nonzero
        if n_nonzero == 0:
            # Nothing to encode: packing an empty batch would raise.
            output = x.new_zeros(batch_size, mask.size(1), self.nhids[-1])
            last_state = x.new_zeros(batch_size, self.nhids[-1])
            return output, last_state, mask

        # Sort x and drop the empty rows (re-added as zeros further down)
        x = x.index_select(0, idx_sort)
        if n_zero != 0:
            lengths = lengths[:n_nonzero]
            x = x[:n_nonzero]

        # Transpose batch and sequence dims
        x = x.transpose(0, 1)

        # Pack it up
        rnn_input = torch.nn.utils.rnn.pack_padded_sequence(x, lengths)

        # Encode all layers
        last_state = None
        for i in range(self.nlayers):
            # dropout between rnn layers (torch.nn.functional is referenced
            # through `torch` so this cell does not rely on a later `F` import)
            if self.dropout_between_rnn_layers > 0:
                dropout_input = torch.nn.functional.dropout(rnn_input.data,
                                                            p=self.dropout_between_rnn_layers,
                                                            training=self.training)
                rnn_input = torch.nn.utils.rnn.PackedSequence(dropout_input,
                                                              rnn_input.batch_sizes)
            rnn_input, last = self.rnns[i](rnn_input)
            if i == self.nlayers - 1:
                # A GRU returns h_n as a tensor of shape
                # (num_layers * num_directions, batch, hidden), not an (h, c)
                # tuple as an LSTM does, so a single index gives batch x hidden.
                last_state = last[0]

        # Unpack the last layer: time x batch x hidden
        output = torch.nn.utils.rnn.pad_packed_sequence(rnn_input)[0]

        # Transpose and unsort
        output = output.transpose(0, 1)  # batch x time x enc

        # re-padding with the rows that were dropped as empty
        output = pad_(output, n_zero)
        last_state = pad_(last_state, n_zero)

        output = output.index_select(0, idx_unsort)
        last_state = last_state.index_select(0, idx_unsort)

        # Pad up to original batch sequence length
        if output.size(1) != mask.size(1):
            padding = torch.zeros(output.size(0),
                                  mask.size(1) - output.size(1),
                                  output.size(2),
                                  dtype=output.dtype, device=output.device)
            output = torch.cat([output, padding], 1)

        output = output.contiguous() * mask.unsqueeze(-1)
        return output, last_state, mask



## LSTM-DRQN

In [None]:
import logging
import numpy as np

import torch
import torch.nn.functional as F

import sys
sys.path.append('./capstone/')

from helpers.layers import Embedding, masked_mean, LSTMCell, FastUniLSTM

logger = logging.getLogger(__name__)


class LSTM_DQN(torch.nn.Module):
    """Q-network that scores verbs ("actions") and nouns ("objects") for a text observation.

    The observation tokens are embedded, encoded by a FastUniGRU and averaged
    over time; the resulting vector is scored either through a recurrent cell
    (`recurrent_action_scorer`) or through a feed-forward layer (`action_scorer`).

    NOTE(review): the vocabulary size and the numbers of actions/objects are
    hard-coded in `__init__` rather than derived from the arguments — confirm
    they match the vocabulary and the verb/noun maps actually used.
    """
    # Key of this model's section inside `model_config` (see read_config).
    model_name = 'lstm_dqn'

    def __init__(self, model_config, word_vocab, verb_map, noun_map, enable_cuda=False):
        """Store the settings, read the config section, build and initialise the layers.

        `verb_map` and `noun_map` are accepted but not read here; the output
        sizes are the fixed values assigned below.
        """
        super(LSTM_DQN, self).__init__()
        self.model_config = model_config
        self.enable_cuda = enable_cuda
        self.word_vocab_size = 1000  # hard-coded; was len(word_vocab) (author note: could not get the observation space, still to be solved)
        self.id2word = word_vocab
        self.n_actions = 2  # hard-coded; was len(verb_map)
        self.n_objects = 5  # hard-coded; was len(noun_map)
        self.read_config()
        self._def_layers()
        self.init_weights()
        # self.print_parameters()

    def print_parameters(self):
        """Print the total and the trainable (requires_grad) parameter counts."""
        amount = 0
        for p in self.parameters():
            amount += np.prod(p.size())
        print("total number of parameters: %s" % (amount))
        parameters = filter(lambda p: p.requires_grad, self.parameters())
        amount = 0
        for p in parameters:
            amount += np.prod(p.size())
        print("number of trainable parameters: %s" % (amount))

    def read_config(self):
        """Copy the hyper-parameters from `model_config[model_name]` onto the instance."""
        # model config
        config = self.model_config[self.model_name]
        self.embedding_size = config['embedding_size']
        # Indexed with [-1] and passed as `nhids` below, i.e. one size per encoder layer.
        self.encoder_rnn_hidden_size = config['encoder_rnn_hidden_size']
        self.action_scorer_hidden_dim = config['action_scorer_hidden_dim']
        self.dropout_between_rnn_layers = config['dropout_between_rnn_layers']

    def _def_layers(self):
        """Instantiate the embedding, the encoder and the scoring layers."""

        # word embeddings
        self.word_embedding = Embedding(embedding_size=self.embedding_size,
                                        vocab_size=self.word_vocab_size,
                                        enable_cuda=self.enable_cuda)

        # gru encoder
        self.encoder = FastUniGRU(ninp=self.embedding_size,
                                   nhids=self.encoder_rnn_hidden_size,
                                   dropout_between_rnn_layers=self.dropout_between_rnn_layers)

        # Recurrent network for temporal dependencies (a.k.a history).

        self.action_scorer_shared_recurrent = LSTMCell(input_size=self.encoder_rnn_hidden_size[-1],
                                                       hidden_size=self.action_scorer_hidden_dim)

        # Feed-forward alternative to the recurrent cell; the two output heads
        # below are shared by both scoring paths.
        self.action_scorer_shared = torch.nn.Linear(self.encoder_rnn_hidden_size[-1], self.action_scorer_hidden_dim)
        self.action_scorer_action = torch.nn.Linear(self.action_scorer_hidden_dim, self.n_actions, bias=False)
        self.action_scorer_object = torch.nn.Linear(self.action_scorer_hidden_dim, self.n_objects, bias=False)
        # All-ones mask handed to the recurrent cell; built lazily per batch size.
        self.fake_recurrent_mask = None

    def init_weights(self):
        """Xavier-uniform init for the three linear layers; zero bias for the shared one."""
        torch.nn.init.xavier_uniform_(self.action_scorer_shared.weight.data, gain=1)
        torch.nn.init.xavier_uniform_(self.action_scorer_action.weight.data, gain=1)
        torch.nn.init.xavier_uniform_(self.action_scorer_object.weight.data, gain=1)
        self.action_scorer_shared.bias.data.fill_(0)

    def representation_generator(self, _input_words):
        """Encode word ids into one vector per batch row (masked mean over time)."""
        embeddings, mask = self.word_embedding.forward(_input_words)  # batch x time x emb
        encoding_sequence, _, _ = self.encoder.forward(embeddings, mask)  # batch x time x h
        mean_encoding = masked_mean(encoding_sequence, mask)  # batch x h
        return mean_encoding

    def recurrent_action_scorer(self, state_representation, last_hidden=None, last_cell=None):
        """Advance the recurrent cell one step and score actions/objects from its hidden state.

        Returns (action_rank, object_rank, new_h, new_c).
        """
        # state representation: batch x input
        # last hidden / last cell: batch x hid
        # (Re)build the all-ones mask whenever the batch size changes.
        if self.fake_recurrent_mask is None or self.fake_recurrent_mask.size(0) != state_representation.size(0):
            self.fake_recurrent_mask = torch.autograd.Variable(torch.ones(state_representation.size(0),))
            if self.enable_cuda:
                self.fake_recurrent_mask = self.fake_recurrent_mask.cuda()

        new_h, new_c = self.action_scorer_shared_recurrent.forward(state_representation, self.fake_recurrent_mask, last_hidden, last_cell)
        action_rank = self.action_scorer_action.forward(new_h)  # batch x n_action
        object_rank = self.action_scorer_object.forward(new_h)  # batch x n_object
        return action_rank, object_rank, new_h, new_c

    def action_scorer(self, state_representation):
        """Score actions/objects without recurrence: linear layer + ReLU, then the two heads."""
        hidden = self.action_scorer_shared.forward(state_representation)  # batch x hid
        hidden = F.relu(hidden)  # batch x hid
        action_rank = self.action_scorer_action.forward(hidden)  # batch x n_action
        object_rank = self.action_scorer_object.forward(hidden)  # batch x n_object
        return action_rank, object_rank


## RLagent

In [None]:
import nltk
# Download the 'punkt' tokenizer data; presumably required by the
# nltk word_tokenize call used for preprocessing — TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import logging
import numpy as np
from collections import namedtuple
import random
# from matplotlib import pyplot as plt
import torch
import torch.nn.functional as F

# from helpers.model import LSTM_DQN  # using our custom model
from helpers.generic import to_np, to_pt, preproc, _words_to_ids, pad_sequences, max_len
logger = logging.getLogger(__name__)

import gym
import pdb

# One replay-memory record. The replay memories below build these from the
# arguments of push() and read the `is_final` field when sampling sequences.
Transition = namedtuple('Transition', ('observation_id_list', 'v_idx', 'n_idx',
                                       'reward', 'mask', 'done', 'is_final', 'observation_str'))


class ReplayMemory(object):
    """Ring buffer of Transition records with sequence sampling for DRQN."""

    def __init__(self, capacity=100000):
        # vanilla replay memory
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        slot = self.position
        # Grow the buffer until it reaches capacity, then overwrite in place.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[slot] = Transition(*args)
        self.position = (slot + 1) % self.capacity

    # DRQN: sample windows of consecutive transitions (history)
    def get_batch(self, batch_size, history_size):
        """Sample up to ``batch_size`` windows of ``history_size`` consecutive records.

        Returns a list (history_size) of lists (batch) of records, or ``None``
        when the memory is too small or no valid window was found.
        """
        if len(self.memory) <= history_size:
            return None
        lookback = history_size - 1
        windows = []
        draws_left = 499  # the search gives up after this many random draws
        while len(windows) < batch_size and draws_left > 0:
            draws_left -= 1
            idx = np.random.randint(lookback, len(self.memory) - 1)
            window = self.memory[idx - lookback: idx + 1]
            # only last frame can be (is_final == True)
            if np.any([item.is_final for item in window[:-1]]):
                continue
            windows.append(window)

        if len(windows) == 0:
            return None
        # transpose: list (history size) of list (batch) of tuples
        return [list(step) for step in zip(*windows)]

    def sample(self, batch_size):
        """Return ``batch_size`` records drawn without replacement."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


class PrioritizedReplayMemory(object):
    """Two ring buffers: "alpha" for prioritized records, "beta" for the rest.

    ``priority_fraction`` sets both the share of the capacity given to the
    alpha buffer and the share of each sampled batch drawn from it.
    """

    def __init__(self, capacity=100000, priority_fraction=0.0):
        # prioritized replay memory
        self.priority_fraction = priority_fraction
        self.alpha_capacity = int(capacity * priority_fraction)
        self.beta_capacity = capacity - self.alpha_capacity
        self.alpha_memory, self.beta_memory = [], []
        self.alpha_position, self.beta_position = 0, 0

    def push(self, is_prior=False, *args):
        """Saves a transition."""
        if is_prior:
            store, limit, slot = self.alpha_memory, self.alpha_capacity, self.alpha_position
        else:
            store, limit, slot = self.beta_memory, self.beta_capacity, self.beta_position
        # Grow the chosen buffer until it reaches its capacity, then overwrite.
        if len(store) < limit:
            store.append(None)
        store[slot] = Transition(*args)
        next_slot = (slot + 1) % limit
        if is_prior:
            self.alpha_position = next_slot
        else:
            self.beta_position = next_slot

    # DRQN: sample windows of consecutive transitions from one buffer
    def _get_batch(self, batch_size, history_size, which_memory):
        if len(which_memory) <= history_size:
            return None
        lookback = history_size - 1
        windows = []
        draws_left = 499  # the search gives up after this many random draws
        while len(windows) < batch_size and draws_left > 0:
            draws_left -= 1
            idx = np.random.randint(lookback, len(which_memory) - 1)
            window = which_memory[idx - lookback: idx + 1]
            # only last frame can be (is_final == True)
            if np.any([item.is_final for item in window[:-1]]):
                continue
            windows.append(window)

        return windows if len(windows) != 0 else None

    # DRQN: mix windows from both buffers
    def get_batch(self, batch_size, history_size):
        """Sample windows from both buffers, shuffle them and transpose.

        Returns a list (history_size) of lists (batch) of records, or ``None``
        when neither buffer produced a window.
        """
        prior_quota = int(self.priority_fraction * batch_size)
        from_alpha = min(prior_quota, len(self.alpha_memory))
        from_beta = min(batch_size - prior_quota, len(self.beta_memory))
        res_alpha = self._get_batch(from_alpha, history_size, self.alpha_memory)
        res_beta = self._get_batch(from_beta, history_size, self.beta_memory)
        if res_alpha is None and res_beta is None:
            return None
        windows = []
        for part in (res_alpha, res_beta):
            if part is not None:
                windows.extend(part)
        random.shuffle(windows)
        # transpose: list (history size) of list (batch) of tuples
        return [list(step) for step in zip(*windows)]

    def __len__(self):
        return len(self.alpha_memory) + len(self.beta_memory)


class ObservationHistoryCache(object):
    """Holds the last ``capacity`` batched observations, oldest first."""

    def __init__(self, capacity=1):
        # fixed number of slots, all empty to begin with
        self.capacity = capacity
        self.reset()

    def push(self, stuff):
        """stuff is list."""
        # Write the newest entry into the last slot after moving every
        # other slot one step towards the front.
        last = self.capacity - 1
        for slot in range(last):
            self.memory[slot] = self.memory[slot + 1]
        self.memory[-1] = stuff

    def get_all(self):
        """Concatenate, per batch row, the cached entries from oldest to newest."""
        n_rows = len(self.memory[-1])
        merged_rows = []
        for row in range(n_rows):
            merged = []
            for slot in range(self.capacity):
                frame = self.memory[slot]
                if frame == []:
                    continue  # slot not filled yet
                merged += frame[row]
            merged_rows.append(merged)
        return merged_rows

    def reset(self):
        """Empty every slot."""
        self.memory = [[] for _ in range(self.capacity)]

    def __len__(self):
        return len(self.memory)


class RLAgent(object):
    def __init__(self, config, word_vocab, verb_map, noun_map, replay_memory_capacity=100000, replay_memory_priority_fraction=0.0, load_pretrained=False):
        """Build the model, the replay memory and the observation cache from ``config``.

        A prioritized replay memory is used when
        ``replay_memory_priority_fraction`` is positive, a plain one otherwise.
        With ``load_pretrained`` the model is replaced by the one stored at
        ``config["model"]['global']['pretrained_model_save_path']``.
        """
        # print('Creating RL agent...')
        self.use_dropout_exploration = True  # TODO: move to config.
        self.config = config
        general_cfg = config['general']
        self.use_cuda = general_cfg['use_cuda']
        self.word_vocab, self.verb_map, self.noun_map = word_vocab, verb_map, noun_map
        # word -> index in word_vocab
        self.word2id = {word: index for index, word in enumerate(word_vocab)}

        model_cfg = config["model"]
        self.model = LSTM_DQN(model_config=model_cfg,
                              word_vocab=self.word_vocab,
                              verb_map=verb_map,
                              noun_map=noun_map,
                              enable_cuda=self.use_cuda)
        self.action_scorer_hidden_dim = config['model']['lstm_dqn']['action_scorer_hidden_dim']
        if load_pretrained:
            self.load_pretrained_model(config["model"]['global']['pretrained_model_save_path'])
        if self.use_cuda:
            self.model.cuda()

        wants_priority = replay_memory_priority_fraction > 0.0
        if wants_priority:
            self.replay_memory = PrioritizedReplayMemory(replay_memory_capacity, priority_fraction=replay_memory_priority_fraction)
        else:
            self.replay_memory = ReplayMemory(replay_memory_capacity)

        self.observation_cache_capacity = config['general']['observation_cache_capacity']
        self.observation_cache = ObservationHistoryCache(self.observation_cache_capacity)

    def load_pretrained_model(self, load_from):
        # load model, if there is any
        print("loading best model------------------------------------------------------------------\n")
        try:
            save_f = open(load_from, 'rb')
            self.model = torch.load(save_f)
        except:
            print("failed...lol")

    def reset(self, infos):
        self.rewards = []
        self.dones = []
        self.intermediate_rewards = []
        self.revisit_counting_rewards = []
        self.observation_cache.reset()

    def get_chosen_strings(self, v_idx, n_idx):
        """Turn verb/noun indices into "<verb> <noun>" command strings, one per batch row."""
        verb_choices = to_np(v_idx)
        noun_choices = to_np(n_idx)
        commands = []
        for row in range(noun_choices.shape[0]):
            # index -> vocabulary id through the maps, then id -> word
            verb_id, noun_id = self.verb_map[verb_choices[row]], self.noun_map[noun_choices[row]]
            command = self.word_vocab[verb_id] + " " + self.word_vocab[noun_id]
            commands.append(command)
        return commands

    def choose_random_command(self, verb_rank, noun_rank):
        """Pick a uniformly random verb and noun per batch row.

        Returns (v_qvalue, v_idx, n_qvalue, n_idx): the ranks at the picked
        positions and the picked indices converted with ``to_pt``.
        """
        batch_size = verb_rank.size(0)
        verb_np, noun_np = to_np(verb_rank), to_np(noun_rank)

        v_idx, n_idx = [], []
        for row in range(batch_size):
            # verb first, then noun, for every row
            v_idx.append(np.random.choice(len(verb_np[row]), 1)[0])
            n_idx.append(np.random.choice(len(noun_np[row]), 1)[0])

        v_qvalue = [verb_rank[row][v_idx[row]] for row in range(batch_size)]
        n_qvalue = [noun_rank[row][n_idx[row]] for row in range(batch_size)]
        v_qvalue = torch.stack(v_qvalue)
        n_qvalue = torch.stack(n_qvalue)

        v_idx = to_pt(np.array(v_idx), self.use_cuda)
        n_idx = to_pt(np.array(n_idx), self.use_cuda)
        return v_qvalue, v_idx, n_qvalue, n_idx

    def choose_maxQ_command(self, verb_rank, noun_rank):
        """Pick the highest-ranked verb and noun per batch row.

        Returns (v_qvalue, v_idx, n_qvalue, n_idx): the ranks at the picked
        positions and the picked indices converted with ``to_pt``.
        """
        batch_size = verb_rank.size(0)
        best_verbs = np.argmax(to_np(verb_rank), -1)
        best_nouns = np.argmax(to_np(noun_rank), -1)

        v_qvalue = [verb_rank[row][best_verbs[row]] for row in range(batch_size)]
        n_qvalue = [noun_rank[row][best_nouns[row]] for row in range(batch_size)]
        v_qvalue = torch.stack(v_qvalue)
        n_qvalue = torch.stack(n_qvalue)

        v_idx = to_pt(best_verbs, self.use_cuda)
        n_idx = to_pt(best_nouns, self.use_cuda)
        return v_qvalue, v_idx, n_qvalue, n_idx

    def get_ranks(self, input_description,prev_hidden=None, prev_cell=None):

        state_representation = self.model.representation_generator(input_description)
        verb_rank, noun_rank, curr_hidden, curr_cell = self.model.recurrent_action_scorer(state_representation, prev_hidden, prev_cell) # batch x n_verb, batch x n_noun
        # batch x n_action   # batch x n_object
        return verb_rank, noun_rank, curr_hidden, curr_cell

    # Generate an action from the observation (max-Q, with epsilon-greedy exploration)
    def generate_one_command(self, input_description, prev_hidden=None, prev_cell=None, epsilon=0.2):
        """Choose one command per batch row, epsilon-greedily.

        Each row takes the random choice with probability ``epsilon`` and the
        max-Q choice otherwise.

        Returns (v_idx, n_idx, chosen_strings, curr_hidden, curr_cell); the
        indices and the recurrent state are detached.
        """
        # batch x n_verb, batch x n_noun
        verb_rank, noun_rank, curr_hidden, curr_cell = self.get_ranks(input_description, prev_hidden, prev_cell)
        curr_hidden, curr_cell = curr_hidden.detach(), curr_cell.detach()

        # Greedy pick first, random pick second; only the indices are used.
        _, v_idx_maxq, _, n_idx_maxq = self.choose_maxQ_command(verb_rank, noun_rank)
        _, v_idx_random, _, n_idx_random = self.choose_random_command(verb_rank, noun_rank)

        # random number for epsilon greedy: 1 where the row explores, 0 otherwise
        draws = np.random.uniform(low=0.0, high=1.0, size=(input_description.size(0),))
        explore = (draws < epsilon).astype("float32")  # batch
        exploit = 1.0 - explore
        explore = to_pt(explore, self.use_cuda, type='float').long()
        exploit = to_pt(exploit, self.use_cuda, type='float').long()

        # The two 0/1 masks are complementary, so each row keeps exactly one pick.
        v_idx = (explore * v_idx_random + exploit * v_idx_maxq).detach()
        n_idx = (explore * n_idx_random + exploit * n_idx_maxq).detach()

        # print(v_idx)
        # convert the indices to command strings
        chosen_strings = self.get_chosen_strings(v_idx, n_idx)

        return v_idx, n_idx, chosen_strings, curr_hidden, curr_cell

    def get_game_step_info(self, ob, infos, prev_actions=None):
        """Turn the current game texts into a padded id tensor for the model.

        Per game, the ids of description, inventory, objective and (optional)
        previous action are concatenated, pushed into the observation cache
        and merged with the cached history.

        Args:
            ob: raw observations (not read here).
            infos: dict of batched lists; the keys "inventory", "feedback",
                "objective" and "description" are read.
            prev_actions: list of previous command strings, or None.

        Returns:
            (input_description, description_with_history_id_list): the padded
            id tensor from ``to_pt`` and the un-padded id lists it was built from.
        """
        def encode(strings, str_type):
            # tokenize (lower-cased) and map the tokens to ids, one list per game
            return [_words_to_ids(preproc(item, str_type=str_type, lower_case=True), self.word2id)
                    for item in strings]

        # concat d/i/q/pf together as one string
        inventory_strings = infos["inventory"]
        inventory_id_list = encode(inventory_strings, 'inventory')

        # NOTE(review): the feedback ids are computed but not part of the
        # concatenation below — confirm whether feedback should be included.
        feedback_id_list = encode(infos["feedback"], 'feedback')

        quest_id_list = encode(infos["objective"], 'None')

        if prev_actions is not None:
            prev_action_id_list = encode(prev_actions, 'None')
        else:
            # One empty list per GAME. Iterating `infos` itself would yield
            # the dict's keys, and the zip() below would then cut the batch
            # down to the number of keys.
            prev_action_id_list = [[] for _ in inventory_strings]

        description_token_list = [preproc(item, str_type='description', lower_case=True) for item in infos["description"]]
        for i, d in enumerate(description_token_list):
            if len(d) == 0:
                description_token_list[i] = ["end"]  # hack here, if empty description, insert word "end"
        description_id_list = [_words_to_ids(tokens, self.word2id) for tokens in description_token_list]
        description_id_list = [_d + _i + _q + _pa for (_d, _i, _q, _pa) in zip(description_id_list, inventory_id_list, quest_id_list, prev_action_id_list)]

        self.observation_cache.push(description_id_list)
        description_with_history_id_list = self.observation_cache.get_all()

        input_description = pad_sequences(description_with_history_id_list, maxlen=max_len(description_with_history_id_list), padding='post').astype('int32')
        input_description = to_pt(input_description, self.use_cuda)
        return input_description, description_with_history_id_list

    def get_observation_strings(self, infos):
        """Build one state-identifying string per game in the batch.

        Each string is the constant tag 'cc' followed by the game's room
        description and then its inventory text.

        Args:
            infos: mapping holding per-game sequences under the keys
                "game", "inventory" and "description".

        Returns:
            A list of concatenated strings, one per game (truncated to the
            shortest of the three sequences).
        """
        tag = 'cc'  # constant prefix standing in for a per-game identifier
        games = infos['game']
        inventories = infos["inventory"]
        descriptions = infos["description"]
        return [tag + desc + inv for _, desc, inv in zip(games, descriptions, inventories)]

    def compute_reward(self, revisit_counting_lambda=0.0, revisit_counting=True):
        """Assemble reward and mask vectors for the most recent game step.

        Args:
            revisit_counting_lambda: weight applied to the latest counting
                bonus before it is added to the environment reward.
            revisit_counting: when False the counting bonus is ignored.

        Returns:
            Tuple ``(rewards, rewards_pt, mask, mask_pt, memory_mask)``:
            float32 numpy rewards and mask, their ``to_pt`` counterparts, and
            a plain list ``memory_mask`` of 1.0/0.0 flags.
        """
        n_steps = len(self.dones)
        # A game is masked out (0.0) once it was already done on the previous step.
        if n_steps == 1:
            mask = [1.0] * len(self.dones[-1])
        else:
            assert n_steps > 1
            done_one_step_ago = self.dones[-2]
            mask = [0.0 if done_one_step_ago[i] else 1.0 for i in range(len(self.dones[-1]))]
        mask = np.array(mask, dtype='float32')
        mask_pt = to_pt(mask, self.use_cuda, type='float')

        # self.rewards is a list over steps of per-game rewards; use the last step.
        rewards = np.array(self.rewards[-1], dtype='float32')  # batch
        if revisit_counting and len(self.revisit_counting_rewards) > 0:
            latest_bonus = np.array(self.revisit_counting_rewards[-1], dtype='float32')
            rewards += latest_bonus * revisit_counting_lambda
        rewards_pt = to_pt(rewards, self.use_cuda, type='float')

        # Memory mask: keep one extra step after a game finishes.
        if n_steps < 3:
            memory_mask = [1.0 for _ in self.dones[-1]]
        else:
            memory_mask = []
            for i in range(len(self.dones[-1])):
                keep = mask[i] == 1 or ((not self.dones[-3][i]) and self.dones[-2][i])
                memory_mask.append(1.0 if keep else 0.0)

        return rewards, rewards_pt, mask, mask_pt, memory_mask

    def update(self, replay_batch_size, history_size, update_from=0, discount_gamma=0.0):
        """Compute the DRQN training loss on a batch of replayed sequences.

        ``history_size + 1`` consecutive transitions per batch item are drawn
        from replay memory and the recurrent action scorer is unrolled over
        them. For every step ``i > update_from`` the Q-value of the action
        taken at step ``i - 1`` is regressed (smooth L1) onto
        ``reward + discount_gamma * max Q`` of step ``i``. The last transition
        is only used to provide that bootstrap value.

        Args:
            replay_batch_size: number of sequences requested from memory.
            history_size: number of steps per sequence (one extra is fetched).
            update_from: steps with an index below this value have their
                Q-values and recurrent state detached from the graph.
            discount_gamma: discount applied to the bootstrapped max Q-value.

        Returns:
            The mean of the per-step losses as a tensor, or ``None`` when
            there is nothing to train on: replay memory holds fewer than
            ``replay_batch_size`` items, ``get_batch`` returned ``None``, or
            no step produced a loss.
        """

        if len(self.replay_memory) < replay_batch_size:
            return None
        transitions = self.replay_memory.get_batch(replay_batch_size, history_size + 1)  # list (history_size + 1) of list (batch) of tuples
        # last transitions is just for computing the last Q function
        if transitions is None:
            return None
        sequences = [Transition(*zip(*batch)) for batch in transitions]

        losses = []
        prev_ras_hidden, prev_ras_cell = None, None  # ras: recurrent action scorer

        # Step 0: start the recurrent state and get the Q-value of the stored action.
        observation_id_list = pad_sequences(sequences[0].observation_id_list, maxlen=max_len(sequences[0].observation_id_list), padding='post').astype('int32')
        input_observation = to_pt(observation_id_list, self.use_cuda)
        v_idx = torch.stack(sequences[0].v_idx, 0)  # batch x 1
        n_idx = torch.stack(sequences[0].n_idx, 0)  # batch x 1
        verb_rank, noun_rank, curr_ras_hidden, curr_ras_cell = self.get_ranks(input_observation, prev_ras_hidden, prev_ras_cell)
        v_qvalue, n_qvalue = verb_rank.gather(1, v_idx.unsqueeze(-1)).squeeze(-1), noun_rank.gather(1, n_idx.unsqueeze(-1)).squeeze(-1)  # batch
        prev_qvalue = torch.mean(torch.stack([v_qvalue, n_qvalue], -1), -1)  # batch
        if update_from > 0:
            # Step 0 is not trained on: cut it out of the graph.
            prev_qvalue, curr_ras_hidden, curr_ras_cell = prev_qvalue.detach(), curr_ras_hidden.detach(), curr_ras_cell.detach()

        for i in range(1, len(sequences)):
            observation_id_list = pad_sequences(sequences[i].observation_id_list, maxlen=max_len(sequences[i].observation_id_list), padding='post').astype('int32')
            input_observation = to_pt(observation_id_list, self.use_cuda)
            v_idx = torch.stack(sequences[i].v_idx, 0)  # batch x 1
            n_idx = torch.stack(sequences[i].n_idx, 0)  # batch x 1

            verb_rank, noun_rank, curr_ras_hidden, curr_ras_cell = self.get_ranks(input_observation, curr_ras_hidden, curr_ras_cell)
            # Bootstrap target: best Q-value at this step, never back-propagated through.
            v_qvalue_max, _, n_qvalue_max, _ = self.choose_maxQ_command(verb_rank, noun_rank)
            q_value_max = torch.mean(torch.stack([v_qvalue_max, n_qvalue_max], -1), -1)  # batch
            q_value_max = q_value_max.detach()
            # Q-value of the action that was actually stored in memory.
            v_qvalue, n_qvalue = verb_rank.gather(1, v_idx.unsqueeze(-1)).squeeze(-1), noun_rank.gather(1, n_idx.unsqueeze(-1)).squeeze(-1)  # batch
            q_value = torch.mean(torch.stack([v_qvalue, n_qvalue], -1), -1)  # batch
            if i < update_from or i == len(sequences) - 1:
                # Untrained early steps and the bootstrap-only last step carry no gradient.
                q_value, curr_ras_hidden, curr_ras_cell = q_value.detach(), curr_ras_hidden.detach(), curr_ras_cell.detach()
            if i > update_from:
                prev_rewards = torch.stack(sequences[i - 1].reward)  # batch
                prev_not_done = 1.0 - np.array(sequences[i - 1].done, dtype='float32')  # batch
                prev_not_done = to_pt(prev_not_done, self.use_cuda, type='float')
                # TD target: no bootstrapping once the previous step ended the game.
                prev_rewards = prev_rewards + prev_not_done * q_value_max * discount_gamma  # batch
                prev_mask = torch.stack(sequences[i - 1].mask)  # batch
                prev_loss = F.smooth_l1_loss(prev_qvalue * prev_mask, prev_rewards * prev_mask)
                losses.append(prev_loss)
            prev_qvalue = q_value

        if not losses:
            # No step satisfied ``i > update_from`` (e.g. update_from >= history_size).
            # torch.stack([]) would raise, so report "nothing to train on" instead.
            return None
        return torch.stack(losses).mean()

    def finish(self):
        """Derive end-of-episode statistics used for logging.

        Reads the per-step records (``rewards``, ``revisit_counting_rewards``,
        ``dones``, ``intermediate_rewards``) and stores ``final_rewards``,
        ``final_counting_rewards``, ``step_used_before_done`` and
        ``final_intermediate_rewards`` on the agent. Returns nothing.
        """
        # Reward of the last step and counting bonus summed over all steps.
        self.final_rewards = np.array(self.rewards[-1], dtype='float32')  # batch
        self.final_counting_rewards = np.array(self.revisit_counting_rewards).sum(0)  # batch

        # step x batch matrix of 0/1 "done" flags.
        done_flags = np.array([
            np.array([float(flag) for flag in step_flags], dtype='float32')
            for step_flags in self.dones
        ])
        # Number of steps on which each game was not yet done.
        self.step_used_before_done = np.sum(1.0 - done_flags, 0)  # batch

        # Sum intermediate rewards per game up to and including the finishing step.
        per_game = np.transpose(np.array(self.intermediate_rewards), (1, 0))  # batch x step
        totals = []
        for game_idx in range(per_game.shape[0]):
            last_step = int(self.step_used_before_done[game_idx]) + 1
            totals.append(np.sum(per_game[game_idx][:last_step]))
        self.final_intermediate_rewards = np.array(totals)

    def reset_binarized_counter(self, batch_size):
        """Replace the visit counters with ``batch_size`` fresh empty dicts."""
        fresh_counters = []
        for _ in range(batch_size):
            fresh_counters.append({})
        self.binarized_counter_dict = fresh_counters

    def get_binarized_count(self, observation_strings, update=True):
        """Return a 1.0/0.0 first-visit bonus for each game's current state.

        Args:
            observation_strings: one state string per game, in batch order.
            update: when True the visit count of each state is incremented
                before the bonus is computed.

        Returns:
            List of floats: 1.0 where the state's count equals exactly one,
            otherwise 0.0. Unseen states are registered with a count of 0.0
            even when ``update`` is False.
        """
        bonuses = []
        for game_idx, state_key in enumerate(observation_strings):
            visits = self.binarized_counter_dict[game_idx]
            visits.setdefault(state_key, 0.0)
            if update:
                visits[state_key] += 1.0
            bonuses.append(float(visits[state_key] == 1.0))
        return bonuses

    def state_dict(self):
        """Return a dict with the model's state under the key 'model'."""
        snapshot = dict()
        snapshot['model'] = self.model.state_dict()
        # The optimizer state is not part of this snapshot.
        return snapshot

    def load_state_dict(self, state):
        """Restore the model from ``state['model']``; other keys are ignored."""
        model_state = state['model']
        self.model.load_state_dict(model_state)
        # The optimizer state is not restored here.


## preparation

In [None]:
import sys
# sys.path.append('/content/drive/My Drive/capstone/PaperCode/TextWorld-Coin-Collector-master/gym_textworld')
from tensorboardX import SummaryWriter
import gym
# import gym_textworld   # Register all textworld environments.
import textworld
import textworld.gym

print('Setting up TextWorld environment...')

# Extra fields TextWorld is asked to return with every observation.
infos_to_request = textworld.EnvInfos(
    admissible_commands=True,
    command_templates=True,
    description=True,
    feedback=True,
    game=True,
    intermediate_reward=True,
    inventory=True,
    max_score=True,
    objective=True,
    policy_commands=True,
    score=True,
    won=True,
)

# Training games: only the first generated game file is used.
train_games_dir = '/content/drive/My Drive/capstone/tw_games/'
gamefiles = [train_games_dir + game_name for game_name in games_files[:1]]
batch_size = len(gamefiles)

# Register the games as one batched Gym environment, 50 steps per episode.
env_id = textworld.gym.register_games(
    gamefiles,
    batch_size=batch_size,
    asynchronous=True,
    request_infos=infos_to_request,
    max_episode_steps=50,
)

env = gym.make(env_id)
env.seed(config['general']['random_seed'])
env.reset()


# Held-out evaluation environment (built only when run_test is enabled).
run_test = True
if run_test:
    # Same set of extra fields as requested for the training environment.
    requested_fields = dict(
        admissible_commands=True,
        command_templates=True,
        description=True,
        feedback=True,
        game=True,
        intermediate_reward=True,
        inventory=True,
        max_score=True,
        objective=True,
        policy_commands=True,
        score=True,
        won=True,
    )
    infos_to_request = textworld.EnvInfos(**requested_fields)

    test_games_dir = '/content/drive/My Drive/capstone/tw_games_test/'
    gamefiles_test = [test_games_dir + game_name for game_name in TEST_games_files]
    batch_size_test = len(gamefiles_test)

    # Register all test games as one batch, 200 steps per episode.
    env_id_test = textworld.gym.register_games(
        gamefiles_test,
        batch_size=batch_size_test,
        asynchronous=True,
        request_infos=infos_to_request,
        max_episode_steps=200,
    )

    # Create a Gym environment to play the text games.
    env_test = gym.make(env_id_test)
    env_test.seed(config['general']['random_seed'])
    env_test.reset()

print('Finished TextWorld environment...')
# Seed numpy and torch manually for reproducibility.
np.random.seed(config['general']['random_seed'])
torch.manual_seed(config['general']['random_seed'])
if torch.cuda.is_available():
    if not config['general']['use_cuda']:
        # The settings are loaded from config_drqn.yaml, so name that file here.
        logger.warning("WARNING: CUDA device detected but 'use_cuda: false' found in config_drqn.yaml")
    else:
        # CUDA is available and enabled in the config.
        print("using cuda")
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed(config['general']['random_seed'])
else:
    # No CUDA device: override whatever the config asked for.
    config['general']['use_cuda'] = False


# DRQN settings read from the 'general' section of the config.
general_cfg = config['general']
history_size = general_cfg['history_size']
update_from = general_cfg['update_from']

revisit_counting = general_cfg['revisit_counting']  # original note: True
replay_batch_size = general_cfg['replay_batch_size']  # original note: 32
replay_memory_capacity = general_cfg['replay_memory_capacity']  # original note: 500000
replay_memory_priority_fraction = general_cfg['replay_memory_priority_fraction']  # original note: 0.25; 0.0 disables it

# Vocabulary: sorted words extracted from the training game files.
import textworld.text_utils
vocab = sorted(textworld.text_utils.extract_vocab_from_gamefiles(gamefiles))
word2id = {word: index for index, word in enumerate(vocab)}
word_vocab = dict2list(word2id)

# Ids of the verbs and nouns commands are built from; words missing from
# the vocabulary are skipped.
verb_list = ["go", "take"]
object_name_list = ["east", "west", "north", "south", "coin"]
verb_map = [word2id[verb] for verb in verb_list if verb in word2id]
noun_map = [word2id[noun] for noun in object_name_list if noun in word2id]

agent = RLAgent(
    config,
    word_vocab,
    verb_map,
    noun_map,
    replay_memory_capacity=replay_memory_capacity,
    replay_memory_priority_fraction=replay_memory_priority_fraction,
)

# Summary writer pointed at this run's experiment directory.
exp_dir = get_experiment_dir(config, 'twcc_easy_level10_gamesize100', 'GRU')
summary = SummaryWriter(exp_dir)


init_learning_rate = config['training']['optimizer']['learning_rate']

# Build the optimizer named by the config over the trainable parameters only.
# A list (rather than a one-shot filter object) keeps `parameters` reusable.
parameters = [p for p in agent.model.parameters() if p.requires_grad]
step_rule = config['training']['optimizer']['step_rule']
if step_rule == 'sgd':
    optimizer = torch.optim.SGD(parameters, lr=init_learning_rate)
elif step_rule == 'adam':
    optimizer = torch.optim.Adam(parameters, lr=init_learning_rate)
else:
    # Fail here instead of with a NameError on `optimizer` during training.
    raise ValueError(
        "Unsupported optimizer step_rule {!r}; expected 'sgd' or 'adam'".format(step_rule))


log_every = 100
# Sliding-window trackers over the last `log_every` additions.
reward_avg, step_avg, loss_avg = [
    SlidingAverage(label, steps=log_every)
    for label in ('reward avg', 'step avg', 'loss avg')
]

# Best-so-far trackers.
best_avg_reward, best_avg_step = -10000, 10000

general_cfg = config['general']

# Discount factor and whether the previous command is fed back as input.
discount_gamma = general_cfg['discount_gamma']
provide_prev_action = general_cfg['provide_prev_action']

# Epsilon-greedy schedule.
epsilon_anneal_epochs = general_cfg['epsilon_anneal_epochs']
epsilon_anneal_from = general_cfg['epsilon_anneal_from']
epsilon_anneal_to = general_cfg['epsilon_anneal_to']

# Counting-reward weight schedule.
revisit_counting_lambda_anneal_epochs = general_cfg['revisit_counting_lambda_anneal_epochs']
revisit_counting_lambda_anneal_from = general_cfg['revisit_counting_lambda_anneal_from']
revisit_counting_lambda_anneal_to = general_cfg['revisit_counting_lambda_anneal_to']

# Both schedules start at their "from" value.
epsilon = epsilon_anneal_from
revisit_counting_lambda = revisit_counting_lambda_anneal_from

# Step count a checkpoint has to beat in the training cell.
bestStep = 200




Setting up TextWorld environment...
Fished TextWorld environment...
using cuda


## train

In [None]:
import pdb

# Main training loop. Each iteration plays one batched episode, trains from
# replay memory every `update_per_k_game_steps` game steps, pushes the
# episode's transitions into replay memory, and periodically logs,
# evaluates and checkpoints.
for epoch in range(1, config['training']['scheduling']['epoch']):

    agent.model.train()

    obs, infos = env.reset()
    obs = list(obs)
    agent.reset(infos)
    # Per-game traces of this episode, kept for ad-hoc debugging.
    print_command_string, print_rewards = [[] for _ in range(batch_size)], [[] for _ in range(batch_size)]
    print_interm_rewards = [[] for _ in range(batch_size)]
    print_rc_rewards = [[] for _ in range(batch_size)]

    dones = [False] * batch_size
    rewards = None
    avg_loss_in_this_game = []

    new_observation_strings = agent.get_observation_strings(infos)

    if revisit_counting:
        agent.reset_binarized_counter(batch_size)
        # Count the initial state as visited.
        revisit_counting_rewards = agent.get_binarized_count(new_observation_strings)

    current_game_step = 0
    prev_actions = ["" for _ in range(batch_size)] if provide_prev_action else None

    input_description, description_id_list = agent.get_game_step_info(obs, infos, prev_actions)
    curr_ras_hidden, curr_ras_cell = None, None  # ras: recurrent action scorer

    # Transitions are buffered per game and pushed to replay memory after the episode.
    memory_cache = [[] for _ in range(batch_size)]
    solved = [0 for _ in range(batch_size)]

    while not all(dones):
        agent.model.train()
        v_idx, n_idx, chosen_strings, curr_ras_hidden, curr_ras_cell = agent.generate_one_command(input_description, curr_ras_hidden, curr_ras_cell, epsilon=epsilon)

        obs, rewards, dones, infos = env.step(chosen_strings)
        obs = list(obs)
        rewards = list(rewards)
        dones = list(dones)

        curr_observation_strings = agent.get_observation_strings(infos)

        if provide_prev_action:
            prev_actions = chosen_strings
        # counting
        if revisit_counting:
            revisit_counting_rewards = agent.get_binarized_count(curr_observation_strings, update=True)
        else:
            revisit_counting_rewards = [0.0 for _ in range(batch_size)]
        agent.revisit_counting_rewards.append(revisit_counting_rewards)
        revisit_counting_rewards = [float(format(item, ".3f")) for item in revisit_counting_rewards]

        for i in range(len(infos['game'])):
            print_command_string[i].append(chosen_strings[i])
            print_rewards[i].append(rewards[i])
            print_interm_rewards[i].append(infos["intermediate_reward"][i])
            print_rc_rewards[i].append(revisit_counting_rewards[i])
        agent.rewards.append(rewards)
        agent.dones.append(dones)
        agent.intermediate_rewards.append([info for info in infos["intermediate_reward"]])

        # compute rewards, and push into replay memory
        rewards_np, rewards_pt, mask_np, mask_pt, memory_mask = agent.compute_reward(revisit_counting_lambda=revisit_counting_lambda, revisit_counting=revisit_counting)

        # The transition being stored pairs the observation the command was
        # chosen from with the outcome just returned by the environment.
        curr_description_id_list = description_id_list
        input_description, description_id_list = agent.get_game_step_info(obs, infos, prev_actions)

        for b in range(batch_size):
            if memory_mask[b] == 0:
                continue
            if dones[b] == 1 and rewards[b] == 0:
                # last possible step
                is_final = True
            else:
                is_final = mask_np[b] == 0
            if rewards[b] > 0.0:
                solved[b] = 1
            # replay memory
            memory_cache[b].append((curr_description_id_list[b], v_idx[b], n_idx[b], rewards_pt[b], mask_pt[b], dones[b], is_final, curr_observation_strings[b]))

        if current_game_step > 0 and current_game_step % config["general"]["update_per_k_game_steps"] == 0:
            policy_loss = agent.update(replay_batch_size, history_size, update_from, discount_gamma=discount_gamma)
            # A None loss means there was nothing to train on. Skip the
            # optimisation but still fall through to the step counter below;
            # a `continue` here would freeze `current_game_step`.
            if policy_loss is not None:
                loss = policy_loss
                # Backpropagate
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                torch.nn.utils.clip_grad_norm_(agent.model.parameters(), config['training']['optimizer']['clip_grad_norm'])
                optimizer.step()  # apply gradients
                avg_loss_in_this_game.append(to_np(policy_loss))
        current_game_step += 1

    for i, mc in enumerate(memory_cache):
        for item in mc:
            if replay_memory_priority_fraction == 0.0:
                # vanilla replay memory
                agent.replay_memory.push(*item)
            else:
                # prioritized replay memory
                agent.replay_memory.push(solved[i], *item)

    agent.finish()
    # NOTE(review): when no update ran this episode the list is empty and
    # np.mean yields NaN (with a RuntimeWarning), which is then fed into
    # loss_avg. Left unchanged; confirm how SlidingAverage should treat it.
    avg_loss_in_this_game = np.mean(avg_loss_in_this_game)
    reward_avg.add(agent.final_rewards.mean())
    step_avg.add(agent.step_used_before_done.mean())
    loss_avg.add(avg_loss_in_this_game)
    # annealing
    if epoch < epsilon_anneal_epochs:
        epsilon -= (epsilon_anneal_from - epsilon_anneal_to) / float(epsilon_anneal_epochs)
    if epoch < revisit_counting_lambda_anneal_epochs:
        revisit_counting_lambda -= (revisit_counting_lambda_anneal_from - revisit_counting_lambda_anneal_to) / float(revisit_counting_lambda_anneal_epochs)

    # Tensorboard logging #
    # (1) Log some numbers
    if (epoch + 1) % config["training"]["scheduling"]["logging_frequency"] == 0:
        summary.add_scalar('avg_reward', reward_avg.value, epoch + 1)
        summary.add_scalar('curr_reward', agent.final_rewards.mean(), epoch + 1)
        summary.add_scalar('curr_interm_reward', agent.final_intermediate_rewards.mean(), epoch + 1)
        summary.add_scalar('curr_counting_reward', agent.final_counting_rewards.mean(), epoch + 1)
        summary.add_scalar('avg_step', step_avg.value, epoch + 1)
        summary.add_scalar('curr_step', agent.step_used_before_done.mean(), epoch + 1)
        summary.add_scalar('loss_avg', loss_avg.value, epoch + 1)
        summary.add_scalar('curr_loss', avg_loss_in_this_game, epoch + 1)

    msg = 'E#{:03d}, R={:.3f}/{:.3f}/IR{:.3f}/CR{:.3f}, S={:.3f}/{:.3f}, L={:.6f}/{:.3f}, epsilon={:.4f}, lambda_counting={:.4f}'
    msg = msg.format(epoch,
                     np.mean(reward_avg.value), agent.final_rewards.mean(), agent.final_intermediate_rewards.mean(), agent.final_counting_rewards.mean(),
                     np.mean(step_avg.value), agent.step_used_before_done.mean(),
                     np.mean(loss_avg.value), avg_loss_in_this_game,
                     epsilon, revisit_counting_lambda)
    if (epoch + 1) % config["training"]["scheduling"]["logging_frequency"] == 0:
        print("=========================================================")
    print(msg)

    # Evaluate on the training env ("valid") and on the held-out test games.
    if run_test and (epoch + 1) % config["training"]["scheduling"]["logging_frequency"] == 0:

        print('*********** Valid part: ************')
        valid_R, valid_IR, valid_S = testing(env, batch_size, agent)
        summary.add_scalar('valid_reward', valid_R, epoch + 1)
        summary.add_scalar('valid_interm_reward', valid_IR, epoch + 1)
        summary.add_scalar('valid_step', valid_S, epoch + 1)

        print('*********** Test part: ************')
        R, IR, S = testing(env_test, batch_size_test, agent)
        summary.add_scalar('test_reward_', R, epoch + 1)
        summary.add_scalar('test_interm_reward_', IR, epoch + 1)
        summary.add_scalar('test_step_', S, epoch + 1)
        # Persist what has been logged so far without closing the writer,
        # which is still needed by later epochs.
        summary.flush()

        # Checkpoint when either evaluation used fewer steps than the best so far.
        # NOTE(review): bestStep is set to S even when only valid_S beat it,
        # so the threshold can move up again; confirm this is intended.
        if bestStep > S or bestStep > valid_S:
            PATH = '/content/drive/My Drive/capstone/checkpoint/GRU-DRQN_level10_100multiGames'
            torch.save(agent.state_dict(), PATH)
            torch.save(optimizer.state_dict(), PATH + '_opt')
            bestStep = S

# Training is over: flush and release the event file.
summary.close()

