### Libraries

In [None]:
import gym
import textworld.gym
from textworld.generator import make_game, compile_game
from textworld import EnvInfos

import tensorflow as tf
import keras
from keras import layers, Model, Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import numpy as np
import matplotlib.pyplot as plt
import time

### Load Game
Get the first state of the game

In [None]:
# Register a text-based game as a new Gym's environment.

# --- Make random game ---

# options = textworld.GameOptions()
# options.path = "./Random_Games/"
# options.seeds = 432
# game = make_game(options)
# game_file = compile_game(game, options)
# print(game_file)
# -----------------------------


# --- Make random game with quick command ---

# THIS DOES NOT OVERWRITE GAMES WITH EXISTING NAMES!
# !tw-make custom --world-size 2 --nb-objects 3 --theme house --quest-length 3 --entity-numbering --output "./Created_Games/Sample_Game.ulx" --seed 3 --format ulx

# -------------------------------------

# Load Game
def load_game(game_file, max_steps=100):
    """Register a compiled game as a Gym environment and start its first episode.

    Args:
        game_file: Path of the compiled game file to register.
        max_steps: Step limit, passed to the registration as
            ``max_episode_steps`` and also set on the created environment.

    Returns:
        Tuple ``(env, obs, infos)``: the environment together with the
        observation and info dict returned by ``env.reset()``.

    Side effects:
        Prints the initial observation.
    """
    # Extra fields the environment is asked to report in `infos`.
    request_infos = EnvInfos(
        inventory=True, admissible_commands=True, entities=True, won=True, lost=True,
        last_command=True, description=True, location=True, objective=True,
        policy_commands=True, score=True, moves=True,
    )

    env_id = textworld.gym.register_games([game_file], request_infos, max_episode_steps=max_steps)

    env = gym.make(env_id)  # Start the environment.
    env.max_episode_steps = max_steps
    obs, infos = env.reset()  # Start new episode.

    print(obs)

    return env, obs, infos

In [None]:
# Easy Difficulty

# Load env
# Creates the module-level `env`, `obs` and `infos` that the following cells read and update.
game_file = "./Created_Games/MDP_Game_Easy.ulx"
env, obs, infos = load_game(game_file=game_file)

In [None]:
# Infos before processing
# Take one step ("go east"), then print every key/value pair of the returned info dict.
obs, score, done, infos = env.step("go east")
for key, value in infos.items():
    print(key, ' : ', value, '\n')

### Needed Info processing
Functions to get the **location** and to remove the **'look'**, **'inventory'** and **'examine'** commands from the admissible_commands.<br/>
Also, the **inventory** does not need the string **"You are carrying:"** in our input.

In [None]:
# infos["location"] always comes back as None for some reason,
# so the location is extracted manually from the description.

ban_list = ['You', 'are', 'carrying', 'nothing.', '\n']

def find_location(desc):
    """Return the second whitespace-separated token of ``desc``.

    Presumably the description opens with a header such as ``-= Room =-``,
    which would make the second token the room name - TODO confirm.

    Raises:
        IndexError: if ``desc`` holds fewer than two tokens.
    """
    tokens = desc.split()
    return tokens[1]
    
# The "inventory" and "look" commands are not needed,
# nor are "examine" commands, because the infos["description"] feedback is not used.
def cmd_remover(ad_cmds):
    """Return the commands the agent may choose from.

    Drops the exact commands ``"inventory"`` and ``"look"`` and every command
    containing the text ``"examine"``. The input list is left untouched and
    missing commands are simply ignored, so calling this again on an already
    filtered list returns an equal list.

    Args:
        ad_cmds: Iterable of command strings.

    Returns:
        A new list with the remaining commands in their original order.
    """
    skipped = ("inventory", "look")
    return [cmd for cmd in ad_cmds if cmd not in skipped and 'examine' not in cmd]

def inv_process(inventory):
    """Reduce the raw inventory text to the list of words describing carried items.

    The text is split on whitespace; words found in the module-level
    ``ban_list`` (also when followed by ``.``, ``:`` or ``,``, e.g.
    ``"carrying:"``) and every article ``"a"`` / ``"an"`` are dropped.

    Args:
        inventory: Inventory text. Anything that is not a string (for example
            a list produced by an earlier call, or ``None``) cannot be split.

    Returns:
        List of remaining words, or ``inventory`` itself, unchanged, when it
        is not a string.
    """
    if not isinstance(inventory, str):
        # Nothing to split: hand the value back as it is instead of reading
        # it again from the global `infos`.
        print("Inventory cannot be split")
        return inventory

    articles = ('a', 'an')
    items = []
    for word in inventory.split():
        # Whole-word comparison, so items that merely contain a banned word
        # (e.g. "square" contains "are") are kept.
        if word in ban_list or word.rstrip('.:,') in ban_list:
            continue
        # Every article is dropped, not just the first one.
        if word in articles:
            continue
        items.append(word)
    return items

def obj_process(commands):
    """Return the last entry of ``commands`` as the objective.

    When ``commands`` has no last element (it is empty, or not indexable such
    as ``None``) the module-level ``infos['last_command']`` is returned
    instead.

    Args:
        commands: Sequence of commands.

    Returns:
        ``commands[-1]``, or ``infos['last_command']`` as the fallback.
    """
    try:
        return commands[-1]
    except (IndexError, TypeError, KeyError):
        # Only "there is no last element" triggers the fallback; any other
        # error is allowed to surface.
        return infos['last_command']
    
def info_process():
    """Clean up the module-level ``infos`` dict in place and build the observation.

    Rewrites ``infos["inventory"]``, ``infos["location"]``,
    ``infos["objective"]`` and ``infos["admissible_commands"]`` with their
    processed versions.

    Returns:
        List ``[location, inventory, objective, entities, cmd_1, cmd_2, ...]``:
        the four processed fields followed by one entry per remaining command.
    """
    # Pre-process inventory
    infos["inventory"] = inv_process(infos["inventory"])

    # Find location
    infos["location"] = find_location(infos["description"])

    # Get goal
    infos["objective"] = obj_process(infos["policy_commands"])

    # Get only needed commands
    try:
        commands = cmd_remover(infos["admissible_commands"])
        infos["admissible_commands"] = commands
    except ValueError:
        # A ValueError from cmd_remover means a command it wanted to drop was
        # not in the list; the list is then used as it is.
        print("'Look','invetory' and 'examine' commands already cut\n")
        commands = infos["admissible_commands"]

    # Return processed observation
    obs = [infos["location"], infos["inventory"], infos["objective"], infos["entities"]]
    obs.extend(commands)

    return obs

In [None]:
# Process the current `infos` and display the result:
# the first four entries (location, inventory, objective, entities) one per line,
# then the remaining entries (the filtered commands) as one list.
obs = info_process()
for i in obs[0:4]:
    print(i,'\n')
print(obs[4::])

### Extract first model inputs

In [None]:
# Vocabulary cap given to the tokenizer; also the input size of the embedding layer below.
vocab_size = 200
tokenizer = Tokenizer(num_words = vocab_size)

def tokenize():
    """Process the current ``infos`` and turn the observation into integer sequences.

    The shared ``tokenizer`` is fitted on the processed observation and then
    used to encode it.

    NOTE(review): the tokenizer is re-fitted on every call; confirm that this
    cannot change the integer assigned to an already known word between steps.

    Returns:
        Tuple ``(location, inventory, objective, entities, commands)``. The
        first four are unpadded integer sequences; ``commands`` is the result
        of ``pad_sequences`` over the command sequences, padded at the end.
    """
    processed = info_process()
    tokenizer.fit_on_texts(processed)
    sequences = tokenizer.texts_to_sequences(processed)

    # The leading four entries are single fields and need no padding.
    location, inventory, objective, entities = sequences[:4]
    # Everything after them is one sequence per command.
    commands = pad_sequences(sequences[4:], padding='post')
    return location, inventory, objective, entities, commands

location, inventory, objective, entities, commands = tokenize()
print(tokenizer.word_index)

In [None]:
# Show the padding being done

# Command matrix produced by the previous tokenization.
print(commands)
# Take another step, then tokenize the new state so both matrices can be compared.
obs, score, done, infos = env.step("take latchkey")
print(infos["inventory"])
location, inventory, objective, entities, commands = tokenize()
print(commands)
print(infos["admissible_commands"])
print(inventory)

In [None]:

def input_process(location, inventory, objective, entities, commands):
    """Build the three model inputs, one row per command.

    ``location``, ``inventory`` and ``entities`` are concatenated into one
    sequence. That sequence and ``objective`` are each repeated once per row
    of ``commands`` so that all three inputs share the same first dimension.

    Returns:
        Tuple ``(words, obj, commands)`` of tensors.
    """
    context = location + inventory + entities
    n_rows = np.shape(commands)[0]

    # Identical context / objective for every command of the current step.
    words = tf.convert_to_tensor([context for _ in range(n_rows)])
    obj = tf.convert_to_tensor([objective for _ in range(n_rows)])
    commands = tf.convert_to_tensor(commands)

    return words, obj, commands
words, objective, commands = input_process(location, inventory, objective, entities, commands)
objective

### Neural Network

In [None]:
# Number of neurons for the layers
embedding_dim = 50
arb_vec = round(embedding_dim/2)  # output size of every GRU encoder

# Same embedding layer for everything
emb_layer = layers.Embedding(vocab_size, embedding_dim)

# Every input is declared with shape=(None,): a sequence of token ids of any length.
# The word, objective and command sequences fed to the model all have different
# lengths, which a fixed shape of (1,) does not describe.

# The word info (inventory, entities, location) will go into a GRU
word_info = layers.Input(shape=(None,), name="Words(Location,Inventory,Entitites)")
emb_words = emb_layer(word_info)
enc_words = layers.GRU(arb_vec)(emb_words)
WRDS = Model(inputs=word_info, outputs=enc_words)

# The objective is encoded by its own GRU
obj_info = layers.Input(shape=(None,), name='Objective')
emb_obj = emb_layer(obj_info)
enc_obj = layers.GRU(arb_vec)(emb_obj)
OBJ = Model(inputs=obj_info, outputs=enc_obj)

# The commands are encoded by a third, separate GRU
cmds_info = layers.Input(shape=(None,), name='Admissible_Commands')
emb_cmds = emb_layer(cmds_info)
enc_cmds = layers.GRU(arb_vec)(emb_cmds)
CMDS = Model(inputs=cmds_info, outputs=enc_cmds)

# The layers are combined so that value and index are calculated based on all the observations
combined = layers.Concatenate(axis=1)([WRDS.output, OBJ.output, CMDS.output])
# NOTE(review): softmax on a hidden layer is unusual - confirm it is intended.
pre_index = layers.Dense(10, activation='softmax')(combined)
# Two 1-neuron heads: a sigmoid score per command and an unbounded value estimate
index = layers.Dense(1, activation="sigmoid", name="Probability_of_action")(pre_index)
value = layers.Dense(1, name='Value_of_action')(pre_index)


# The complete model
agent = Model(inputs=[WRDS.input, OBJ.input, CMDS.input], outputs=[index, value])
opt=tf.keras.optimizers.Adam(learning_rate=0.01)
huber_loss=keras.losses.Huber()
agent.compile(optimizer=opt, loss=huber_loss,
              metrics=[tf.keras.metrics.BinaryAccuracy(),
                       tf.keras.metrics.FalseNegatives()])
agent.summary()
# Writes the freshly built model to disk; nothing has been trained at this point.
agent.save("agent.h5")

In [None]:
# Example Input after tokenization
# We assume the batch size to be equal to the number of commands,
# in other words the model produces num_of_commands indexes and values.

# Since the rest of the inputs will be the same for every command in the current step
# we need to replicate them that many times.
# Assuming num_of_commands = 3
# NOTE: this cell overwrites the module-level `words`, `objective`, `commands`,
# `index` and `value` with the example data and the model outputs.

words = np.arange(20)
words = [words, words, words] 

objective = np.arange(20,30)
objective = [objective, objective, objective]

commands = np.zeros([3,4])
commands[1][:] = 1
commands[2][:] = 2

# This way we have the same batch_size for every input, but the only thing changing is the command

words = tf.convert_to_tensor(words)
objective = tf.convert_to_tensor(objective)
commands = tf.convert_to_tensor(commands)

print("Example of inputs")
print("Words: \n{}\nObjective:\n {}\nCommands:\n {}\n".format(words, objective, commands))
[index, value] = agent([words, objective, commands])
print("Index: \n{}\nValue: \n{}\n".format(index,value))

### Parameters

In [None]:
# float32 machine epsilon; added to the standard deviation when train()
# normalises the returns so the division never hits zero.
epsilon = np.finfo(np.float32).eps.item() 

# Experience
# Module-level state shared with train(), which declares these names global.
ep_rewards = []               # total reward of every finished episode
reward_history = []           # per-step rewards of the current episode (cleared after each update)
value_history = []            # value estimate of the chosen action at each step (cleared after each update)
action_probs_history = []     # log-probability term of the chosen action at each step (cleared after each update)
percentage_won_overtime = []  # won_games / metrics_bs, appended every metrics_bs episodes
moves_history = []            # infos['moves'] at the end of every episode
avg_moves = []                # average of the last metrics_bs entries of moves_history
won_games = 0                 # wins counted since the last metrics update

cnt = 0 # Step count

### Training through batches of experience

In [None]:
 def train(max_eps=100, reset=False, render=False, gamma=0.99, lrn_rate=0.01, metrics_bs=10, neg_reward=False):
    """Train the global ``agent`` on the global ``env`` with one actor-critic update per episode.

    Each episode is played to the end while recording the reward, the value
    estimate and the log-probability of every chosen action. Discounted,
    normalised returns are then computed and a combined policy + Huber value
    loss is back-propagated through the shared optimizer ``opt``.

    Args:
        max_eps: Number of episodes to play.
        reset: When True, the accumulated histories and metrics are emptied
            before training; otherwise training continues on top of them.
        render: When True, ``env.render()`` is called after every step.
        gamma: Discount factor used for the returns.
        lrn_rate: Learning rate assigned to ``opt`` before training starts.
        metrics_bs: Number of episodes per metrics point (win ratio and
            average moves).
        neg_reward: When True, 0.5 is subtracted from every step reward.

    Side effects:
        Updates the module-level histories, counters and ``infos``, updates
        the weights of ``agent``, prints progress and shows two plots.
    """
    global ep_rewards, reward_history, value_history, action_probs_history, percentage_won_overtime, moves_history,\
             avg_moves, won_games, cnt, infos

    if reset:
        ep_rewards = []
        reward_history = []
        value_history = []
        action_probs_history = []
        percentage_won_overtime = []
        moves_history = []
        avg_moves = []
        won_games = 0

    # Hand the requested learning rate to the shared optimizer so that
    # `lrn_rate` actually takes effect.
    opt.learning_rate = lrn_rate

    start = time.time()

    for ep in range(max_eps):
        obs, infos = env.reset()
        ep_reward = 0

        # A run that was interrupted mid-episode leaves entries behind;
        # they must not leak into this episode's update.
        reward_history.clear()
        value_history.clear()
        action_probs_history.clear()

        with tf.GradientTape() as tape:
            while True:

                cnt += 1 # global step counter, kept for debugging

                # Every Step tokenize and prepare the inputs
                # Then let model take decision
                location, inventory, objective, entities, commands = tokenize()
                words, objective, commands = input_process(location, inventory, objective, entities, commands)
                action_probs, value = agent([words, objective, commands])
                cmd_len = len(commands) # Basically the batch_size

                # Choose command based on probability
                np_probs = np.squeeze(action_probs)
                np_probs = np_probs/np.sum(np_probs) if np.sum(np_probs) > 0 else [1/cmd_len for i in range(cmd_len)] #In case sum is 0
                if np.shape(commands)[0] == 1: # With a single command there is nothing to sample
                    action_index = 0
                else:
                    action_index = np.random.choice(cmd_len, p=np_probs)
                action = infos["admissible_commands"][action_index]

                # Take step
                # NOTE(review): an earlier cell unpacks this second value as `score`;
                # confirm whether it is a per-step reward or a running score.
                obs, reward, done, infos = env.step(action)

                if neg_reward: reward -= 0.5

                # Render enviroment
                if render:
                    env.render()

                # Record experience
                reward_history.append(reward)
                value_history.append(value[action_index])
                # Log-probability under the distribution the action was sampled
                # from (scores normalised over all commands). divide_no_nan and
                # epsilon keep the logarithm finite when the scores sum to zero.
                chosen_prob = tf.math.divide_no_nan(action_probs[action_index], tf.reduce_sum(action_probs))
                action_probs_history.append(tf.math.log(chosen_prob + epsilon))

                # Episode accumulative reward
                ep_reward += reward

                if done:
                    last_r_index = len(reward_history)-1
                    moves_history.append(infos['moves'])
                    if infos["won"]:
                        ep_reward += 10
                        reward_history[last_r_index] += 10
                        won_games += 1
                    else:
                        ep_reward -= 10
                        reward_history[last_r_index] -= 10
                        print("We lost in {} moves".format(infos["moves"]))
                    break

            # Save each episode rewards to see progress
            ep_rewards.append(ep_reward)

            # For each reward r calculate the discounted return from that step onwards.
            # Built back to front, then reversed into step order.
            returns = []
            expected_reward = 0
            for r in reward_history[::-1]:
                expected_reward = r + gamma * expected_reward
                returns.append(expected_reward)
            returns.reverse()

            # Normalize
            returns = np.array(returns)
            returns = (returns - np.mean(returns)) / (np.std(returns) + epsilon)
            returns = returns.tolist()

            # Calculate the loss
            action_losses = []
            value_losses = []
            for log_prob, est_value, E_value in zip(action_probs_history, value_history, returns):
                # Estimated value minus the normalised return; log_prob * Advantage
                # is therefore -log_prob * (return - estimate).
                Advantage = est_value - E_value
                action_losses.append(log_prob * Advantage)  # Policy gradient loss

                value_losses.append( # Huber loss
                    huber_loss(tf.expand_dims(est_value, 0), tf.expand_dims(E_value, 0))
                )
            total_loss = sum(value_losses) + sum(action_losses)

            # Backpropagation
            grads = tape.gradient(total_loss, agent.trainable_variables)
            opt.apply_gradients(zip(grads, agent.trainable_variables))

            # Reset histories
            reward_history.clear()
            value_history.clear()
            action_probs_history.clear()

        # Metrics
        if (ep+1)%metrics_bs==0:
            percentage_won_overtime.append(won_games/metrics_bs)
            won_games = 0
            avg_moves.append(sum(moves_history[-metrics_bs::])/metrics_bs)
            print("Currently on Episode {}".format(ep+1))

    plt.figure(1)
    plt.plot(np.arange(1,len(avg_moves)+1), avg_moves)
    plt.xlabel(f'Every {metrics_bs} episodes')
    plt.ylabel(f'Average moves every {metrics_bs} eps')
    plt.figure(2)
    plt.plot(np.arange(1,len(percentage_won_overtime)+1), percentage_won_overtime)
    plt.xlabel(f'Every {metrics_bs} episodes')
    plt.ylabel(f'Percentage of games won every {metrics_bs} eps')
    plt.show()
    end = time.time(); t_sec = end-start; mins = t_sec//60; secs = t_sec-mins*60
    print(f"Training took {mins} minutes and {secs} seconds")

In [None]:
# Scratch check: a capitalised name equals the lowercase one after .lower().
# `name1` / `name2` are not referenced by any other cell visible in this notebook.
name1 = 'bedroom'
name2 = 'Bedroom'
if name1==name2.lower(): print("LUL")

In [None]:
def test(render=False, max_eps=1):
    """Play ``max_eps`` episodes with the current ``agent`` and print what it does.

    Actions are sampled from the normalised action scores, exactly as during
    training; no learning takes place. The module-level ``infos`` is updated
    after every reset and step.

    Args:
        render: When True, ``env.render()`` is called at the start of every step.
        max_eps: Number of episodes to play.
    """
    global infos

    for ep in range(max_eps):

        print(f"\n----------- EPISODE {ep+1} -----------")
        taken = []
        step = 0
        obs, infos = env.reset()
        while True:
            step += 1
            if render:
                env.render()

            # Tokenize the current state and let the model score each command
            location, inventory, objective, entities, commands = tokenize()
            words, objective, commands = input_process(location, inventory, objective, entities, commands)
            action_probs, value = agent([words, objective, commands])
            n_cmds = len(commands)  # one row per admissible command

            # Turn the scores into a sampling distribution (uniform if they sum to 0)
            probs = np.squeeze(action_probs)
            total = np.sum(probs)
            if total > 0:
                probs = probs / total
            else:
                probs = [1/n_cmds] * n_cmds

            # A single command leaves nothing to sample
            if np.shape(commands)[0] == 1:
                action_index = 0
            else:
                action_index = np.random.choice(n_cmds, p=probs)
            chosen = infos["admissible_commands"][action_index]

            print(f"Action taken in step {step}: {chosen}")
            taken.append(chosen)

            obs, reward, done, infos = env.step(chosen)

            if not done:
                continue

            # Episode over: summarise it and move on to the next one
            print(f"Finished in {infos['moves']} moves")
            print("Actions taken:")
            for past_action in taken:
                print(f"{past_action} >", end=" ")
            break

In [None]:
# Easy Difficulty

# Train
train(max_eps=500, lrn_rate=0.001, neg_reward=True)
# If we train again it will resume training
# (`reset` defaults to False, so the accumulated metric histories are kept)
train(max_eps=500, lrn_rate=0.001, neg_reward=True)

# Test
test(max_eps=2)