In [61]:

import tensorflow as tf

In [172]:
import numpy as np
import math
import copy
class Gridworld: # Environment
    """Square grid-world in which numbered objects must be collected in ascending order.

    Objects are distinct integers drawn from ``[min_num, max_num]`` and are
    placed on cells whose row and column indices are both even. The agent
    occupies one cell ``(i, j)`` and moves with the actions ``'U'``, ``'D'``,
    ``'R'`` and ``'L'``.

    Attributes:
        grid_mat: ``(n_dim, n_dim)`` integer array; 0 marks an empty cell,
            any other value is the number of the object stored there.
        original_objects: object numbers sorted in descending order.
        current_objects: objects still to be collected (descending order, so
            the next target is always the last element).
        actions: dict mapping ``(row, col)`` to the list of legal moves.
    """

    def __init__(self, n_dim, start, n_obj, min_num, max_num):
        """Build the grid, draw the objects and place them.

        Args:
            n_dim: side length of the square grid.
            start: ``(row, col)`` of the agent's initial position.
            n_obj: number of objects to place.
            min_num: smallest object number that may be drawn (inclusive).
            max_num: largest object number that may be drawn (inclusive).
        """
        # creates a square gridworld
        self.width = n_dim
        self.height = n_dim
        self.n_dim = n_dim
        self.n_obj = n_obj
        self.min_num = min_num
        self.max_num = max_num
        self.i = start[0]
        self.j = start[1]
        self.grid_mat = np.zeros((self.height, self.width), dtype=int)

        # call functions
        self.original_objects = self.create_objects()
        self.place_objects()
        self.current_objects = copy.deepcopy(self.original_objects)
        self.actions = {}
        self.set_actions()

    def set_actions(self):
        """Fill ``self.actions`` with the legal moves of every cell.

        A move is legal when it keeps the agent inside the grid. Moves are
        always listed in the order U, D, R, L.
        """
        last = self.n_dim - 1
        for i in range(self.n_dim):
            for j in range(self.n_dim):
                moves = []
                if i > 0:
                    moves.append('U')
                if i < last:
                    moves.append('D')
                if j < last:
                    moves.append('R')
                if j > 0:
                    moves.append('L')
                self.actions[(i, j)] = moves

    def rewards(self, i, j):
        """Return the reward for standing on cell ``(i, j)``.

        Side effect: when the cell holds the current target (the smallest
        object not yet collected) that object is removed from
        ``self.current_objects``.

        Returns:
            100 for collecting the target while others remain, 10000 for
            collecting the last object, -1000 for any other object cell and
            -1 for an empty cell.
        """
        element = self.grid_mat[i, j]
        # Once everything has been collected there is no target any more;
        # guard against indexing the empty list.
        target = self.current_objects[-1] if self.current_objects else None
        if target is not None and element == target:
            self.current_objects.pop()
            return 100 if self.current_objects else 10000
        if element != 0:
            return -1000
        return -1

    def set_state(self, s):
        """Move the agent to state ``s = (row, col)`` without any reward."""
        self.i = s[0]
        self.j = s[1]

    def print_grid(self):
        """Print the grid matrix."""
        print(self.grid_mat)

    def place_objects(self):
        """Place the objects on distinct, randomly chosen even-indexed cells.

        Raises:
            ValueError: if there are fewer candidate cells than objects.
        """
        # Candidate cells are those with even row and column indices; integer
        # division keeps every index inside the grid even when n_dim is even.
        cells_per_side = (self.n_dim + 1) // 2
        n_cells = cells_per_side ** 2
        if self.n_obj > n_cells:
            raise ValueError(
                "cannot place %d objects on %d candidate cells" % (self.n_obj, n_cells))
        flat_indices = np.random.choice(n_cells, size=self.n_obj, replace=False)
        for value, flat in zip(self.original_objects, flat_indices):
            row, col = divmod(int(flat), cells_per_side)
            self.grid_mat[2 * row, 2 * col] = value

    def create_objects(self):
        """Draw ``n_obj`` distinct numbers from ``[min_num, max_num]``.

        Returns:
            The drawn numbers as a list sorted in descending order.
        """
        candidates = np.arange(self.min_num, self.max_num + 1)
        drawn = np.random.choice(candidates, size=self.n_obj, replace=False)
        return sorted(drawn.tolist(), reverse=True)

    def current_state(self):
        """Return the agent position as ``(row, col)``."""
        return (self.i, self.j)

    def is_terminal(self, s):
        """Return True if position ``s`` has no entry in ``self.actions``."""
        return s not in self.actions

    def take_action(self, action):
        """Apply ``action`` if it is legal in the current cell.

        An illegal action leaves the agent where it is; the reward of the
        resulting cell is evaluated either way.

        Returns:
            Tuple ``(reward, (row, col))`` for the cell the agent ends up in.
        """
        # check if legal move first
        if action in self.actions[(self.i, self.j)]:
            if action == 'U':
                self.i -= 1
            elif action == 'D':
                self.i += 1
            elif action == 'R':
                self.j += 1
            elif action == 'L':
                self.j -= 1
        # return a reward (if any)
        return self.rewards(self.i, self.j), (self.i, self.j)

    def undo_move(self, action):
        """Revert the position change caused by ``action``.

        Only the position is restored; an object collected by the move is
        not put back.
        """
        # these are the opposite of what U/D/L/R should normally do
        if action == 'U':
            self.i += 1
        elif action == 'D':
            self.i -= 1
        elif action == 'R':
            self.j -= 1
        elif action == 'L':
            self.j += 1
        # raise an exception if we arrive somewhere we shouldn't be
        # should never happen
        assert(self.current_state() in self.all_states())

    def game_over(self):
        """Return True when the episode has ended.

        The episode ends when every object has been collected, or when the
        agent is on a position for which no actions are defined.
        """
        return (self.i, self.j) not in self.actions or not self.current_objects

    def all_states(self):
        """Return the set of all ``(row, col)`` positions of the grid."""
        # Every cell has an entry in self.actions, so its keys are the states.
        return set(self.actions.keys())

In [173]:
# Demo cell: build a 9x9 world holding ten numbered objects and show its
# layout (n_dim should be an odd number).
grid_w = Gridworld(
    n_dim=9,
    start=[0, 1],
    n_obj=10,
    min_num=1,
    max_num=100,
)
print(grid_w.grid_mat)

[[78  0 77  0  0  0  0  0 95]
 [ 0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 56  0  0  0 73]
 [ 0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0]
 [11  0  0  0  0  0 12  0  0]
 [ 0  0  0  0  0  0  0  0  0]
 [97  0  0  0 69  0  0  0  1]]


In [174]:
# meta controller operates using Q-learning and relational networks
class Meta_controller:
    """Top-level agent state: the goal set plus a replay memory ``DM``."""

    def __init__(self, goals):
        # Keep a reference to the caller's goal collection and start with a
        # fresh, empty memory of meta-level transitions.
        self.goals, self.DM = goals, list()
        

class Controller:
    """Low-level agent state: the goal being pursued plus a replay memory ``DC``."""

    def __init__(self, g):
        # Remember the goal handed in by the caller and start with a fresh,
        # empty memory of controller-level transitions.
        self.current_goal, self.DC = g, list()
        
        
def EpsGreedy(x, beta, eps, Q):
    """Pick an index into ``beta`` with an epsilon-greedy rule.

    Args:
        x: current state (or state/goal pair) passed through to ``Q``.
        beta: non-empty sequence of candidate choices (actions or goals).
        eps: exploration probability; a uniformly random index is returned
            with this probability.
        Q: callable ``Q(x, m)`` returning the value of choice ``m`` in ``x``.

    Returns:
        A plain ``int`` index into ``beta``. Both branches return an index so
        callers can always write ``beta[idx]``.
    """
    if np.random.rand() < eps:
        # explore: uniformly random choice
        return int(np.random.randint(len(beta)))
    # exploit: choice with the highest estimated value
    return int(np.argmax([Q(x, m) for m in beta]))
    

def UpdateParams(L, D, mini_size):
    """Sample a minibatch of transitions from the replay memory ``D``.

    Args:
        L: loss to minimise. It is not used yet: the parameter update itself
            is still to be implemented.
        D: replay memory, any sequence supporting ``len`` and integer
            indexing (the controllers store it as a plain list).
        mini_size: number of transitions to draw, with replacement.

    Returns:
        List of ``mini_size`` sampled transitions, or an empty list when the
        memory is empty.
    """
    if len(D) == 0:
        # Nothing to learn from yet.
        return []
    idxs = np.random.choice(len(D), size=mini_size)
    # Index element-wise: a Python list cannot be indexed with an array.
    examples = [D[k] for k in idxs]
    return examples
    

In [None]:
# Training loop of the hierarchical agent: the meta-controller picks a goal
# (an object number), the controller picks primitive actions until that goal
# is reached or the episode ends.
#
# NOTE(review): the action-value functions Q1/Q2 and the losses L1/L2 were
# never defined in this notebook. They are stubbed here so that the loop is
# well-formed; replace the stubs with the real models before training.
mini_batch = 10
n_obj = 10
start_state = (0, 1)          # tuple: states are used as keys of grid_w.actions
max_steps_per_episode = 1000  # safety cap so an episode always terminates


def Q1(x, a):
    """Placeholder controller value of action ``a`` in ``x = (state, goal)``."""
    return 0.0


def Q2(x, g):
    """Placeholder meta-controller value of goal ``g`` in state ``x``."""
    return 0.0


L1 = None  # placeholder controller loss
L2 = None  # placeholder meta-controller loss

# initialize the grid_w
grid_w = Gridworld(n_dim=9, start=start_state, n_obj=n_obj, min_num=1, max_num=100)
goals = grid_w.original_objects
eps1 = np.ones(n_obj)  # one exploration rate per goal for the controller
eps2 = 1.0             # exploration rate of the meta-controller
meta_controller = Meta_controller(goals)

num_epis = 1000
for episode in range(num_epis):
    # Reset the environment: agent back to the start, all objects uncollected.
    s = start_state
    grid_w.set_state(s)
    grid_w.current_objects = copy.deepcopy(grid_w.original_objects)
    steps = 0

    g_idx = EpsGreedy(s, goals, eps2, Q2)
    done = False
    while not done:
        g = goals[g_idx]
        controller = Controller(g)
        F = 0
        s0 = s
        goal_reached = False
        while not (goal_reached or done):
            a_idx = EpsGreedy((s, g), grid_w.actions[s], eps1[g_idx], Q1)
            a = grid_w.actions[s][a_idx]
            f, s_prime = grid_w.take_action(a)
            # NOTE(review): the controller currently learns from the extrinsic
            # reward; an intrinsic reward was planned here.
            r = f
            controller.DC.append([(s, g), a, r, (s_prime, g)])
            UpdateParams(L1, controller.DC, mini_batch)
            if meta_controller.DM:
                UpdateParams(L2, meta_controller.DM, mini_batch)
            F += f
            s = s_prime
            steps += 1
            # The goal is reached when the agent stands on the goal object.
            goal_reached = bool(grid_w.grid_mat[s] == g)
            # The episode ends once every object is collected or the cap hits.
            done = len(grid_w.current_objects) == 0 or steps >= max_steps_per_episode
        meta_controller.DM.append((s0, g, F, s))
        if not done:
            g_idx = EpsGreedy(s, goals, eps2, Q2)
    eps1 *= 0.99
    eps2 *= 0.99

In [None]:
# from https://github.com/EthanMacdonald/h-DQN/blob/master/agent/hDQN.py

import random
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD, RMSprop

# --- Meta controller defaults -------------------------------------------
# Five Dense layers, each paired with an initialiser, a width and an
# activation; the loss/optimizer are used to compile the network.
default_meta_layers = [Dense for _ in range(5)]
default_meta_inits = ['lecun_uniform'] * 5
default_meta_nodes = [6, 30, 30, 30, 6]
default_meta_activations = ['relu'] * 5
default_meta_loss = "mean_squared_error"
default_meta_optimizer = RMSprop(lr=2.5e-4, rho=0.9, epsilon=1e-6)
default_meta_n_samples = 1000
default_meta_epsilon = 1.0

# --- Lower level controller/actor defaults ------------------------------
default_layers = [Dense for _ in range(5)]
default_inits = ['lecun_uniform' for _ in range(5)]
default_nodes = [12, 30, 30, 30, 2]
default_activations = ['relu' for _ in range(5)]
default_loss = "mean_squared_error"
default_optimizer = RMSprop(lr=2.5e-4, rho=0.9, epsilon=1e-6)
default_n_samples = 1000
default_gamma = 0.975
default_epsilon = 1.0
default_actor_epsilon = [1.0 for _ in range(6)]
default_tau = 0.001

class hDQN:
    """Hierarchical DQN agent: a meta-controller choosing goals and an actor choosing moves.

    Each of the two levels has an online network and a target network with
    the same architecture. The target networks track the online ones through
    a soft update with rate ``tau``.

    Note: the builder methods ``meta_controller``, ``target_meta_controller``,
    ``actor`` and ``target_actor`` are shadowed on every instance by the
    models they return, so after construction those names refer to models.
    """

    # Maximum number of experiences kept in either replay memory.
    _MEMORY_LIMIT = 1000000

    def __init__(self, meta_layers=default_meta_layers, meta_inits=default_meta_inits,
                meta_nodes=default_meta_nodes, meta_activations=default_meta_activations,
                meta_loss=default_meta_loss, meta_optimizer=default_meta_optimizer,
                layers=default_layers, inits=default_inits, nodes=default_nodes,
                activations=default_activations, loss=default_loss,
                optimizer=default_optimizer, n_samples=default_n_samples,
                meta_n_samples=default_meta_n_samples, gamma=default_gamma,
                meta_epsilon=default_meta_epsilon, epsilon=default_epsilon, actor_epsilon = default_actor_epsilon, tau = default_tau):
        """Store the hyper-parameters and build the four networks.

        Args:
            meta_layers, meta_inits, meta_nodes, meta_activations: per-layer
                layer class, initialiser, width and activation of the
                meta-controller network.
            meta_loss, meta_optimizer: used to compile the meta networks.
            layers, inits, nodes, activations, loss, optimizer: the same
                settings for the actor networks.
            n_samples: minibatch size of an actor update.
            meta_n_samples: minibatch size of a meta-controller update.
            gamma: discount factor.
            meta_epsilon: exploration probability of the meta-controller.
            epsilon: accepted for interface compatibility; not used.
            actor_epsilon: per-goal exploration probabilities of the actor,
                indexed by ``goal_value - 1``.
            tau: soft-update rate of the target networks.
        """
        self.meta_layers = meta_layers
        self.meta_inits = meta_inits
        self.meta_nodes = meta_nodes
        self.meta_activations = meta_activations
        self.meta_loss = meta_loss
        self.meta_optimizer = meta_optimizer
        self.layers = layers
        self.inits = inits
        self.nodes = nodes
        self.activations = activations
        self.loss = loss
        self.optimizer = optimizer
        self.meta_controller = self.meta_controller()
        self.target_meta_controller = self.target_meta_controller()
        self.actor = self.actor()
        self.target_actor = self.target_actor()
        # One slot per goal; the number of goals is the meta network's output width.
        self.goal_selected = np.ones(self.meta_nodes[-1])
        self.goal_success = np.zeros(self.meta_nodes[-1])
        self.meta_epsilon = meta_epsilon
        # Copy so that per-instance changes never leak into the shared default list.
        self.actor_epsilon = list(actor_epsilon)
        self.n_samples = n_samples
        self.meta_n_samples = meta_n_samples
        self.gamma = gamma
        self.target_tau = tau
        self.memory = []
        self.meta_memory = []

    @staticmethod
    def _build_network(layers, inits, nodes, activations, loss, optimizer, log_prefix=""):
        """Build and compile a Sequential network from per-layer settings."""
        model = Sequential()
        model.add(layers[0](nodes[0], init=inits[0], input_shape=(nodes[0],)))
        model.add(Activation(activations[0]))
        for layer, init, node, activation in list(zip(layers, inits, nodes, activations))[1:]:
            # NOTE(review): input_shape on a non-first layer looks redundant - confirm.
            model.add(layer(node, init=init, input_shape=(node,)))
            model.add(Activation(activation))
            print(log_prefix + str(node))
        model.compile(loss=loss, optimizer=optimizer)
        return model

    @staticmethod
    def _as_batch(vectors):
        """Stack ``vectors`` into a 2-D ``(n_samples, n_features)`` array.

        Singleton axes are squeezed away; a single sample, which squeezes to
        1-D, is turned back into a batch of one row.
        """
        return np.atleast_2d(np.squeeze(np.asarray(vectors)))

    def _soft_update(self, model, target_model):
        """Move the target weights a fraction ``target_tau`` towards ``model``."""
        blended = [
            self.target_tau * weight + (1 - self.target_tau) * target_weight
            for weight, target_weight in zip(model.get_weights(), target_model.get_weights())
        ]
        target_model.set_weights(blended)

    def meta_controller(self):
        """Build the online meta-controller network."""
        return self._build_network(self.meta_layers, self.meta_inits, self.meta_nodes,
                                   self.meta_activations, self.meta_loss,
                                   self.meta_optimizer, log_prefix="meta node: ")

    def target_meta_controller(self):
        """Build the target meta-controller network (same architecture)."""
        return self._build_network(self.meta_layers, self.meta_inits, self.meta_nodes,
                                   self.meta_activations, self.meta_loss,
                                   self.meta_optimizer, log_prefix="meta node: ")

    def actor(self):
        """Build the online actor network."""
        return self._build_network(self.layers, self.inits, self.nodes,
                                   self.activations, self.loss, self.optimizer)

    def target_actor(self):
        """Build the target actor network (same architecture)."""
        return self._build_network(self.layers, self.inits, self.nodes,
                                   self.activations, self.loss, self.optimizer)

    def select_move(self, state, goal, goal_value):
        """Choose a move epsilon-greedily for the given state and goal.

        Args:
            state, goal: 2-D arrays joined along axis 1 to form the actor input.
            goal_value: 1-based goal number, used to look up the goal's
                exploration probability in ``actor_epsilon``.

        Returns:
            Index of the chosen move.
        """
        vector = np.concatenate([state, goal], axis=1)
        # Exploit with probability 1 - epsilon, same convention as select_goal.
        if self.actor_epsilon[goal_value-1] < random.random():
            return np.argmax(self.actor.predict(vector, verbose=0))
        return random.choice(range(self.nodes[-1]))

    def select_goal(self, state):
        """Choose a goal epsilon-greedily.

        Returns:
            A 1-based goal number.
        """
        if self.meta_epsilon < random.random():
            pred = self.meta_controller.predict(state, verbose=0)
            print("pred shape: " + str(pred.shape))
            return np.argmax(pred)+1
        print("Exploring")
        return random.choice(range(1, self.meta_nodes[-1] + 1))

    def criticize(self, goal, next_state):
        """Return the intrinsic reward: 1.0 if ``next_state`` equals ``goal``, else 0.0."""
        return 1.0 if goal == next_state else 0.0

    def store(self, experience, meta=False):
        """Append ``experience`` to the actor memory, or to the meta memory if ``meta``.

        Either memory is cut back to its most recent ``_MEMORY_LIMIT`` entries
        once it grows beyond that size.
        """
        if meta:
            self.meta_memory.append(experience)
            if len(self.meta_memory) > self._MEMORY_LIMIT:
                self.meta_memory = self.meta_memory[-self._MEMORY_LIMIT:]
        else:
            self.memory.append(experience)
            if len(self.memory) > self._MEMORY_LIMIT:
                self.memory = self.memory[-self._MEMORY_LIMIT:]

    def _update(self):
        """Run one Q-learning step of the actor on a random minibatch.

        Experiences are expected to expose ``state``, ``goal``, ``action``,
        ``reward``, ``next_state`` and ``done``. Does nothing while the
        memory is empty.
        """
        if not self.memory:
            return
        exps = [random.choice(self.memory) for _ in range(self.n_samples)]
        state_vectors = self._as_batch(
            [np.concatenate([exp.state, exp.goal], axis=1) for exp in exps])
        next_state_vectors = self._as_batch(
            [np.concatenate([exp.next_state, exp.goal], axis=1) for exp in exps])
        reward_vectors = self.actor.predict(state_vectors, verbose=0)
        next_state_reward_vectors = self.target_actor.predict(next_state_vectors, verbose=0)

        # Only the value of the action actually taken is replaced by its target.
        for i, exp in enumerate(exps):
            reward_vectors[i][exp.action] = exp.reward
            if not exp.done:
                reward_vectors[i][exp.action] += self.gamma * np.max(next_state_reward_vectors[i])
        self.actor.fit(state_vectors, reward_vectors, verbose=0)

        #Update target network
        self._soft_update(self.actor, self.target_actor)

    def _update_meta(self):
        """Run one Q-learning step of the meta-controller on a random minibatch.

        Experiences are expected to expose ``state``, ``goal`` (its argmax is
        used as the goal index), ``reward``, ``next_state`` and ``done``.
        Does nothing while the meta memory is empty.
        """
        if not self.meta_memory:
            return
        exps = [random.choice(self.meta_memory) for _ in range(self.meta_n_samples)]
        state_vectors = self._as_batch([exp.state for exp in exps])
        next_state_vectors = self._as_batch([exp.next_state for exp in exps])
        reward_vectors = self.meta_controller.predict(state_vectors, verbose=0)
        next_state_reward_vectors = self.target_meta_controller.predict(next_state_vectors, verbose=0)

        for i, exp in enumerate(exps):
            goal_index = np.argmax(exp.goal)
            reward_vectors[i][goal_index] = exp.reward
            if not exp.done:
                reward_vectors[i][goal_index] += self.gamma * np.max(next_state_reward_vectors[i])
        self.meta_controller.fit(state_vectors, reward_vectors, verbose=0)

        #Update target network
        self._soft_update(self.meta_controller, self.target_meta_controller)

    def update(self, meta=False):
        """Update the meta-controller if ``meta`` is true, otherwise the actor."""
        if meta:
            self._update_meta()
        else:
            self._update()