In [61]:
import numpy as np
import tensorflow as tf

In [117]:
import math
import copy
class Gridworld:  # Environment
    """Square grid-world in which numbered objects are collected in order.

    ``n_obj`` distinct integers drawn from ``[min_num, max_num]`` are placed
    on cells whose row and column indices are both even; every other cell
    holds 0.  The agent moves with the actions 'U', 'D', 'R', 'L' and must
    step on the objects in ascending order of their value.

    Attributes:
        width, height, n_dim: side length of the square grid.
        n_obj, min_num, max_num: number of objects and their value range.
        i, j: current row and column of the agent.
        grid_mat: ``(n_dim, n_dim)`` integer array; 0 marks an empty cell.
        original_objects: object values, sorted in descending order.
        current_objects: objects still to collect; the next target is the
            LAST element (i.e. the smallest remaining value).
        actions: dict mapping ``(row, col)`` to the list of legal actions.
    """

    # Reward paid by ``rewards`` for each kind of cell the agent lands on.
    STEP_REWARD = -1
    WRONG_OBJECT_REWARD = -1000
    OBJECT_REWARD = 100
    FINAL_OBJECT_REWARD = 10000

    # Row/column offset applied by each action.
    _DELTAS = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}

    def __init__(self, n_dim, start, n_obj, min_num, max_num):
        """Create the grid, draw the objects and place them at random.

        Args:
            n_dim: side length of the square grid.
            start: ``(row, col)`` starting position of the agent.
            n_obj: number of objects to place; must not exceed the number
                of even-indexed cells, ``((n_dim + 1) // 2) ** 2``.
            min_num: smallest value an object may take (inclusive).
            max_num: largest value an object may take (inclusive).

        Raises:
            ValueError: raised by ``np.random.choice`` when ``n_obj`` is
                larger than the value range or the number of free slots.
        """
        self.width = n_dim
        self.height = n_dim
        self.n_dim = n_dim
        self.n_obj = n_obj
        self.min_num = min_num
        self.max_num = max_num
        self.i = start[0]
        self.j = start[1]
        self.grid_mat = np.zeros((self.height, self.width), dtype=int)

        # Objects are drawn before they are placed, so the order of the
        # random draws is: values first, positions second.
        self.original_objects = self.create_objects()
        self.place_objects()
        self.current_objects = list(self.original_objects)
        # ``actions`` is a plain dict attribute.  It is built by a private
        # helper because an instance attribute and a method cannot share
        # the same name (the attribute would hide the method).
        self.actions = self._build_actions()

    def _build_actions(self):
        """Return ``{(row, col): [legal actions]}`` for every grid cell."""
        last = self.n_dim - 1
        table = {}
        for i in range(self.n_dim):
            for j in range(self.n_dim):
                moves = []
                if i > 0:
                    moves.append('U')
                if i < last:
                    moves.append('D')
                if j < last:
                    moves.append('R')
                if j > 0:
                    moves.append('L')
                table[(i, j)] = moves
        return table

    def rewards(self, i, j):
        """Return the reward for standing on cell ``(i, j)``.

        Landing on the current target object removes it from
        ``current_objects`` (so this method mutates state) and pays
        ``OBJECT_REWARD``, or ``FINAL_OBJECT_REWARD`` if it was the last
        one.  Any other non-zero cell pays ``WRONG_OBJECT_REWARD``; an
        empty cell pays ``STEP_REWARD``.

        NOTE: collected objects are not erased from ``grid_mat``, so
        stepping on one again counts as a wrong object.
        """
        element = self.grid_mat[i, j]
        # Once everything is collected there is no target left to match.
        target = self.current_objects[-1] if self.current_objects else None
        if target is not None and element == target:
            self.current_objects.pop()
            if self.current_objects:
                return self.OBJECT_REWARD
            return self.FINAL_OBJECT_REWARD
        if element != 0:
            return self.WRONG_OBJECT_REWARD
        return self.STEP_REWARD

    def set_state(self, s):
        """Move the agent to state ``s`` given as ``(row, col)``."""
        self.i = s[0]
        self.j = s[1]

    def print_grid(self):
        """Print the grid matrix to stdout."""
        print(self.grid_mat)

    def place_objects(self):
        """Write ``original_objects`` into random even-indexed cells.

        Candidate cells are those with even row and even column index;
        ``n_obj`` of them are sampled without replacement.
        """
        # Integer division keeps every index inside the grid for even
        # ``n_dim`` as well (for odd ``n_dim`` the result is unchanged).
        slots_per_side = (self.n_dim + 1) // 2
        candidates = np.arange(slots_per_side ** 2, dtype=int)
        chosen = np.random.choice(candidates, size=self.n_obj, replace=False)
        for value, flat_idx in zip(self.original_objects, chosen):
            row, col = divmod(int(flat_idx), slots_per_side)
            self.grid_mat[2 * row, 2 * col] = value

    def create_objects(self):
        """Draw ``n_obj`` distinct values and return them sorted descending.

        NOTE(review): if the range contains 0, that object would be
        indistinguishable from an empty cell.
        """
        pool = np.arange(self.min_num, self.max_num + 1)
        drawn = np.random.choice(pool, size=self.n_obj, replace=False)
        # Plain Python ints, largest first, so that pop() yields the
        # smallest remaining value.
        return sorted(drawn.tolist(), reverse=True)

    def current_state(self):
        """Return the agent position as ``(row, col)``."""
        return (self.i, self.j)

    def is_terminal(self, s):
        """Return True if no action is available in state ``s``."""
        return not self.actions.get(tuple(s))

    def move(self, action):
        """Apply ``action`` if it is legal and return the resulting reward.

        An illegal action leaves the agent in place; the reward of the
        current cell is still returned.
        """
        if action in self.actions.get((self.i, self.j), ()):
            di, dj = self._DELTAS[action]
            self.i += di
            self.j += dj
        return self.rewards(self.i, self.j)

    def undo_move(self, action):
        """Revert a previous ``move(action)`` by stepping the opposite way.

        Raises:
            AssertionError: if the agent ends up outside the known states
                (should never happen).
        """
        di, dj = self._DELTAS.get(action, (0, 0))
        self.i -= di
        self.j -= dj
        assert self.current_state() in self.all_states()

    def game_over(self):
        """Return True if no action is possible from the current state.

        NOTE(review): every in-grid cell has at least one action when
        ``n_dim > 1``, so collecting all objects does not end the game
        here; callers can check ``current_objects`` for that.
        """
        return not self.actions.get((self.i, self.j))

    def all_states(self):
        """Return the set of all ``(row, col)`` states of the grid."""
        return set(self.actions)

In [161]:
# n_dim should be an odd number (objects are placed on even-indexed
# rows and columns of the grid).
grid_w = Gridworld(
    n_dim=9,
    start=[0, 1],
    n_obj=20,
    min_num=100,
    max_num=200,
)
grid_w.print_grid()

[[106   0 164   0   0   0 162   0 111]
 [  0   0   0   0   0   0   0   0   0]
 [180   0 129   0   0   0 128   0 175]
 [  0   0   0   0   0   0   0   0   0]
 [  0   0 165   0 124   0 108   0 153]
 [  0   0   0   0   0   0   0   0   0]
 [197   0 114   0 144   0 102   0 115]
 [  0   0   0   0   0   0   0   0   0]
 [120   0 104   0   0   0 121   0   0]]
