In [3]:
import random

class Environment:
    """Square grid world that stores one integer marker per cell.

    Attributes:
        size:  side length of the board.
        state: a (first, second) pair drawn uniformly from ``0 .. size-1``
               when the environment is created.
        board: ``size`` x ``size`` list of lists, every cell starting at 0.

    Marker values written by the methods below: 1 (food), -1 (predator),
    2 (nest).
    """

    def __init__(self, size):
        self.size = size
        # Two independent draws, in the same order as the tuple components.
        first = random.randint(0, size - 1)
        second = random.randint(0, size - 1)
        self.state = (first, second)
        # One fresh row object per line so that rows never alias each other.
        self.board = [[0] * size for _ in range(size)]

    def _mark(self, x, y, value):
        """Write ``value`` into cell ``board[x][y]``."""
        self.board[x][y] = value

    def add_food(self, x, y):
        """Mark cell (x, y) as containing food (value 1)."""
        self._mark(x, y, 1)

    def set_predator(self, x, y):
        """Mark cell (x, y) as containing a predator (value -1)."""
        self._mark(x, y, -1)

    def set_nest(self, x, y):
        """Mark cell (x, y) as the nest (value 2)."""
        self._mark(x, y, 2)

In [1]:
import math
import numpy as np
import random
class Learner:
    """Tabular learner that arbitrates between several agents on one grid.

    A single Q-table (state -> one value per action id) is shared by all
    agents; a W-table (state -> one value per agent) decides whose proposed
    action is executed. Only ``agents[0]`` is ever moved, and its ``state``
    is the key used for both tables.

    Expected collaborators (duck-typed):
        * ``env.size`` (int) and ``env.board`` (``board[x][y]`` markers:
          1 food, 2 nest, -1 predator).
        * each agent exposes ``state`` (pair of ints), ``food`` (bool),
          ``actions`` (action ids, assumed to be ``0 .. n-1`` because they
          index the Q-table row), ``get_action(board)``, ``action(id)`` and
          ``action_token(id)``.

    NOTE(review): agents other than ``agents[0]`` keep their own ``state``;
    confirm whether they are meant to share the position of ``agents[0]``.
    """

    def __init__(self, agents, env, walpha=0.1, qalpha=0.1, gamma=0.6, epsilon=0.1):
        """Store hyper-parameters and build zero-initialised tables.

        Args:
            agents:  non-empty list of agents; ``agents[0]`` is the one moved.
            env:     environment providing ``size`` and ``board``.
            walpha:  learning rate of the W-table update.
            qalpha:  learning rate of the Q-table update.
            gamma:   discount factor.
            epsilon: probability of a uniformly random action whenever an
                     agent makes no usable proposal.

        Raises:
            ValueError: if ``agents`` is empty.
        """
        if not agents:
            raise ValueError("Learner needs at least one agent")
        #hyper parameters
        self.walpha = walpha
        self.qalpha = qalpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.environment = env
        self.agents = agents          #actual agent
        self.wtable = self.__init_w_table__()
        self.qtable = self.__init_q_table__() #rewards table

    def _new_table(self, width):
        """Return ``{(i, j): zeros(width)}`` for every cell of the board."""
        size = self.environment.size
        return {(i, j): np.zeros(width) for i in range(size) for j in range(size)}

    def __init_q_table__(self):
        """Build the Q-table: one value per action id for every cell."""
        return self._new_table(len(self.agents[0].actions))

    def __init_w_table__(self):
        """Build the W-table: one value per agent for every cell."""
        return self._new_table(len(self.agents))

    def _current_state(self):
        """Position of the acting agent as a hashable tuple (table key)."""
        return tuple(self.agents[0].state)

    def _epsilon_greedy(self, state):
        """Pick an action id from the Q-table row of ``state``.

        With probability ``epsilon`` the choice is uniform; otherwise it is a
        best-valued action, with ties broken at random so that an all-zero
        row does not always yield action 0.
        """
        row = self.qtable[state]
        if random.random() < self.epsilon:
            return random.randrange(len(row))
        best = np.flatnonzero(row == row.max()).tolist()
        return int(random.choice(best))

    def _resolve_action(self, agent, proposal, state):
        """Turn an agent proposal into an integer action id.

        ``proposal`` may be an action id, an action token (string) or
        ``None``/unknown; the last case falls back to ``_epsilon_greedy``.
        """
        valid = list(agent.actions)
        if isinstance(proposal, str):
            for idx in valid:
                if agent.action_token(idx) == proposal:
                    return int(idx)
        elif proposal is not None and proposal in valid:
            return int(proposal)
        return self._epsilon_greedy(state)

    def run(self, max_steps=1000):
        """Run one episode, updating the Q- and W-tables after every step.

        The episode ends when ``step`` reports ``done`` or after
        ``max_steps`` steps (pass ``None`` for no limit). One info line and
        one ``state, action, next_state`` line are printed per step.
        """
        done = False
        steps = 0
        while not done and (max_steps is None or steps < max_steps):
            steps += 1
            current_state = self._current_state()
            candidates, wvalues = self.get_actions()
            # First agent with the highest W-value wins the right to act.
            win = int(np.argmax(wvalues))
            action = candidates[win]
            next_state, reward, done, info = self.step(action)

            old_value = self.qtable[current_state][action]
            next_max = np.max(self.qtable[next_state])
            new_value = (1 - self.qalpha)*old_value + self.qalpha*(reward + self.gamma*next_max)
            self.qtable[current_state][action] = new_value

            # Same for every loser, so it is computed once; it deliberately
            # reads the Q-table *after* the update above.
            value = self.qtable[current_state][action] - reward - self.gamma*np.max(self.qtable[next_state])
            for i in range(len(self.agents)):
                if i != win:
                    self.wtable[current_state][i] = (1-self.walpha)*self.wtable[current_state][i] + self.walpha*value
            print(info)
            print(f'{current_state}, {action}, {next_state}')

    def get_actions(self):
        """Collect one candidate action id and one W-value per agent.

        Returns:
            ``(actions, wvalues)``: two lists aligned with ``self.agents``;
            ``actions`` holds integer action ids, ``wvalues`` holds floats
            read from the W-table at the current state.
        """
        s = self._current_state()
        board = self.environment.board
        actions = [self._resolve_action(a, a.get_action(board), s) for a in self.agents]
        wvalues = [float(w) for w in self.wtable[s]]
        return actions, wvalues

    def step(self, action):
        """Execute action id ``action`` with ``agents[0]``.

        The reward is evaluated on the cell the agent stands on *before*
        moving. After the move the position is clamped to the board so it is
        always a valid table key.

        Returns:
            ``(next_state, reward, done, info)``.
        """
        body = self.agents[0]
        old_state = self._current_state()
        token = body.action_token(action)
        reward, done = self.get_rewards(old_state, token)
        body.action(action)
        limit = self.environment.size - 1
        next_state = tuple(min(max(c, 0), limit) for c in body.state)
        body.state = next_state
        board = self.environment.board
        # Food disappears only when it has actually been picked up.
        if token == 'pick' and board[old_state[0]][old_state[1]] == 1 and body.food:
            board[old_state[0]][old_state[1]] = 0
        info = f'Executed action: {token} at state {old_state}'
        return next_state, reward, done, info

    def get_rewards(self, state, action):
        """Reward for performing action token ``action`` on cell ``state``.

        Returns:
            ``(reward, done)``: ``(10, True)`` for 'drop' on the nest,
            ``(1, True)`` for 'pick' on food, ``(-50, False)`` on a predator
            cell, ``(0, False)`` otherwise.
        """
        cell = self.environment.board[state[0]][state[1]]
        if cell == 2 and action == 'drop':
            return 10, True
        elif cell == 1 and action == 'pick':
            return 1, True
        elif cell == -1:
            return -50, False
        return 0, False

In [2]:
import random

class Agent:
    """Grid agent with a position, a food flag and a goal.

    ``state`` is an ``(x, y)`` tuple; "right" increases ``x`` and "up"
    increases ``y``. Moves are not bounds-checked here because the agent
    does not know the board size.

    Action ids (``self.actions``) map to tokens as listed in
    ``_ACTION_NAMES``; each token is also the name of the method that
    performs the action.
    """

    # Action id -> token / method name.
    _ACTION_NAMES = {
        0: 'right', 1: 'left', 2: 'up', 3: 'down',
        4: 'up_right', 5: 'up_left', 6: 'down_right', 7: 'down_left',
        8: 'stay', 9: 'pick', 10: 'drop',
    }
    # Neighbour offset (dx, dy) -> id of the move that reaches it.
    _OFFSET_TO_ACTION = {
        (1, 0): 0, (-1, 0): 1, (0, 1): 2, (0, -1): 3,
        (1, 1): 4, (-1, 1): 5, (1, -1): 6, (-1, -1): 7,
    }
    _PICK = 9

    def __init__(self, x, y, goal):
        self.state = (x, y)
        self.food = False
        self.goal = goal
        self.actions = list(range(11))

    def _move(self, dx, dy):
        """Shift the position by (dx, dy); the tuple is replaced, not mutated."""
        x, y = self.state
        self.state = (x + dx, y + dy)

    def right(self):
        self._move(1, 0)

    def left(self):
        self._move(-1, 0)

    def up(self):
        self._move(0, 1)

    def down(self):
        self._move(0, -1)

    def up_right(self):
        self._move(1, 1)

    def up_left(self):
        self._move(-1, 1)

    def down_right(self):
        self._move(1, -1)

    def down_left(self):
        self._move(-1, -1)

    def stay(self):
        pass

    def pick(self):
        self.food = True

    def drop(self):
        self.food = False

    def action(self, action: int):
        """Perform the action with id ``action``.

        Raises:
            ValueError: if ``action`` is not one of the known action ids.
        """
        name = self._ACTION_NAMES.get(action)
        if name is None:
            raise ValueError(f"Action not allowed: {action!r}")
        getattr(self, name)()

    def get_action(self, board):
        """Propose an action token for the current board, or ``None``.

        Only the "food" goal produces proposals: 'pick' when standing on
        food, otherwise the move towards an adjacent food cell. Every other
        situation (no food nearby, any other goal) yields ``None``.
        """
        if self.goal == "food":
            found, action_id = self.food_neighbourhood(board)
            if found:
                return self.action_token(action_id)
        return None

    def action_token(self, action: int):
        """Return the token for action id ``action`` (``None`` if unknown)."""
        return self._ACTION_NAMES.get(action)

    def food_neighbourhood(self, board):
        """Look for food (cell value 1) on and around the agent's cell.

        The agent's own cell is checked first, then the eight neighbours;
        cells outside ``board`` are skipped.

        Returns:
            ``(True, action_id)`` where ``action_id`` is the pick action for
            the own cell or the move leading to the food cell, or
            ``(False, None)`` when no food is within reach.
        """
        x, y = self.state

        def has_food(cx, cy):
            return (0 <= cx < len(board)
                    and 0 <= cy < len(board[cx])
                    and board[cx][cy] == 1)

        if has_food(x, y):
            return True, self._PICK
        for (dx, dy), action_id in self._OFFSET_TO_ACTION.items():
            if has_food(x + dx, y + dy):
                return True, action_id
        return False, None

In [None]:
def main():
    """Train the learner for a fixed number of episodes and print the Q-table.

    The board is populated before training because rewards (and therefore
    episode termination) only exist on food, nest and predator cells. The
    concrete cell positions below are example placements.
    """
    episodes = 10
    size = 5
    start = (0, 0)
    food_cell = (2, 2)
    nest_cell = (4, 4)
    predator_cell = (1, 3)

    e = Environment(size)
    e.set_nest(*nest_cell)
    e.set_predator(*predator_cell)

    food_agent = Agent(*start, "food")
    nest_agent = Agent(*start, "nest")
    avoid_agent = Agent(*start, "predator")
    agents = [food_agent, nest_agent, avoid_agent]
    # walpha has no default in the Learner signature, so it is passed explicitly.
    learner = Learner(agents, e, walpha=0.1)
    for i in range(0, episodes):
        print(f"Episode: {i+1}")
        # Start every episode from the same situation: food back on the
        # board, agents back at the start cell and carrying nothing.
        e.add_food(*food_cell)
        for agent in agents:
            agent.state = start
            agent.food = False
        learner.run()
    print(learner.qtable)

main()