# Implementation of SARSA solving a gridworld environment

In [1]:
import numpy as np
import random as r
from operator import add
import itertools


In [9]:
class Gridworld:

    def __init__(self, size=8, reward=10):
        self.size = size
        self.reward = reward

        # Create 2d list with filled with zeros
        self.grid = [[0] * self.size for i in range(self.size)]

        # Current state of agent
        self.current_state = None
        self.blocked_fields = None
        self.goal_idx = None

    def reset(self) -> int | None:

        # Input error correction
        if self.size < 5 or isinstance(self.size, int) == False:
            print("The size entered for the grid is too small or was no integer. Please try an integer bigger than 4.")

        else:
            # Create negative fields
            for j in range(int(self.size)):
                self.grid[r.randint(0, self.size-1)][r.randint(0, self.size-1)] = r.uniform(-1, 0)

            # Saves indeces of blocked field
            blocked_fields_idx = []
            # Create blocked fields
            for k in range(int(self.size/1.5)):
                blocked_fields = [r.randint(0, self.size-1), r.randint(0, self.size-1)]
                blocked_fields_idx.append(blocked_fields)

                self.grid[blocked_fields[0]][blocked_fields[1]] = "X"

            # Save list of blocked fields for step() function
            self.blocked_fields = blocked_fields_idx

            # initial index of terminal state
            goal_idx = [r.randint(0, self.size-1), r.randint(0, self.size-1)]

            # Check adjacent fields to index of terminal state and update until no blocked field is adjacent
            while list(map(add, goal_idx, [0, -1])) in blocked_fields_idx or list(
                    map(add, goal_idx, [0, +1])) in blocked_fields_idx or list(
                    map(add, goal_idx, [-1, 0])) in blocked_fields_idx or list(
                    map(add, goal_idx, [+1, 0])) in blocked_fields_idx:
                goal_idx = [r.randint(0, self.size-1), r.randint(0, self.size-1)]

            # Save index of terminal state for step() function
            self.goal_idx = goal_idx

            # Create terminal state with positive reward
            self.grid[goal_idx[0]][goal_idx[1]] = self.reward

            starting_point_idx = [r.randint(0, self.size-1), r.randint(0, self.size-1)]

            # Update starting point until it is a zero value (Not blocked- or terminal field)
            while self.grid[starting_point_idx[0]][starting_point_idx[1]] != 0:
                starting_point_idx = [r.randint(0, self.size-1), r.randint(0, self.size-1)]

            # Set current state to starting state
            self.current_state = starting_point_idx

            return self.state_as_int(self.current_state)

    def step(self, action: int, new_state: list) -> tuple[int, int, bool]:
        '''
        Moves agent in grid

        args:
        action(int): How agent should move (0=left, 1=right, 2=up, 3=down)
        '''
        self.current_state: list = new_state

        # Left
        if action == 0:
            # Compute result of action
            test_state = list(map(add, self.current_state, [0, -1]))

            # New list index is out of bounds (left)
            if test_state[1] == -1:
                print("Wall")

            # New list index is on field marked with "X"
            elif test_state in self.blocked_fields:
                print("field blocked")

            # Action lead to terminal state
            elif test_state == self.goal_idx:
                print("Terminal state reached")
                self.current_state = test_state
                done = True
                # Return reward for step
                return self.state_as_int(self.current_state), self.grid[self.current_state[0]][
                    self.current_state[1]], done,

            # Update state (take action)
            else:
                self.current_state = test_state
                done = False
                # Return reward for step
                return self.state_as_int(self.current_state), self.grid[self.current_state[0]][
                    self.current_state[1]], done,

        # Right
        if action == 1:
            # Compute result of action
            test_state = list(map(add, self.current_state, [0, +1]))

            # New list index is out of bounds (right)
            if test_state[1] == self.size:
                print("Wall")

            # New list index is on field marked with "X"
            elif test_state in self.blocked_fields:
                print("field blocked")

            # Action lead to terminal state
            elif test_state == self.goal_idx:
                print("Terminal state reached")
                self.current_state = test_state
                done = True
                # Return reward for step
                return self.state_as_int(self.current_state), self.grid[self.current_state[0]][
                    self.current_state[1]], done,

            # Update state (take action)
            else:
                self.current_state = test_state
                done = False
                # Return reward for step
                return self.state_as_int(self.current_state), self.grid[self.current_state[0]][
                    self.current_state[1]], done,

        # Up
        if action == 2:
            # Compute result of action
            test_state = list(map(add, self.current_state, [-1, 0]))

            # New list index is out of bounds (right)
            if test_state[0] == -1:
                print("Wall")

            # New list index is on field marked with "X"
            elif test_state in self.blocked_fields:
                print("field blocked")

            # Action lead to terminal state
            elif test_state == self.goal_idx:
                print("Terminal state reached")
                self.current_state = test_state
                done = True
                # Return reward for step
                return self.state_as_int(self.current_state), self.grid[self.current_state[0]][
                    self.current_state[1]], done,

            # Update state (take action)
            else:
                self.current_state = test_state
                done = False
                # Return reward for step
                return self.state_as_int(self.current_state), self.grid[self.current_state[0]][
                    self.current_state[1]], done,

        # down
        if action == 3:
            # Compute result of action
            test_state = list(map(add, self.current_state, [+1, 0]))

            # New list index is out of bounds (right)
            if test_state[0] == self.size:
                print("Wall")

            # New list index is on field marked with "X"
            elif test_state in self.blocked_fields:
                print("field blocked")

            # Action lead to terminal state
            elif test_state == self.goal_idx:
                print("Terminal state reached")
                self.current_state = test_state
                done = True
                # Return reward for step
                return self.state_as_int(self.current_state), self.grid[self.current_state[0]][
                    self.current_state[1]], done,

            # Update state (take action)
            else:
                self.current_state = test_state
                done = False
                # Return reward for step
                return self.state_as_int(self.current_state), self.grid[self.current_state[0]][
                    self.current_state[1]], done,

        print(self.current_state)

    def state_as_int(self, state: list) -> int:
        return (state[0] * self.size) + state[1]

    def visualise(self):
        print(self.current_state, self.goal_idx)
        # Print in matrix format - Rest of program still uses lists
        print(np.matrix(self.grid))


# SARSA implementation

In [7]:
# Hyperparameters
episodes = 9000
max_steps = 100
alpha = 0.8
gamma = 0.9

# Initializing the reward
reward = 0

state_space = 10
action_space = 4
grid = Gridworld(10)

q_values = np.zeros(((state_space*state_space), action_space))


In [10]:

# Function to learn the Q-value
def update(state, state2, reward, action, action2):
    predict = q_values[state, action]
    target = reward + gamma * q_values[state2, action2]
    q_values[state, action] = q_values[state, action] + alpha * (target - predict)


# Starting the SARSA learning
for episode in range(episodes):
    t = 0
    state = grid.reset()
    action = np.argmax(q_values[state, :])

    while t < max_steps:

        # Getting the next state
        new_state, reward, done = grid.step(action, state)

        # Choosing the next action
        new_action = np.argmax(q_values[new_state, :])

        # Updating the Q-value matrix
        update(state, new_state, reward, action, new_action)

        state = new_state
        action = new_action

        # Updating the respective vaLues
        t += 1
        #reward += 1

        # If at the end of learning process
        if done:
            break


TypeError: 'int' object is not iterable

15