In [1]:
import numpy as np
import random
import time
import matplotlib
import pandas as pd
import sys
import plotting
import numpy as np

In [2]:
class environment:
    """A 4x4 grid world bundling values, action weights, a policy and rewards.

    Everything is built once in ``__init__``; the class has no other methods.
    """

    def __init__(self):
        # Grid size and the action labels. An action's index is its position
        # in this list.
        self.env_rows, self.env_columns = 4, 4
        self.actions = ['up', 'right', 'down', 'left']
        grid_shape = (self.env_rows, self.env_columns)

        # Per-cell value table, initially all zeros.
        self.V = np.zeros(grid_shape)

        # P[row, column] holds one weight per action, in the order of
        # self.actions (called "transition probability" in the original
        # notes). The all-zero entries sit exactly on the cells that get a
        # special reward below: (0, 2) and the four fire cells.
        self.P = np.array([
            # row 0
            [[0, 0.5, 0.5, 0],
             [0, 0.33, 0.33, 0.33],
             [0, 0, 0, 0],
             [0, 0, 0.5, 0.5]],
            # row 1
            [[0.333, 0.333, 0.333, 0],
             [0, 0, 0, 0],
             [0.25, 0.25, 0.25, 0.25],
             [0, 0, 0, 0]],
            # row 2
            [[0.333, 0.333, 0.333, 0],
             [0.25, 0.25, 0.25, 0.25],
             [0.25, 0.25, 0.25, 0.25],
             [0.333, 0, 0.333, 0.333]],
            # row 3
            [[0, 0, 0, 0],
             [0.333, 0.333, 0, 0.333],
             [0, 0, 0, 0],
             [0.333, 0, 0.333, 0.333]],
        ])

        # Starting policy: for every cell, the index of the first
        # largest weight in P, stored as floats.
        self.policy = self.P.argmax(axis=2).astype(float)

        # Rewards: -1 everywhere, 100 at (0, 2), -100 on the fire cells.
        self.rewards = np.full(grid_shape, -1)
        self.rewards[0, 2] = 100

        # fire maps a row index to the column indices that are on fire.
        self.fire = {0: [], 1: [1, 3], 2: [], 3: [0, 2]}
        for fire_row in range(self.env_rows):
            for fire_column in self.fire[fire_row]:
                self.rewards[fire_row, fire_column] = -100

In [4]:
# Build the grid world. get_next_location reads this module-level instance
# (env.actions, env.env_rows, env.env_columns).
env = environment()

In [5]:
# Module-level aliases read by value_iteration. These bind the very same
# arrays that `env` holds (no copies are made), so in-place writes through
# V or policy are also visible as env.V / env.policy.
V = env.V
policy = env.policy
probability = env.P
rewards = env.rewards

In [3]:
def parameters():
    """Return the learning settings plus two per-episode bookkeeping arrays.

    Returns:
        tuple: ``(epsilon, discount_factor, learning_rate, num_episodes,
        episode_lengths, episode_rewards)``. The first three are floats,
        ``num_episodes`` is an int, and the last two are separate zero-filled
        arrays with one slot per episode.
    """
    n_episodes = 1000
    # Share of the time the best action should be taken instead of a random one.
    greedy_share = 0.9
    # Discount applied to future rewards (gamma).
    gamma = 0.2
    # Rate at which the agent should learn.
    step_size = 0.8
    # Two independent arrays, one for lengths and one for rewards.
    lengths_log, rewards_log = (np.zeros(n_episodes) for _ in range(2))
    return greedy_share, gamma, step_size, n_episodes, lengths_log, rewards_log

In [6]:
# Unpack the settings into module-level names; value_iteration reads
# discount_factor from here.
epsilon, discount_factor, learning_rate, num_episodes, episode_lengths, episode_rewards = parameters()

In [7]:
def get_next_location(current_row_index, current_column_index, action_index):
    """Return the cell reached by taking an action from the given cell.

    The action label is looked up as ``env.actions[action_index]`` on the
    module-level ``env``. A move that would leave the grid (bounded by
    ``env.env_rows`` and ``env.env_columns``) leaves the position unchanged.

    Args:
        current_row_index: Row of the starting cell.
        current_column_index: Column of the starting cell.
        action_index: Index into ``env.actions``.

    Returns:
        tuple: ``(new_row_index, new_column_index)``.
    """
    direction = env.actions[action_index]
    if direction == 'up':
        if current_row_index > 0:
            return current_row_index - 1, current_column_index
    elif direction == 'right':
        if current_column_index < env.env_columns - 1:
            return current_row_index, current_column_index + 1
    elif direction == 'down':
        if current_row_index < env.env_rows - 1:
            return current_row_index + 1, current_column_index
    elif direction == 'left':
        if current_column_index > 0:
            return current_row_index, current_column_index - 1
    # Blocked by an edge (or an unrecognised label): stay where we are.
    return current_row_index, current_column_index

In [12]:
def value_iteration(theta=1e-9):
    """Sweep the grid repeatedly until the state values stop changing.

    Works on the module-level ``V``, ``policy``, ``probability``, ``rewards``
    and ``discount_factor``. In each sweep every cell scores each action as
    ``rewards[next] + discount_factor * V[next]``, where ``next`` is the
    single cell returned by ``get_next_location``. The best score is written
    to ``V`` and its action index to ``policy``. Values are updated in place,
    so cells visited later in a sweep already see the new values.

    NOTE(review): ``probability[i][j]`` is only used to count the actions of
    a cell; the weights themselves never enter the update. Zero-weight
    actions and cells whose weights are all zero are therefore still
    evaluated and updated. Confirm whether that is intended.

    Args:
        theta: Convergence threshold. The loop ends after the first sweep in
            which the largest absolute value change is below ``theta``.

    Returns:
        tuple: ``(V, policy)``, the same module-level arrays that were
        updated in place.
    """
    while True:
        delta = 0
        for i in range(len(V)):
            for j in range(len(V[i])):
                previous_value = V[i][j]
                # Collect scores in a plain list; growing a NumPy array with
                # np.append would copy it on every step.
                action_values = []
                for action in range(len(probability[i][j])):
                    new_row_index, new_column_index = get_next_location(i, j, action)
                    action_values.append(
                        rewards[new_row_index][new_column_index]
                        + discount_factor * V[new_row_index][new_column_index]
                    )
                # argmax keeps the first action when several tie for the best.
                optimal_action = np.argmax(action_values)
                policy[i][j] = optimal_action
                V[i, j] = action_values[optimal_action]

                delta = max(delta, np.abs(V[i, j] - previous_value))
        # Terminate if the value change is insignificant.
        if delta < theta:
            return V, policy

In [13]:
# Run value iteration until the largest per-sweep change drops below theta.
# The function returns the module-level arrays it updated in place, so V is
# rebound to the same array and optimal_policy is the same object as policy.
V, optimal_policy = value_iteration(theta=1e-9)

In [14]:
# Show the resulting policy. Each entry is an index into env.actions
# (0='up', 1='right', 2='down', 3='left'), stored as a float.
optimal_policy

array([[1., 1., 0., 3.],
       [0., 0., 0., 0.],
       [0., 1., 0., 3.],
       [0., 0., 0., 0.]])