# Install Libraries

In [None]:
!pip install gym

In [None]:
!git clone https://github.com/zhpinkman/armed-bandit.git

In [None]:
!pip install ./armed-bandit

# Import Libraries

In [1]:
from amalearn.reward import RewardBase
from amalearn.agent import AgentBase
from amalearn.environment import EnvironmentBase

import gym
from gym.spaces import Discrete, Box




In [9]:
import random
from copy import deepcopy
import matplotlib.pyplot as plt
import numpy as np
from prettytable import PrettyTable
import math

# Environment

In [13]:
class Environment(EnvironmentBase):
    def __init__(self, obstacle = [] ,id = 0, action_count=9, actionPrice = -1, stopActionPrice = -1, goalReward = 100
                 , punish=-10, j_limit = 10, i_limit = 10, p = 0.8, container=None):
        """
        initialize your variables
        """
        
        self.obstacle = obstacle
        
        self.x_min = 1
        self.x_max = i_limit
        
        self.y_min = 1
        self.y_max = j_limit
        
        self.reset()
        
        self.action_count = action_count
        self.actionPrice = actionPrice
        self.stopActionPrice = stopActionPrice
        self.goalReward = goalReward
        self.punish = punish
        self.p = p
        
        action_space = Discrete(action_count)
        state_space = Box(low=1, high=max(i_limit, j_limit), shape=(1,2), dtype=int)
        
        ######################
        ###     Action     ###
        ###                ###
        ### 0 1 2  ↖ ↑ ↗   ###
        ### 3 4 5  ← • →   ###
        ### 6 7 8  ↙ ↓ ↘  ###
        #####################
        
        #####################
        ###     State     ###
        ###               ###
        ###    (i, j)     ###
        #####################
        
        self.action_list = list(range(1,10))
        self.state_list = []
        
        for i in range(1, i_limit+1):
            for j in range(1, j_limit+1):
                self.state_list.append(np.array([i, j]))
        
        super(Environment, self).__init__(action_space=action_space, state_space=state_space, id=id ,container=container)

        
    def isStatePossible(self, state):
        """if given state is possible (not out of the grid and not obstacle) return ture"""
        if self.x_min <= state[0] <= self.x_max and self.y_min <= state[1] <= self.y_max:
            for obstacle_item in self.obstacle:
                if (state==obstacle_item).all():
                    return False
            return True
        else:
            return False
    
    
    def isAccessible(self, state, state_p):
        """if given state is Accesible (we can reach state_p by doing an action from state) return true"""
        return abs(state[0]-state_p[0]) <= 1 and abs(state[1] - state_p[1]) <= 1 and self.isStatePossible(state_p)
            
    def getTransitionStatesAndProbs(self, state, action, state_p):
        """return probability of transition or T(sp,a,s)"""
        
        actions = self.available_actions_state(state)
        
        if (self.calculate_next_state(state,action)==state_p).all():
            if self.isAccessible(state, state_p):
                return self.p + (1-self.p) / len(actions)
            else:
                return self.p
        else:
            if self.isAccessible(state, state_p):
                return (1-self.p) / len(actions)
            else:
                return 0

    def getReward(self, state, action, state_p):
        """return reward of transition"""
        
        if action == 4:
            action_price = self.stopActionPrice
        else:
            action_price = self.actionPrice
        
        if self.terminated_state(state_p):
            return action_price + self.goalReward
        
        if self.isStatePossible(state_p):
            return action_price
        else:
            return action_price + self.punish
        
    def sample_all_rewards(self):
        return 
    
    def calculate_reward(self, action):
        return self.getReward(self.current_state, action, self.calculate_next_state(self.current_state, action))

    def available_states_state(self, state):
        states = []
        for i in [-1, 0, +1]:
            for j in [-1, 0, +1]:
                new_state = np.array([state[0]+i, state[1]+j])
                
                if self.isAccessible(state, new_state):
                    states.append(new_state)
        return states
    
    def terminated(self):
        return self.terminated_state(self.current_state)
    
    def terminated_state(self, state):
        return (state==np.array([1,1])).all()
        
    def observe(self):
        return self.current_state 

    def available_actions(self):
        return self.available_actions_state(self.current_state)
    
    def available_actions_state(self, state):
        output_actions = []
        for action in range(self.action_count):
            next_state = self.calculate_next_state(state, action)
            
            if self.isAccessible(state, next_state):
                output_actions.append(action)
        
        return output_actions
        
    
    def calculate_next_state(self, state, action):
        return np.array([state[0] + (action%3 -1), state[1] + (int(action/3)-1) ])
        
    def next_state(self, action):
        actions = self.available_actions()
        
        if action not in actions:
            actions.append(action)
                
        probabilities = []
                
        for action2 in actions:
            state2 = self.calculate_next_state(self.current_state, action2)
            probabilities.append(self.getTransitionStatesAndProbs(self.current_state, action, state2))
        
        final_action = random.choices(population=actions, weights=probabilities, k=1)[0]
        
        real_next_state = self.calculate_next_state(self.current_state, final_action)
        
        if not self.isStatePossible(real_next_state):
            real_next_state = self.current_state
        
        self.last_action = action
        
        self.sliped = not (final_action==action)
        
        self.current_state = real_next_state
        
        return

    def reset(self):
        self.current_state = np.array([15, 15])
        
        self.last_action = None
        self.sliped = None

    def render(self, mode='human'):
        print(f"{self.current_state} \t {self.last_action} \t {self.sliped}")
        return 

    def close(self):
        return

In [14]:
grid_states =  [np.array([7, 1]), np.array([8, 1]), np.array([7, 2]), np.array([8, 2])
                ,np.array([7, 3]), np.array([8, 3]), np.array([7, 4]), np.array([8, 4])
                ,np.array([13, 8]), np.array([14, 8]), np.array([15, 8])
                ,np.array([13, 9]), np.array([14, 9]), np.array([15, 9])
                ,np.array([6, 12]), np.array([7, 12]), np.array([6, 13]), np.array([7, 13])
                ,np.array([6, 14]), np.array([7, 14]), np.array([6, 15]), np.array([7, 15])]

# Agent

In [32]:
import numpy as np

class Agent(AgentBase):
    def __init__(self, id, environment, discount, theta):
        self.environment = environment
        
        # init V
        self.V = {}
        
        # init policy
        self.policy = {}
        
        super(Agent, self).__init__(id, environment)
        
        self.discount = discount
        
        self.theta = theta

        # initialize a random policy and V(s) = 0 for each state        
        self.value_initialization()
        
        self.policy_initialization()
    
    def value_initialization(self):
        for state in self.environment.state_list:
            self.V[tuple(state)] = 0
        
    def policy_initialization(self):
        for state in self.environment.state_list:
            self.policy[tuple(state)] = random.choice(self.environment.action_list)
        
    def policy_evaluation(self):
        new_V = {}

        delta = 0
        for state in self.environment.state_list:
            
            available_states  = self.environment.available_states_state(state)

            action = self.policy[tuple(state)]
            
            sum = 0
            
            for state_p in available_states:
                p_sp = self.environment.getTransitionStatesAndProbs(state, action, state_p)
                r_sp = self.environment.getReward(state, action, state_p)
                v_sp = self.V[tuple(state_p)]

                sum += p_sp * (r_sp + self.discount * v_sp)

            new_V[tuple(state)] = sum
            delta = max(delta, abs(self.V[tuple(state)] - new_V[tuple(state)]))
        
        print(f"delta = {round(delta, 2)}")
        self.V = deepcopy(new_V)

        if delta < self.theta["delta_treshold"]:
            print("Value convergenced")

        return delta < self.theta["delta_treshold"]
    
    def policy_improvement(self):
        policy_stable = True
        
        new_policy = {}
        
        for state in self.environment.state_list:

            available_actions = self.environment.available_actions_state(state)
            available_states  = self.environment.available_states_state(state)
            
            max_value = -math.inf
            argmax = None
            
            for action in available_actions:
                sum = 0
                for state_p in available_states:
                    p_sp = self.environment.getTransitionStatesAndProbs(state, action, state_p)
                    v_sp = self.V[tuple(state_p)]

                    sum += p_sp * v_sp
            
                if sum > max_value:
                    max_value = sum
                    argmax = action
                    
            new_policy[tuple(state)] = argmax
            
            if self.policy[tuple(state)] != argmax:
                policy_stable = False
        
        self.policy = deepcopy(new_policy)
        if policy_stable:
            print("Policy convergenced")
            
        return policy_stable
    
    
    def policy_iteration(self):
        for iter in range(self.theta["max_iter"]):
            print(f"iter = {iter} -> ", end="")

            flag_value = self.policy_evaluation()

            flag_policy = self.policy_improvement()

            if flag_value or flag_policy:
                break
                
    def value_iteration(self):
        for iter in range(self.theta["max_iter"]):
            new_V = {}

            delta = 0
            for state in self.environment.state_list:
                new_V[tuple(state)] = -math.inf

                available_actions = self.environment.available_actions_state(state)
                available_states  = self.environment.available_states_state(state)

                for action in available_actions:
                    sum = 0
                    for state_p in available_states:
                        p_sp = self.environment.getTransitionStatesAndProbs(state, action, state_p)
                        r_sp = self.environment.getReward(state, action, state_p)
                        v_sp = self.V[tuple(state_p)]

                        sum += p_sp * (r_sp + self.discount * v_sp)

                    new_V[tuple(state)] = max(new_V[tuple(state)], sum)
                delta = max(delta, abs(self.V[tuple(state)] - new_V[tuple(state)]))
                
            print(f"iter = {iter} -> delta = {round(delta, 2)}")
            self.V = deepcopy(new_V)

            if delta < self.theta["delta_treshold"]:
                print("Value convergenced")
                break
        
        self.policy_extraction()
    
    def policy_extraction(self):
        for state in self.environment.state_list:

            available_actions = self.environment.available_actions_state(state)
            available_states  = self.environment.available_states_state(state)
            
            max_value = -math.inf
            argmax = None
            
            for action in available_actions:
                sum = 0
                for state_p in available_states:
                    p_sp = self.environment.getTransitionStatesAndProbs(state, action, state_p)
                    v_sp = self.V[tuple(state_p)]

                    sum += p_sp * v_sp
            
                if sum > max_value:
                    max_value = sum
                    argmax = action
                    
            self.policy[tuple(state)] = argmax
    
    def print_value(self):
        p = PrettyTable()

        for j in range(1,16):
            row = []
            for i in range(1, 16):
                if self.environment.isStatePossible(np.array([i,j])):
                    row.append(int(self.V[(i,j)]))
                else:
                    row.append("###")
                
            p.add_row(row)

        print (p.get_string(header=False, border=True))
    
    def print_policy(self):
        p = PrettyTable()

        for j in range(1,16):
            row = []
            for i in range(1, 16):
                if self.environment.isStatePossible(np.array([i,j])):
                    row.append(self.action_symbol(int(self.policy[(i,j)])))
                else:
                    row.append("▮")
                
                
            p.add_row(row)

        print (p.get_string(header=False, border=True))
    
    def take_action(self) -> (object, float, bool, object):
        # observation, reward, done, info
        return self.environment.step(random.choice(self.environment.action_list))
    
    def action_symbol(self, action):
        if action == 0:
            return "↖"
        elif action == 1:
            return "↑"
        elif action == 2:
            return "↗"
        elif action == 3:
            return "←"
        elif action == 4:
            return "•"
        elif action == 5:
            return "→"
        elif action == 6:
            return "↙"
        elif action == 7:
            return "↓"
        elif action == 8:
            return "↘"

# Run

## Part 1

In [57]:
base_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = -0.01, stopActionPrice = -0.01
                               ,goalReward = 1000, punish=-1, j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 0.001}
agent = Agent(id=0, environment=base_environment, discount=0.9, theta=theta)

agent.policy_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 49.99
iter = 1 -> delta = 842.24
iter = 2 -> delta = 757.67
iter = 3 -> delta = 673.83
iter = 4 -> delta = 605.79
iter = 5 -> delta = 544.38
iter = 6 -> delta = 489.79
iter = 7 -> delta = 440.7
iter = 8 -> delta = 396.6
iter = 9 -> delta = 356.93
iter = 10 -> delta = 321.23
iter = 11 -> delta = 289.1
iter = 12 -> delta = 260.19
iter = 13 -> delta = 234.17
iter = 14 -> delta = 210.75
iter = 15 -> delta = 189.68
iter = 16 -> delta = 170.71
Policy convergenced
Optimal Value
+------+------+------+------+------+------+------+------+------+------+------+-----+-----+-----+-----+
| 6833 | 6755 | 5750 | 4869 | 4098 | 3479 | ###  | ###  | 766  | 765  | 750  | 707 | 501 | 316 | 178 |
| 6755 | 6701 | 5727 | 4865 | 4097 | 3479 | ###  | ###  | 1038 | 1030 | 1000 | 748 | 518 | 319 | 179 |
| 5750 | 5727 | 5628 | 4826 | 4090 | 3475 | ###  | ###  | 1382 | 1365 | 1053 | 771 | 522 | 321 | 179 |
| 4868 | 4865 | 4826 | 4701 | 4038 | 3437 | ###  | ###  | 1795 | 1412 | 1076 | 775 | 523 | 3

## Part 2

In [58]:
without_friction_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = 0
                                           ,stopActionPrice = 0, goalReward = 1000 , punish=-0.01
                                           , j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 0.001}
agent = Agent(id=0, environment=without_friction_environment, discount=0.9, theta=theta)

agent.policy_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 833.33
iter = 1 -> delta = 642.25
iter = 2 -> delta = 787.22
Policy convergenced
Optimal Value
+------+------+-----+-----+---+-----+-----+-----+---+---+---+---+-----+-----+-----+
| 1467 | 1448 | 552 | 487 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
| 1412 | 1399 | 528 | 475 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
| 516  | 516  | 487 | 456 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  20  |  19  |  18 |  12 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0   |  0   |  0  |  0  | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0   |  0   |  0  |  0  | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0   |  0   |  0  |  0  | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0   |  0   |  0  |  0  | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 | ### | ### | ### |
|  0   |  0   |  0  |  0  | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 | ### | ### | ### |
|  0   |  0   |  0  |  0  | 0 |  0  |  0  |  

## Part 3

In [59]:
with_extreme_friction_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = -1
                                               ,stopActionPrice = -0.8 ,goalReward = 100 , punish=-10
                                               ,j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 3}
agent = Agent(id=0, environment=with_extreme_friction_environment, discount=0.9, theta=theta)

agent.policy_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 4.0
iter = 1 -> delta = 83.56
iter = 2 -> delta = 75.11
iter = 3 -> delta = 66.79
iter = 4 -> delta = 60.04
iter = 5 -> delta = 53.96
iter = 6 -> delta = 48.54
iter = 7 -> delta = 43.68
iter = 8 -> delta = 39.31
iter = 9 -> delta = 35.38
iter = 10 -> delta = 31.84
iter = 11 -> delta = 28.65
iter = 12 -> delta = 25.79
iter = 13 -> delta = 23.21
iter = 14 -> delta = 20.89
Policy convergenced
Optimal Value
+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+----+----+-----+-----+-----+
| 640 | 632 | 532 | 443 | 366 | 304 | ### | ### |  38 |  38 | 36 | 33 |  16 |  3  |  -3 |
| 632 | 627 | 529 | 443 | 366 | 304 | ### | ### |  63 |  62 | 59 | 36 |  17 |  3  |  -3 |
| 532 | 529 | 519 | 439 | 365 | 304 | ### | ### |  95 |  94 | 64 | 38 |  18 |  3  |  -3 |
| 443 | 443 | 439 | 426 | 360 | 300 | ### | ### | 136 |  98 | 66 | 39 |  18 |  3  |  -3 |
| 366 | 366 | 365 | 360 | 346 | 292 | 240 | 185 | 139 | 100 | 66 | 39 |  18 |  3  |  -3 |
| 297 | 297 | 297 | 296 | 290 | 2

## Part 4 - 1

In [60]:
base_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = -0.01, stopActionPrice = -0.01
                               ,goalReward = 1000, punish=-1, j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 0.001}
agent = Agent(id=0, environment=base_environment, discount=0.75, theta=theta)

agent.policy_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 849.99
iter = 1 -> delta = 1333.46
iter = 2 -> delta = 867.61
iter = 3 -> delta = 564.09
iter = 4 -> delta = 367.09
iter = 5 -> delta = 248.23
iter = 6 -> delta = 186.17
iter = 7 -> delta = 122.42
iter = 8 -> delta = 91.81
iter = 9 -> delta = 66.3
iter = 10 -> delta = 49.67
iter = 11 -> delta = 36.98
iter = 12 -> delta = 27.73
iter = 13 -> delta = 20.77
iter = 14 -> delta = 15.58
Policy convergenced
Optimal Value
+------+------+------+------+------+-----+-----+-----+-----+-----+-----+----+-----+-----+-----+
| 3317 | 3252 | 2296 | 1618 | 1137 | 817 | ### | ### |  62 |  62 |  60 | 56 |  32 |  14 |  5  |
| 3252 | 3207 | 2278 | 1616 | 1137 | 817 | ### | ### | 101 | 100 |  96 | 60 |  33 |  15 |  5  |
| 2296 | 2278 | 2211 | 1592 | 1133 | 816 | ### | ### | 161 | 158 | 103 | 63 |  34 |  15 |  5  |
| 1618 | 1616 | 1592 | 1522 | 1108 | 802 | ### | ### | 248 | 165 | 106 | 63 |  34 |  15 |  5  |
| 1136 | 1136 | 1132 | 1107 | 1045 | 771 | 558 | 379 | 254 | 168 | 107 | 63 |  34 |

In [61]:
base_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = -0.01, stopActionPrice = -0.01
                               ,goalReward = 1000, punish=-1, j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 0.001}
agent = Agent(id=0, environment=base_environment, discount=0.5, theta=theta)

agent.policy_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 849.99
iter = 1 -> delta = 1155.64
iter = 2 -> delta = 501.16
iter = 3 -> delta = 217.16
iter = 4 -> delta = 94.19
iter = 5 -> delta = 42.44
iter = 6 -> delta = 21.22
iter = 7 -> delta = 8.48
iter = 8 -> delta = 4.14
iter = 9 -> delta = 1.98
iter = 10 -> delta = 0.96
iter = 11 -> delta = 0.48
iter = 12 -> delta = 0.24
iter = 13 -> delta = 0.12
iter = 14 -> delta = 0.06
Policy convergenced
Optimal Value
+------+------+-----+-----+-----+-----+-----+-----+---+---+---+---+-----+-----+-----+
| 1691 | 1644 | 748 | 340 | 155 |  73 | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
| 1644 | 1612 | 737 | 339 | 154 |  73 | ### | ### | 1 | 1 | 1 | 0 |  0  |  0  |  0  |
| 748  | 737  | 705 | 330 | 154 |  73 | ### | ### | 2 | 2 | 1 | 0 |  0  |  0  |  0  |
| 340  | 339  | 330 | 309 | 148 |  71 | ### | ### | 6 | 2 | 1 | 0 |  0  |  0  |  0  |
| 155  | 154  | 154 | 148 | 135 |  66 |  32 |  14 | 6 | 2 | 1 | 0 |  0  |  0  |  0  |
|  70  |  70  |  70 |  69 |  66 |  59 |  29 |  14 | 6 | 2 

In [62]:
base_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = -0.01, stopActionPrice = -0.01
                               ,goalReward = 1000, punish=-1, j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 0.001}
agent = Agent(id=0, environment=base_environment, discount=0.1, theta=theta)

agent.policy_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 833.32
iter = 1 -> delta = 730.2
iter = 2 -> delta = 742.21
iter = 3 -> delta = 74.06
iter = 4 -> delta = 7.31
iter = 5 -> delta = 0.73
iter = 6 -> delta = 0.07
iter = 7 -> delta = 0.01
iter = 8 -> delta = 0.0
Value convergenced
Optimal Value
+-----+-----+----+---+---+-----+-----+-----+---+---+---+---+-----+-----+-----+
| 943 | 921 | 80 | 7 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
| 921 | 906 | 78 | 6 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  80 |  78 | 75 | 6 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  7  |  6  | 6  | 6 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 | ### | ### | ### |
|  0  |  0  | 0  | 0 | 0 |

## Part 4 - 2

In [67]:
without_friction_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = 0
                                           ,stopActionPrice = 0, goalReward = 1000 , punish=-0.01
                                           , j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 0.001}
agent = Agent(id=0, environment=without_friction_environment, discount=0.99, theta=theta)

agent.policy_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 50.0
iter = 1 -> delta = 846.48
Policy convergenced
Optimal Value
+-----+-----+----+---+---+-----+-----+-----+---+---+---+---+-----+-----+-----+
| 896 | 877 | 28 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
| 877 | 864 | 27 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  28 |  27 | 18 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 | ### | ### | ### |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 | ### | ### | ### |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 

In [63]:
without_friction_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = 0
                                           ,stopActionPrice = 0, goalReward = 1000 , punish=-0.01
                                           , j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 0.001}
agent = Agent(id=0, environment=without_friction_environment, discount=0.75, theta=theta)

agent.policy_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 50.0
iter = 1 -> delta = 835.21
Policy convergenced
Optimal Value
+-----+-----+----+---+---+-----+-----+-----+---+---+---+---+-----+-----+-----+
| 885 | 866 | 21 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
| 866 | 854 | 20 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  21 |  20 | 13 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 | ### | ### | ### |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 | ### | ### | ### |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 

In [64]:
without_friction_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = 0
                                           ,stopActionPrice = 0, goalReward = 1000 , punish=-0.01
                                           , j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 0.001}
agent = Agent(id=0, environment=without_friction_environment, discount=0.5, theta=theta)

agent.policy_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 50.0
iter = 1 -> delta = 823.47
Policy convergenced
Optimal Value
+-----+-----+----+---+---+-----+-----+-----+---+---+---+---+-----+-----+-----+
| 873 | 855 | 14 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
| 855 | 843 | 13 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  14 |  13 | 9  | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 | ### | ### | ### |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 | ### | ### | ### |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0  | 0 | 0 |  0  |  0  |  0  | 

In [65]:
without_friction_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = 0
                                           ,stopActionPrice = 0, goalReward = 1000 , punish=-0.01
                                           , j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 0.001}
agent = Agent(id=0, environment=without_friction_environment, discount=0.1, theta=theta)

agent.policy_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 50.0
iter = 1 -> delta = 804.69
Policy convergenced
Optimal Value
+-----+-----+---+---+---+-----+-----+-----+---+---+---+---+-----+-----+-----+
| 854 | 837 | 2 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
| 837 | 826 | 2 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  2  |  2  | 1 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0 | 0 | 0 |  0  | ### | ### | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0 | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0 | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0 | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0 | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 | ### | ### | ### |
|  0  |  0  | 0 | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 | ### | ### | ### |
|  0  |  0  | 0 | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 0 |  0  |  0  |  0  |
|  0  |  0  | 0 | 0 | 0 |  0  |  0  |  0  | 0 | 0 | 0 | 

## Part 5

In [56]:
base_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = -0.01, stopActionPrice = -0.01
                               ,goalReward = 1000, punish=-1, j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 3}
agent = Agent(id=0, environment=base_environment, discount=0.9, theta=theta)

agent.value_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 849.99
iter = 1 -> delta = 762.24
iter = 2 -> delta = 677.75
iter = 3 -> delta = 609.21
iter = 4 -> delta = 547.43
iter = 5 -> delta = 492.53
iter = 6 -> delta = 443.16
iter = 7 -> delta = 398.81
iter = 8 -> delta = 358.92
iter = 9 -> delta = 323.02
iter = 10 -> delta = 290.71
iter = 11 -> delta = 261.64
iter = 12 -> delta = 235.48
iter = 13 -> delta = 211.93
iter = 14 -> delta = 190.74
iter = 15 -> delta = 171.66
iter = 16 -> delta = 154.5
iter = 17 -> delta = 139.05
iter = 18 -> delta = 125.14
iter = 19 -> delta = 112.63
iter = 20 -> delta = 101.36
iter = 21 -> delta = 91.23
iter = 22 -> delta = 82.11
iter = 23 -> delta = 73.89
iter = 24 -> delta = 66.51
iter = 25 -> delta = 59.85
iter = 26 -> delta = 53.87
iter = 27 -> delta = 48.48
iter = 28 -> delta = 43.63
iter = 29 -> delta = 39.27
iter = 30 -> delta = 35.34
iter = 31 -> delta = 31.81
iter = 32 -> delta = 28.63
iter = 33 -> delta = 25.77
iter = 34 -> delta = 23.19
iter = 35 -> delta = 20.87
iter = 36 -> delta

In [69]:
without_friction_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = 0
                                           ,stopActionPrice = 0, goalReward = 1000 , punish=-0.01
                                           , j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 3}
agent = Agent(id=0, environment=without_friction_environment, discount=0.9, theta=theta)

agent.value_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 850.0
iter = 1 -> delta = 762.25
iter = 2 -> delta = 677.76
iter = 3 -> delta = 609.21
iter = 4 -> delta = 547.44
iter = 5 -> delta = 492.54
iter = 6 -> delta = 443.17
iter = 7 -> delta = 398.82
iter = 8 -> delta = 358.92
iter = 9 -> delta = 323.02
iter = 10 -> delta = 290.72
iter = 11 -> delta = 261.64
iter = 12 -> delta = 235.48
iter = 13 -> delta = 211.93
iter = 14 -> delta = 190.74
iter = 15 -> delta = 171.66
iter = 16 -> delta = 154.5
iter = 17 -> delta = 139.05
iter = 18 -> delta = 125.14
iter = 19 -> delta = 112.63
iter = 20 -> delta = 101.37
iter = 21 -> delta = 91.23
iter = 22 -> delta = 82.11
iter = 23 -> delta = 73.9
iter = 24 -> delta = 66.51
iter = 25 -> delta = 59.86
iter = 26 -> delta = 53.87
iter = 27 -> delta = 48.48
iter = 28 -> delta = 43.63
iter = 29 -> delta = 39.27
iter = 30 -> delta = 35.34
iter = 31 -> delta = 31.81
iter = 32 -> delta = 28.63
iter = 33 -> delta = 25.77
iter = 34 -> delta = 23.19
iter = 35 -> delta = 20.87
iter = 36 -> delta =

In [70]:
with_extreme_friction_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = -1
                                               ,stopActionPrice = -0.8 ,goalReward = 100 , punish=-10
                                               ,j_limit = 15, i_limit = 15, p = 0.8, container=None)

theta = {"max_iter": 50, "delta_treshold": 3}
agent = Agent(id=0, environment=with_extreme_friction_environment, discount=0.9, theta=theta)

agent.value_iteration()

print("Optimal Value")
agent.print_value()

print("\nOptimal Policy")
agent.print_policy()

iter = 0 -> delta = 84.2
iter = 1 -> delta = 75.48
iter = 2 -> delta = 67.11
iter = 3 -> delta = 60.31
iter = 4 -> delta = 54.2
iter = 5 -> delta = 48.76
iter = 6 -> delta = 43.87
iter = 7 -> delta = 39.48
iter = 8 -> delta = 35.53
iter = 9 -> delta = 31.98
iter = 10 -> delta = 28.78
iter = 11 -> delta = 25.9
iter = 12 -> delta = 23.31
iter = 13 -> delta = 20.98
iter = 14 -> delta = 18.88
iter = 15 -> delta = 16.99
iter = 16 -> delta = 15.3
iter = 17 -> delta = 13.77
iter = 18 -> delta = 12.39
iter = 19 -> delta = 11.15
iter = 20 -> delta = 10.04
iter = 21 -> delta = 9.03
iter = 22 -> delta = 8.13
iter = 23 -> delta = 7.32
iter = 24 -> delta = 6.58
iter = 25 -> delta = 5.93
iter = 26 -> delta = 5.33
iter = 27 -> delta = 4.8
iter = 28 -> delta = 4.32
iter = 29 -> delta = 3.89
iter = 30 -> delta = 3.5
iter = 31 -> delta = 3.15
iter = 32 -> delta = 2.83
Value convergenced
Optimal Value
+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 803 | 795 