# Install Libraries

In [None]:
!pip install gym

In [None]:
!git clone https://github.com/zhpinkman/armed-bandit.git

In [None]:
!pip install ./armed-bandit

# Impor Libraries

In [1]:
from amalearn.reward import RewardBase
from amalearn.agent import AgentBase

In [2]:
from amalearn.environment import EnvironmentBase
import gym




In [471]:
import random
from copy import deepcopy
import matplotlib.pyplot as plt
import numpy as np
from prettytable import PrettyTable

# Environment

In [437]:
from gym.spaces import Discrete, Box

# Action:
# 0 1 2
# 3 4 5
# 6 7 8

class Environment(EnvironmentBase):
    def __init__(self, obstacle = [] ,id = 0, action_count=9, actionPrice = -1, goalReward = 100
                 , punish=-10, j_limit = 10, i_limit = 10, p = 0.8, container=None):
        """
        initialize your variables
        """
        
        self.obstacle = obstacle
        
        self.x_min = 1
        self.x_max = i_limit
        
        self.y_min = 1
        self.y_max = j_limit
        
        self.reset()
        
        self.action_count = action_count
        self.actionPrice = actionPrice
        self.goalReward = goalReward
        self.punish = punish
        self.p = p
        
        action_space = Discrete(action_count)
        state_space = Box(low=1, high=max(i_limit, j_limit), shape=(1,2), dtype=int)
        
        self.action_list = list(range(1,10))
        self.state_list = []
        
        for i in range(1, i_limit+1):
            for j in range(1, j_limit+1):
                self.state_list.append(np.array([i, j]))
        
        super(Environment, self).__init__(action_space=action_space, state_space=state_space, id=id ,container=container)

        
    def isStatePossible(self, state):
        """if given state is possible (not out of the grid and not obstacle) return ture"""
        if self.x_min <= state[0] <= self.x_max and self.y_min <= state[1] <= self.y_max:
            for obstacle_item in self.obstacle:
                if (state==obstacle_item).all():
                    return False
            return True
        else:
            return False
    
    
    def isAccessible(self, state, state_p):
        """if given state is Accesible (we can reach state_p by doing an action from state) return true"""
        return abs(state[0]-state_p[0]) <= 1 and abs(state[1] - state_p[1]) <= 1 and self.isStatePossible(state_p)
            
    def getTransitionStatesAndProbs(self, state, action, state_p):
        """return probability of transition or T(sp,a,s)"""
        
        actions = self.available_actions_state(state)
        
        if (self.calculate_next_state(state,action)==state_p).all():
            if self.isAccessible(state, state_p):
                return self.p + (1-self.p) / len(actions)
            else:
                return self.p
        else:
            if self.isAccessible(state, state_p):
                return (1-self.p) / len(actions)
            else:
                return 0

    
    def getReward(self, state, action, state_p):
        """return reward of transition"""
        if self.terminated_state(state_p):
            return self.actionPrice + self.goalReward
        
        if self.isStatePossible(state_p):
            return self.actionPrice
        else:
            return self.actionPrice + self.punish
        
    def sample_all_rewards(self):
        return 
    
    def calculate_reward(self, action):
        return self.getReward(self.current_state, action, self.calculate_next_state(self.current_state, action))

    def available_states_state(self, state):
        states = []
        for i in [-1, 0, +1]:
            for j in [-1, 0, +1]:
                new_state = np.array([state[0]+i, state[1]+j])
                
                if self.isAccessible(state, new_state):
                    states.append(new_state)
        return states
    
    def terminated(self):
        return self.terminated_state(self.current_state)
    
    def terminated_state(self, state):
        return (state==np.array([1,1])).all()
        
    def observe(self):
        return self.current_state 

    def available_actions(self):
        return self.available_actions_state(self.current_state)
    
    def available_actions_state(self, state):
        output_actions = []
        for action in range(self.action_count):
            next_state = self.calculate_next_state(state, action)
            
            if self.isAccessible(state, next_state):
                output_actions.append(action)
        
        return output_actions
        
    
    def calculate_next_state(self, state, action):
        return np.array([state[0] + (action%3 -1), state[1] + (int(action/3)-1) ])
        
    def next_state(self, action):
        actions = self.available_actions()
        
        if action not in actions:
            actions.append(action)
                
        probabilities = []
                
        for action2 in actions:
            state2 = self.calculate_next_state(self.current_state, action2)
            probabilities.append(self.getTransitionStatesAndProbs(self.current_state, action, state2))
        
        final_action = random.choices(population=actions, weights=probabilities, k=1)[0]
        
        real_next_state = self.calculate_next_state(self.current_state, final_action)
        
        if not self.isStatePossible(real_next_state):
            real_next_state = self.current_state
        
        self.last_action = action
        
        self.sliped = not (final_action==action)
        
        self.current_state = real_next_state
        
        return

    def reset(self):
        self.current_state = np.array([15, 15])
        
        self.last_action = None
        self.sliped = None

    def render(self, mode='human'):
        print(f"{self.current_state} \t {self.last_action} \t {self.sliped}")
        return 

    def close(self):
        return

In [438]:
grid_states =  [np.array([7, 1]), np.array([8, 1]), np.array([7, 2]), np.array([8, 2])
                ,np.array([7, 3]), np.array([8, 3]), np.array([7, 4]), np.array([8, 4])
                ,np.array([13, 8]), np.array([14, 8]), np.array([15, 8])
                ,np.array([13, 9]), np.array([14, 9]), np.array([15, 9])
                ,np.array([6, 12]), np.array([7, 12]), np.array([6, 13]), np.array([7, 13])
                ,np.array([6, 14]), np.array([7, 14]), np.array([6, 15]), np.array([7, 15])]

In [439]:
base_environment = Environment(obstacle = grid_states ,id = 0, action_count=9, actionPrice = -1, goalReward = 100
                                , punish=-10, j_limit = 15, i_limit = 15, p = 0.8, container=None)

# Agent

In [496]:
import numpy as np

class Agent(AgentBase):
    def __init__(self, id, environment, discount, theta):
        
        # initialize a random policy and V(s) = 0 for each state
        self.environment = environment
        
        # mapp states to its ids
#         self.mapp = {}
        
        # init V
        self.V = {}
        
        # init policy
        self.policy = {}
        
        super(Agent, self).__init__(id, environment)
        
        self.discount = discount
        
        self.theta = theta
                
        self.value_initialization()
        
        self.policy_initialization()
    
    def value_initialization(self):
        for state in self.environment.state_list:
            self.V[tuple(state)] = 0
        
    def policy_initialization(self):
        for state in self.environment.state_list:
            self.policy[tuple(state)] = random.choice(self.environment.action_list)
        
    def policy_evaluation(self):
        pass
    
    def policy_improvement(self):
        pass
    
    def value_iteration(self):
        for iter in range(self.theta["max_iter"]):
            new_V = {}

            delta = 0
            for state in self.environment.state_list:
                new_V[tuple(state)] = -math.inf

                available_actions = self.environment.available_actions_state(state)
                available_states  = self.environment.available_states_state(state)

                for action in available_actions:
                    sum = 0
                    for state_p in available_states:
                        p_sp = self.environment.getTransitionStatesAndProbs(state, action, state_p)
                        r_sp = self.environment.getReward(state, action, state_p)
                        v_sp = self.V[tuple(state_p)]

                        sum += p_sp * (r_sp + self.discount * v_sp)

                    new_V[tuple(state)] = max(new_V[tuple(state)], sum)
                delta = max(delta, abs(self.V[tuple(state)] - new_V[tuple(state)]))
                
            print(f"iter = {iter} -> delta = {round(delta, 2)}")
            self.V = deepcopy(new_V)

            if delta < self.theta["delta_treshold"]:
                break
    
    def policy_extraction(self):
        for state in self.environment.state_list:

            available_actions = self.environment.available_actions_state(state)
            available_states  = self.environment.available_states_state(state)
            
            max_value = -math.inf
            argmax = None
            
            for action in available_actions:
                sum = 0
                for state_p in available_states:
                    p_sp = self.environment.getTransitionStatesAndProbs(state, action, state_p)
                    v_sp = self.V[tuple(state_p)]

                    sum += p_sp * v_sp
            
                if (state==np.array([1,1])).all():
                    print(action, sum)
            
                if sum > max_value:
                    max_value = sum
                    argmax = action
                    
            self.policy[tuple(state)] = action
    
    def print_value(self):
        p = PrettyTable()

        for j in range(1,16):
            row = []
            for i in range(1, 16):
                row.append(int(self.V[(i,j)]))
            p.add_row(row)

        print (p.get_string(header=False, border=True))
    
    def print_policy(self):
        p = PrettyTable()

        for j in range(1,16):
            row = []
            for i in range(1, 16):
                row.append(int(self.policy[(i,j)]))
            p.add_row(row)

        print (p.get_string(header=False, border=True))
    
    def take_action(self) -> (object, float, bool, object):
        # in this method, you MUST call the `step` method of 
        # the environment and observe the results and return them like:
        # return observation, reward, done, info

        return self.environment.step(random.choice(self.environment.action_list))

In [497]:
theta = {"max_iter": 5, "delta_treshold": 5}
agent = Agent(id=0, environment=base_environment, discount=0.9, theta=theta)

In [None]:
agent.value_iteration()

iter = 0 -> delta = 84.0
iter = 1 -> delta = 75.33


In [None]:
agent.print_value()

In [None]:
agent.policy_extraction()

In [None]:
agent.print_privacy()

In [486]:
from prettytable import PrettyTable

p = PrettyTable()

for j in range(1,16):
    row = []
    for i in range(1, 16):
        row.append(int(agent.V[(i,j)]))
    p.add_row(row)

print (p.get_string(header=False, border=True))

+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 783 | 776 | 675 | 587 | 510 | 448 | 398 | 174 | 174 | 174 | 173 | 168 | 145 | 123 | 104 |
| 776 | 770 | 673 | 587 | 510 | 448 | 398 | 203 | 203 | 202 | 199 | 172 | 147 | 123 | 104 |
| 675 | 673 | 663 | 583 | 509 | 448 | 398 | 239 | 238 | 236 | 204 | 175 | 148 | 124 | 104 |
| 587 | 587 | 583 | 570 | 504 | 444 | 390 | 326 | 279 | 241 | 207 | 175 | 148 | 124 | 105 |
| 509 | 509 | 509 | 504 | 490 | 436 | 384 | 329 | 283 | 243 | 207 | 175 | 148 | 124 | 105 |
| 441 | 441 | 441 | 440 | 434 | 421 | 375 | 330 | 284 | 243 | 207 | 176 | 148 | 125 | 106 |
| 382 | 382 | 382 | 381 | 380 | 374 | 360 | 322 | 283 | 243 | 207 | 176 | 150 | 125 | 106 |
| 329 | 329 | 329 | 329 | 328 | 327 | 321 | 308 | 275 | 242 | 207 | 177 | 151 | 125 | 106 |
| 283 | 283 | 283 | 283 | 283 | 282 | 280 | 274 | 262 | 234 | 205 | 177 | 151 | 125 | 105 |
| 242 | 242 | 242 | 242 | 242 | 242 | 241 | 239 | 234 | 222 | 199 | 174 | 149 | 

In [487]:
from prettytable import PrettyTable

p = PrettyTable()

for j in range(1,16):
    row = []
    for i in range(1, 16):
        row.append(int(agent.policy[(i,j)]))
    p.add_row(row)

print (p.get_string(header=False, border=True))

+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
| 8 | 8 | 8 | 8 | 8 | 7 | 6 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 8 | 8 | 8 | 8 | 8 | 7 | 6 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 8 | 8 | 8 | 8 | 8 | 7 | 6 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 | 6 | 5 | 4 |
| 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 | 6 | 2 | 1 |
| 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 8 | 8 | 8 | 8 | 7 | 6 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 8 | 8 | 8 | 8 | 7 | 6 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 8 | 8 | 8 | 8 | 7 | 6 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 8 | 8 | 8 | 8 | 7 | 6 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 7 |
| 5 | 5 | 5 | 5 | 4 | 3 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 4 |
+---+---