## Cristian Camilo Moreno Mojica

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/usr/local/lib/python3.9/site-packages')
import gym
from off_policy import OffPolicyControl
import numpy as np

# Gridworld

In [3]:
from grid import MDPs

In [4]:
locked_cell = {(2,1):'*',
               (2,2):'*',
               (2,3):'*',
               (2,4):'*',
               (2,6):'*',
               (2,7):'*',
               (2,8):'*',
               (3,4):'*',
               (4,4):'*',
               (5,4):'*',
               (6,4):'*',
               (7,4):'*'
              }
reward = {(4,5):-1,
         (5,5):1,
         (7,5):-1,
         (7,6):-1}

In [5]:
mdp = MDPs( dimensions=(10,10),
      locked_cell=locked_cell,
      initial_rewards=reward,
      chance_moving={'down':0.2,
                    'left':0.2,
                    'up':0.3,
                    'right':0.3},
      be_same_place_bool=True,
      win_cell=(5,5),
      reward_move={'up': -1,
                   'left': -1,
                   'right': -1,
                   'down': -1})

In [6]:
mdp.board

array([[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '],
       [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '],
       [' ', '*', '*', '*', '*', ' ', '*', '*', '*', ' '],
       [' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' '],
       [' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' '],
       [' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' '],
       [' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' '],
       [' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' '],
       [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '],
       [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']], dtype='<U1')

In [7]:
q = np.zeros((100, 4))

In [8]:
off_policy_gridworld = OffPolicyControl(env=mdp,
                              epsilon=0.5,
                              Q=q,
                              alpha=0.001,
                              gamma=1,
                             )

In [9]:
off_policy_gridworld.iter_episode(episodes=100)

In [10]:
print(off_policy_gridworld.Q)

[[ 1.49004980e+00  1.19461537e+00  1.29352076e+00  5.73689531e+00]
 [-1.00000000e-03 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.00000000e-03]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.99900000e-03]
 [-1.00000000e-03 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [-1.00000000e-03 -1.99900000e-03 -1.00000000e-03  0.00000000e+00]
 [-1.00000000e-03 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [-1.00000000e-03 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [-1.00000000e-03 -1.00000000e-03 -1.99900000e-03  0.00000000e+00]
 [ 0.00000000e+00 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [-1.00000000e-03  0.00000000e+00  0.00000000e+00 -1.99900000e-03]
 [ 0.00000000e+00 -1.00000000e-03 -1.00000000e-03 -1.99900000e-03]
 [-1.00000000e-03 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [-1.00000000e-03 -1.00000000e-03 -1.00000000e-03  0.00000000e+00]
 [-1.99900000e-03  0.00000000e+00  0.00000000e+00 -1.00000000e

# Laberinto de cuartos


In [11]:
locked_cell = {(0,0):'*',
(0,1):'*',
(0,2):'*',
(0,4):'*',
(0,5):'*',
(0,6):'*',
(0,7):'*',
(0,8):'*',
(0,9):'*',
(0,10):'*',
(0,11):'*',
(0,12):'*',
(6,0):'*',
(6,1):'*',
(6,2):'*',
(6,4):'*',
(6,5):'*',
(6,6):'*',
(6,7):'*',
(6,8):'*',
(6,10):'*',
(6,11):'*',
(6,12):'*',
(12,0):'*',
(12,1):'*',
(12,2):'*',
(12,3):'*', 
(12,4):'*',
(12,5):'*',
(12,6):'*',
(12,7):'*',
(12,8):'*',
(12,9):'*',
(12,10):'*',
(12,11):'*',
(12,12):'*',
(1,0):'*',
(2,0):'*',
(3,0):'*',
(4,0):'*',
(5,0):'*',
(6,0):'*',
(7,0):'*',
(8,0):'*',
(9,0):'*',
(10,0):'*',
(11,0):'*',
(12,0):'*',
(0,6):'*',
(1,6):'*',
(2,6):'*',
(4,6):'*',
(5,6):'*',
(6,6):'*',
(7,6):'*',
(8,6):'*',
(10,6):'*',
(11,6):'*',
(12,6):'*',
(0,12):'*',
(1,12):'*',
(2,12):'*',
(3,12):'*',
(4,12):'*',
(5,12):'*',
(6,12):'*',
(7,12):'*',
(8,12):'*',
(9,12):'*',
(10,12):'*',
(11,12):'*',
(12,12):'*'
              }
reward = {(0,3):10,
         (1,3):2,
         (2,3):1,
         (3,3):1}

In [12]:
maze = MDPs( dimensions=(13,13),
      locked_cell=locked_cell,
      initial_rewards=reward,
      chance_moving={'down':0.2,
                    'left':0.2,
                    'up':0.3,
                    'right':0.3},
      be_same_place_bool=True,
      win_cell=(0,3),
      reward_move={'up': -1,
                   'left': -1,
                   'right': -1,
                   'down': -1},
    initial_position=(10,9))

In [13]:
maze.board

array([['*', '*', '*', ' ', '*', '*', '*', '*', '*', '*', '*', '*', '*'],
       ['*', ' ', ' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' ', '*'],
       ['*', ' ', ' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' ', '*'],
       ['*', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '*'],
       ['*', ' ', ' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' ', '*'],
       ['*', ' ', ' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' ', '*'],
       ['*', '*', '*', ' ', '*', '*', '*', '*', '*', ' ', '*', '*', '*'],
       ['*', ' ', ' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' ', '*'],
       ['*', ' ', ' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' ', '*'],
       ['*', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '*'],
       ['*', ' ', ' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' ', '*'],
       ['*', ' ', ' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ', ' ', '*'],
       ['*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*']],
      dtype='<U1')

In [14]:
q_maze = np.zeros((169, 4))

In [15]:
off_policy_maze = OffPolicyControl(env=maze,
                              epsilon=0.05,
                              Q=q_maze,
                              alpha=0.001,
                              gamma=1,
                             )

In [16]:
off_policy_maze.iter_episode(episodes=2000)

In [17]:
print(off_policy_gridworld.Q)

[[ 1.49004980e+00  1.19461537e+00  1.29352076e+00  5.73689531e+00]
 [-1.00000000e-03 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.00000000e-03]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.99900000e-03]
 [-1.00000000e-03 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [-1.00000000e-03 -1.99900000e-03 -1.00000000e-03  0.00000000e+00]
 [-1.00000000e-03 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [-1.00000000e-03 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [-1.00000000e-03 -1.00000000e-03 -1.99900000e-03  0.00000000e+00]
 [ 0.00000000e+00 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [-1.00000000e-03  0.00000000e+00  0.00000000e+00 -1.99900000e-03]
 [ 0.00000000e+00 -1.00000000e-03 -1.00000000e-03 -1.99900000e-03]
 [-1.00000000e-03 -1.00000000e-03  0.00000000e+00  0.00000000e+00]
 [-1.00000000e-03 -1.00000000e-03 -1.00000000e-03  0.00000000e+00]
 [-1.99900000e-03  0.00000000e+00  0.00000000e+00 -1.00000000e

# Taxi

In [18]:
env_taxi = gym.make("Taxi-v2")
env_taxi.render()

[2023-04-10 12:30:41,482] Making new env: Taxi-v2


+---------+
|R:[43m [0m| : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



  result = entry_point.load(False)


In [19]:
nuumero_de_estados = env_taxi.observation_space.n
numero_acciones = env_taxi.action_space.n

## Parametros

In [20]:
q = np.ones((nuumero_de_estados, numero_acciones))

In [21]:
off_policy = OffPolicyControl(env=env_taxi,
                              epsilon=0.05,
                              Q=q,
                              alpha=0.001,
                              gamma=1,
                              reward_modified = {-1:1,20:5,-10:-10}
                             )

In [22]:
off_policy.iter_episode(episodes=5000)
print(off_policy.Q)

[[1.         1.         1.         1.         1.         1.        ]
 [1.01456665 1.         1.00100301 1.         1.         1.        ]
 [1.01515304 1.00101109 1.         1.         1.         1.        ]
 ...
 [2.827      1.01608279 1.03250001 1.04020581 0.85957973 0.92804545]
 [2.727      1.01907814 1.02505686 1.03273982 0.8706442  0.95413718]
 [1.         1.0150149  1.         1.         1.         1.        ]]
