## MDP Tests

### Imports

In [1]:
from random_mdp_factory import MDPFactory
from enums import MDPTransitionType, MDPRewardType

In [2]:
import numpy as np
import warnings

In [3]:
warnings.filterwarnings(action="ignore")

### Transition S Deterministic, Reward S

In [4]:
# Build the MDP for the "S deterministic transition / S reward" case
# (configuration chosen via the two enum members).
# NOTE(review): the positional 10, 4, 5 are not explained in this notebook.
# The saved flags output below has 10 entries, so the first is presumably the
# number of states; the other two need confirming against MDPFactory's signature.
# NOTE(review): no random seed is set in the visible cells, so re-running may
# not reproduce the saved outputs.
factory_sd_s = MDPFactory(MDPTransitionType.S_DETERMINISTIC, MDPRewardType.S, 10, 4, 5)

In [5]:
# Print the generated tables one after another: terminal flags, transitions, rewards.
for caption, field in (
    ("Terminate :", "terminate_s_flags"),
    ("Transitions :", "transitions"),
    ("Rewards :", "rewards"),
):
    print(caption, getattr(factory_sd_s, field))

Terminate : [False False  True False False False False  True False False]
Transitions : [6 7 1 6 2 0 4 8 5 6]
Rewards : [-2 -3  1 -3 -2  0 -1  1 -2 -3]


In [6]:
# Train a policy on this MDP, then roll out one episode.
# NOTE(review): the saved output lines look like
# `action => state reward terminated truncated info` — confirm against play().
factory_sd_s.train_policy()
factory_sd_s.play()

. => 0 None False False {}
0 => 6 -2 False False {}
0 => 4 -1 False False {}
0 => 2 -2 True False {}


### Transition S Probabilistic, Reward S

In [7]:
# Build the MDP for the "S probabilistic transition / S reward" case.
# NOTE(review): positional 10, 5, 10 are unexplained here; the saved flags
# output has 10 entries, so the first is presumably the state count — TODO confirm.
factory_sp_s = MDPFactory(MDPTransitionType.S_PROBABILISTIC, MDPRewardType.S, 10, 5, 10)

In [8]:
# Print terminal flags, rewards, then the transition table (on its own lines).
for caption, field in (
    ("Terminate :", "terminate_s_flags"),
    ("Rewards :", "rewards"),
    ("Transitions :\n", "transitions"),
):
    print(caption, getattr(factory_sp_s, field))

Terminate : [False False False False False  True False False False False]
Rewards : [-6 -4 -8 -5 -5  1 -3 -8 -8  0]
Transitions :
 [[0.13846154 0.12307692 0.07692308 0.07692308 0.12307692 0.07692308
  0.12307692 0.07692308 0.09230769 0.09230769]
 [0.06818182 0.06818182 0.04545455 0.04545455 0.18181818 0.18181818
  0.06818182 0.09090909 0.04545455 0.20454545]
 [0.14285714 0.10204082 0.08163265 0.02040816 0.18367347 0.16326531
  0.10204082 0.06122449 0.         0.14285714]
 [0.07692308 0.07692308 0.         0.19230769 0.         0.03846154
  0.19230769 0.11538462 0.         0.30769231]
 [0.11764706 0.23529412 0.05882353 0.11764706 0.         0.11764706
  0.05882353 0.05882353 0.11764706 0.11764706]
 [0.18367347 0.08163265 0.10204082 0.10204082 0.04081633 0.02040816
  0.06122449 0.18367347 0.04081633 0.18367347]
 [0.08928571 0.03571429 0.10714286 0.16071429 0.14285714 0.14285714
  0.03571429 0.16071429 0.05357143 0.07142857]
 [0.09090909 0.09090909 0.18181818 0.18181818 0.         0.04545

In [9]:
# Roll out one episode on the probabilistic-S MDP.
# NOTE(review): train_policy() is never called on factory_sp_s in the visible
# cells, unlike the S Deterministic section — confirm this is intentional.
factory_sp_s.play()

. => 0 None False False {}
0 => 7 -6 False False {}
0 => 3 -8 False False {}
0 => 9 -5 False False {}
0 => 6 0 False False {}
0 => 1 -3 False False {}
0 => 9 -4 False False {}
0 => 7 0 False False {}
0 => 8 -8 False False {}
0 => 0 -8 False False {}
0 => 4 -6 False False {}
0 => 2 -5 False False {}
0 => 9 -8 False False {}
0 => 6 0 False False {}
0 => 7 -3 False False {}
0 => 3 -8 False False {}
0 => 3 -5 False False {}
0 => 6 -5 False False {}
0 => 0 -3 False False {}
0 => 8 -6 False False {}
0 => 5 -8 True False {}


### Transition SA Deterministic, Reward SA

In [10]:
# Build the "SA deterministic transition / SA reward" MDP and print its tables.
# NOTE(review): the meaning of the positional 10, 5, 10 and of p / eps is not
# visible here; the saved reward/transition tables are 10 rows by 5 columns.
factory_sad_sa = MDPFactory(
    MDPTransitionType.SA_DETERMINISTIC,
    MDPRewardType.SA,
    10,
    5,
    10,
    p=0.01,
    eps=1e-6,
)
for caption, field in (
    ("Terminate :", "terminate_s_flags"),
    ("\nRewards :\n", "rewards"),
    ("\nTransitions :\n", "transitions"),
):
    print(caption, getattr(factory_sad_sa, field))

Terminate : [False False False False False False False False False  True]

Rewards :
 [[-7 -1 -1 -7  0]
 [-3 -6 -7 -2  1]
 [-8 -8 -1 -3  1]
 [-3 -3 -5 -2 -8]
 [-5 -3 -3 -3 -1]
 [-5  0 -8 -7  1]
 [-1 -7 -6 -8  1]
 [-5 -4 -2 -3 -4]
 [-4 -4 -4 -3  0]
 [-5 -1 -7 -2  0]]

Transitions :
 [[3 3 1 6 7]
 [2 1 5 0 9]
 [5 8 1 8 9]
 [2 7 0 7 2]
 [0 2 2 2 6]
 [1 6 5 4 9]
 [4 2 2 2 9]
 [7 8 2 4 1]
 [8 5 4 8 7]
 [3 5 4 5 1]]


In [11]:
# Roll out an episode before train_policy() has been called on this factory.
# NOTE(review): in the saved output the same three lines (ending in states
# 2, 5, 1) repeat until the output is cut off — confirm play() enforces a
# step limit so this cell cannot run unbounded.
factory_sad_sa.play()

. => 0 None False False {}
0 => 3 -7 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {}
0 => 1 -5 False False {}
0 => 2 -3 False False {}
0 => 5 -8 False False {

In [12]:
# Train a policy, then show it as the cell result.
# The saved array has 10 entries — presumably one action index per state; TODO confirm.
factory_sad_sa.train_policy()
factory_sad_sa.get_policy()

array([2, 4, 4, 0, 4, 4, 4, 2, 4, 4])

In [13]:
# Roll out again now that a policy has been trained, for comparison with the
# pre-training episode earlier in this section.
factory_sad_sa.play()

. => 1 None False False {}
4 => 7 0 False False {}
2 => 2 -2 False False {}
4 => 9 1 True False {}


### Transition SA Probabilistic, Reward SAS

In [14]:
# Build the "SA probabilistic transition / SAS reward" MDP and print its tables.
# NOTE(review): the reward table is printed in full and is truncated in the
# saved output; the positional arguments and p / eps are not explained here.
factory_sap_sas = MDPFactory(
    MDPTransitionType.SA_PROBABILISTIC,
    MDPRewardType.SAS,
    10,
    5,
    10,
    p=0.01,
    eps=1e-6,
)
for caption, field in (
    ("Terminate :", "terminate_s_flags"),
    ("\nRewards :\n", "rewards"),
    ("\nTransitions :\n", "transitions"),
):
    print(caption, getattr(factory_sap_sas, field))

Terminate : [False False False False False False False False False  True]

Rewards :
 [[[-2 -1 -3 -8 -4 -5 -7 -7 -6  1]
  [-6  0  0 -2 -6 -2 -7 -8 -5  1]
  [-6 -5  0 -6 -2 -6 -1 -5 -5  1]
  [-7 -6 -8  0 -8 -2 -4 -5 -7  1]
  [-1 -8 -7 -1 -1 -2 -6 -7 -5  1]]

 [[-6 -8 -6 -5 -1 -8 -1 -7  0  1]
  [-5 -1 -3 -6  0 -4 -1 -3 -6  1]
  [-7 -4 -6 -1 -3  0 -3  0 -4  1]
  [-1 -1 -2 -6 -3 -2 -7 -5 -5  1]
  [ 0 -5 -8 -3 -1 -1 -5 -1 -1  1]]

 [[-7 -7 -4 -6 -8 -7  0 -4 -2  1]
  [-8 -3 -5 -1 -1 -5 -1 -4 -2  1]
  [-2 -4 -8 -7 -4 -1 -4 -8 -3  1]
  [ 0 -6 -7 -8 -6  0 -2 -7 -4  1]
  [-8 -8 -8 -8 -6 -3 -1 -5 -4  1]]

 [[-6 -4 -1 -7 -4 -4 -4 -4 -2  1]
  [-1 -6 -1 -4 -6 -2 -1 -8 -8  1]
  [-2 -4 -4 -6 -1 -2 -3 -6 -1  1]
  [-8 -8 -6 -2 -5  0 -2  0 -7  1]
  [-2 -2 -5  0 -8  0 -7 -4  0  1]]

 [[-8 -8 -4 -3 -6 -4 -1 -3 -8  1]
  [-7 -6 -2 -3 -4 -1 -5 -4  0  1]
  [ 0 -4 -6 -7 -5 -5 -7 -5 -2  1]
  [-2 -1 -2 -2 -3 -7 -6  0 -7  1]
  [-4 -4 -3 -4 -3 -8 -5 -7 -7  1]]

 [[-6  0  0 -4 -3 -2 -6  0 -7  1]
  [-6 -5  0 -3 -4 -3

In [15]:
# Roll out an episode before train_policy() has been called on this factory.
factory_sap_sas.play()

. => 0 None False False {}
0 => 7 -2 False False {}
0 => 3 -2 False False {}
0 => 8 -7 False False {}
0 => 6 -5 False False {}
0 => 0 -3 False False {}
0 => 9 -2 True False {}


In [16]:
# Train a policy, then show it as the cell result
# (saved array has 10 entries — presumably one action index per state; TODO confirm).
factory_sap_sas.train_policy()
factory_sap_sas.get_policy()

array([4, 1, 0, 4, 3, 2, 4, 0, 3, 3])

In [17]:
# Roll out again with the trained policy, for comparison with the
# pre-training episode earlier in this section.
factory_sap_sas.play()

. => 9 None False False {}
3 => 6 -7 False False {}
4 => 4 0 False False {}
3 => 8 -3 False False {}
3 => 7 -2 False False {}
0 => 0 -2 False False {}
4 => 9 -1 True False {}


### Transition SAS, Reward SAS

In [18]:
# Build the "SAS transition / SAS reward" MDP and print its tables.
# NOTE(review): same undocumented positional arguments and p / eps as the
# previous sections — confirm their meaning against MDPFactory.
factory_sas_sas = MDPFactory(
    MDPTransitionType.SAS,
    MDPRewardType.SAS,
    10,
    5,
    10,
    p=0.01,
    eps=1e-6,
)
for caption, field in (
    ("Terminate :", "terminate_s_flags"),
    ("\nRewards :\n", "rewards"),
    ("\nTransitions :\n", "transitions"),
):
    print(caption, getattr(factory_sas_sas, field))

Terminate : [False False False False False False False False False  True]

Rewards :
 [[[-5 -2 -3 -2 -6 -3 -8 -8 -7  1]
  [ 0 -5 -7 -1 -6 -7 -5 -1 -2  1]
  [-4 -7 -5  0 -3 -5  0 -4  0  1]
  [-1 -1 -5 -7 -5 -3  0 -2 -3  1]
  [-8 -2  0 -2 -7 -6 -1 -3 -4  1]]

 [[-7  0 -6 -5 -4  0  0 -5 -3  1]
  [-4 -3 -3  0 -2 -5 -1 -2 -7  1]
  [-8 -8 -5 -4 -2 -5 -1 -4 -2  1]
  [-8 -1 -1 -2 -1 -4 -4 -1 -2  1]
  [-3 -2 -1 -5 -2 -6 -2 -2 -6  1]]

 [[ 0 -3 -5 -8 -1 -2 -2 -2 -6  1]
  [-7 -8 -1 -3 -2 -5 -2 -3 -3  1]
  [-2 -5 -8 -4 -7 -7 -6 -5 -3  1]
  [-3 -3  0  0 -2 -8 -1 -4 -5  1]
  [-8 -1 -2 -6 -2  0 -1 -7 -2  1]]

 [[-2 -6 -1  0 -6 -1 -5 -8 -1  1]
  [-4  0 -8 -7  0 -6 -7 -3 -7  1]
  [-2 -4 -8 -3 -7 -8 -4 -4  0  1]
  [-3 -8 -3 -3 -5 -4 -1 -2 -1  1]
  [-4 -7 -6 -8 -3 -5 -4 -7 -6  1]]

 [[-7 -8 -3 -4 -7 -3 -5 -5 -1  1]
  [-6 -6 -7 -5 -8 -6 -2 -7 -5  1]
  [-5 -6 -3 -6 -8  0 -2 -8 -4  1]
  [ 0 -7 -6 -3 -3 -7 -5 -8 -5  1]
  [-7 -7 -4  0 -5 -6 -2 -8 -6  1]]

 [[-6 -8 -1 -1 -7 -6 -6 -6 -6  1]
  [-6 -3 -8 -8 -3 -4

In [19]:
# Roll out an episode before train_policy() has been called on this factory.
factory_sas_sas.play()

. => 0 None False False {}
0 => 7 -5 False False {}
0 => 3 -4 False False {}
0 => 8 0 False False {}
0 => 5 -3 False False {}
0 => 0 -6 False False {}
0 => 8 -5 False False {}
0 => 6 -3 False False {}
0 => 9 -3 True False {}


In [20]:
# Train a policy, then show it as the cell result
# (saved array has 10 entries — presumably one action index per state; TODO confirm).
factory_sas_sas.train_policy()
factory_sas_sas.get_policy()

array([1, 0, 3, 0, 3, 3, 0, 4, 2, 2])

In [21]:
# Roll out again with the trained policy, for comparison with the
# pre-training episode earlier in this section.
factory_sas_sas.play()

. => 9 None False False {}
2 => 7 -4 False False {}
4 => 3 -2 False False {}
0 => 8 0 False False {}
2 => 7 0 False False {}
4 => 0 -2 False False {}
1 => 9 0 True False {}


### Transition SAS, Reward SASR

In [24]:
# Build the "SAS transition / SASR reward" MDP and print its tables.
# NOTE(review): the saved execution count jumps from 21 to 24 at this cell;
# re-run with Restart & Run All to confirm the notebook is reproducible.
# NOTE(review): unlike the other sections, the saved "Rewards" output holds
# floats between 0 and 1 — presumably a distribution over reward values; TODO confirm.
factory_sas_sasr = MDPFactory(
    MDPTransitionType.SAS,
    MDPRewardType.SASR,
    10,
    5,
    10,
    p=0.01,
    eps=1e-6,
)
for caption, field in (
    ("Terminate :", "terminate_s_flags"),
    ("\nRewards :\n", "rewards"),
    ("\nTransitions :\n", "transitions"),
):
    print(caption, getattr(factory_sas_sasr, field))

Terminate : [False False False False False False False False False  True]

Rewards :
 [[[[0.         0.06521739 0.02173913 ... 0.15217391 0.10869565
    0.        ]
   [0.19047619 0.21428571 0.16666667 ... 0.02380952 0.07142857
    0.        ]
   [0.24324324 0.05405405 0.13513514 ... 0.08108108 0.13513514
    0.        ]
   ...
   [0.14583333 0.14583333 0.14583333 ... 0.1875     0.14583333
    0.        ]
   [0.09375    0.0625     0.0625     ... 0.1875     0.03125
    0.        ]
   [0.         0.         0.         ... 0.         0.
    1.        ]]

  [[0.0952381  0.0952381  0.07142857 ... 0.19047619 0.07142857
    0.        ]
   [0.18367347 0.08163265 0.16326531 ... 0.14285714 0.04081633
    0.        ]
   [0.12765957 0.0212766  0.17021277 ... 0.04255319 0.17021277
    0.        ]
   ...
   [0.         0.04       0.12       ... 0.12       0.24
    0.        ]
   [0.11538462 0.17307692 0.07692308 ... 0.09615385 0.15384615
    0.        ]
   [0.         0.         0.         ... 0.   

In [25]:
# Roll out an episode before train_policy() has been called on this factory.
factory_sas_sasr.play()

. => 0 None False False {}
0 => 7 -5 False False {}
0 => 7 -3 False False {}
0 => 0 0 False False {}
0 => 7 -1 False False {}
0 => 1 -5 False False {}
0 => 5 0 False False {}
0 => 4 -1 False False {}
0 => 3 -6 False False {}
0 => 5 -7 False False {}
0 => 6 -3 False False {}
0 => 8 -5 False False {}
0 => 9 1 True False {}


In [26]:
# Train a policy, then show it as the cell result
# (saved array has 10 entries — presumably one action index per state; TODO confirm).
factory_sas_sasr.train_policy()
factory_sas_sasr.get_policy()

array([3, 1, 3, 4, 0, 4, 2, 4, 0, 0])

In [27]:
# Roll out again with the trained policy, for comparison with the
# pre-training episode earlier in this section.
factory_sas_sasr.play()

. => 9 None False False {}
0 => 7 -5 False False {}
4 => 8 -2 False False {}
0 => 2 0 False False {}
3 => 7 -5 False False {}
4 => 0 -4 False False {}
3 => 3 0 False False {}
4 => 7 -2 False False {}
4 => 3 -6 False False {}
4 => 7 -8 False False {}
4 => 7 -3 False False {}
4 => 7 -5 False False {}
4 => 9 1 True False {}
