## MDP Tests

### Imports

In [1]:
from mdp_factory import MDPFactory
from enums import MDPTransitionType, MDPRewardType

In [2]:
import numpy as np
import warnings

In [3]:
warnings.filterwarnings(action="ignore")

### Transition S Deterministic, Reward S

In [4]:
factory_sd_s = MDPFactory(MDPTransitionType.S_DETERMINISTIC, MDPRewardType.S, 10, 4, 5)

In [5]:
print("Terminate :", factory_sd_s.terminate_s_flags)
print("Transitions :", factory_sd_s.transitions)
print("Rewards :", factory_sd_s.rewards)

Terminate : [False False False  True False False  True False  True  True]
Transitions : [3 0 5 6 6 0 5 2 3 3]
Rewards : [-2 -1 -3  1 -3 -2  1 -2  1  1]


In [6]:
factory_sd_s.train_policy()
factory_sd_s.play()

. => 0 None False False {}
0 => 3 -2 True False {}


### Transition S Probabilistic, Reward S

In [7]:
factory_sp_s = MDPFactory(MDPTransitionType.S_PROBABILISTIC, MDPRewardType.S, 10, 5, 10)

In [8]:
print("Terminate :", factory_sp_s.terminate_s_flags)
print("Rewards :", factory_sp_s.rewards)
print("Transitions :\n", factory_sp_s.transitions)

Terminate : [False False False False False False False False False  True]
Rewards : [-3  0  0 -3 -5 -5 -4 -2  0  1]
Transitions :
 [[0.13793103 0.         0.17241379 0.06896552 0.06896552 0.10344828
  0.         0.24137931 0.         0.20689655]
 [0.19148936 0.06382979 0.06382979 0.0212766  0.17021277 0.06382979
  0.08510638 0.10638298 0.19148936 0.04255319]
 [0.04761905 0.02380952 0.19047619 0.21428571 0.07142857 0.19047619
  0.04761905 0.04761905 0.04761905 0.11904762]
 [0.02439024 0.09756098 0.14634146 0.09756098 0.2195122  0.04878049
  0.19512195 0.         0.14634146 0.02439024]
 [0.05454545 0.05454545 0.10909091 0.16363636 0.09090909 0.
  0.16363636 0.14545455 0.16363636 0.05454545]
 [0.28125    0.         0.         0.0625     0.09375    0.15625
  0.03125    0.1875     0.125      0.0625    ]
 [0.10416667 0.08333333 0.16666667 0.14583333 0.0625     0.0625
  0.14583333 0.0625     0.14583333 0.02083333]
 [0.14285714 0.12244898 0.02040816 0.06122449 0.08163265 0.08163265
  0.1632653

In [9]:
factory_sp_s.play()

. => 0 None False False {}
0 => 7 -3 False False {}
0 => 5 -2 False False {}
0 => 8 -5 False False {}
0 => 6 0 False False {}
0 => 0 -4 False False {}
0 => 9 -3 True False {}


### Transition SA Deterministic, Reward SA

In [10]:
factory_sad_sa = MDPFactory(MDPTransitionType.SA_DETERMINISTIC, MDPRewardType.SA, 10, 5, 10, p=0.01, eps=1e-6)
print("Terminate :", factory_sad_sa.terminate_s_flags)
print("\nRewards :\n", factory_sad_sa.rewards)
print("\nTransitions :\n", factory_sad_sa.transitions)

Terminate : [False False False False False False False False False  True]

Rewards :
 [[-2 -4  0 -1 -5]
 [-5 -2 -4 -2 -6]
 [-4 -6 -7 -3 -6]
 [-5 -1 -4 -6  1]
 [-7  0 -4 -3 -7]
 [-7 -5 -7  0 -7]
 [-8 -7 -4 -8  0]
 [ 0 -3  1 -3 -2]
 [ 0 -6 -1  0 -6]
 [ 0 -4 -5 -8 -6]]

Transitions :
 [[0 8 4 1 2]
 [2 1 2 1 8]
 [8 3 6 3 5]
 [4 4 6 7 9]
 [1 1 5 2 7]
 [1 3 3 6 7]
 [5 5 2 7 0]
 [4 7 9 4 2]
 [8 5 2 2 1]
 [4 3 0 5 2]]


In [11]:
factory_sad_sa.play()

. => 0 None False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {}
0 => 0 -2 False False {

In [12]:
factory_sad_sa.train_policy()
factory_sad_sa.get_policy()

array([1, 4, 0, 4, 1, 3, 4, 2, 0, 0])

In [13]:
factory_sad_sa.play()

. => 0 None False False {}
1 => 8 -4 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 False False {}
0 => 8 0 Fal

### Transition SA Probabilistic, Reward SAS

In [14]:
factory_sap_sas = MDPFactory(MDPTransitionType.SA_PROBABILISTIC, MDPRewardType.SAS, 10, 5, 10, p=0.01, eps=1e-6)
print("Terminate :", factory_sap_sas.terminate_s_flags)
print("\nRewards :\n", factory_sap_sas.rewards)
print("\nTransitions :\n", factory_sap_sas.transitions)

Terminate : [False False False False False False False False False  True]

Rewards :
 [[[-4 -7 -8 -4 -6 -1 -8 -7 -3  1]
  [-3 -2 -3 -1  0 -3 -1 -3 -8  1]
  [-1 -3  0 -2 -6 -2 -8 -3 -8  1]
  [-3 -6 -5 -6 -6  0 -1 -7  0  1]
  [-6  0 -5 -2 -6 -1 -7  0 -6  1]]

 [[-5 -5 -1 -3 -8  0 -4  0 -1  1]
  [-5 -1 -1 -3 -5 -4 -2 -4 -8  1]
  [-7  0 -1 -5 -7 -5 -6 -7 -5  1]
  [-8 -4 -2  0 -3 -1 -6 -6 -5  1]
  [ 0 -3 -5 -3 -6 -7 -8 -3 -2  1]]

 [[-6 -1 -5 -5 -2 -8 -5  0 -1  1]
  [-5 -1 -1 -3 -5 -7 -4 -7 -5  1]
  [-8 -8 -2 -8  0 -3 -7 -7 -4  1]
  [-6 -1 -1 -1 -8 -1 -4 -8 -6  1]
  [-8 -5 -4 -2 -1 -6 -1 -7 -5  1]]

 [[ 0 -6 -2 -8  0  0 -8 -3 -7  1]
  [ 0 -3 -4 -3 -6 -3 -4 -4 -5  1]
  [-7  0 -6 -4 -5 -4 -8 -2 -1  1]
  [-6 -7 -3 -6 -6 -6 -4 -5 -8  1]
  [-7 -1 -3 -1 -3 -7 -3 -1 -3  1]]

 [[-8 -5 -3 -1 -1 -4 -4 -5 -7  1]
  [-1 -2 -7  0 -1 -6 -7 -1 -4  1]
  [-2 -7 -6 -6 -6 -7  0 -3 -6  1]
  [-8  0 -7 -1 -4 -1 -2 -6 -5  1]
  [ 0 -3 -6 -3  0 -1 -5  0 -4  1]]

 [[-5 -7 -4 -8 -1 -3 -6 -6 -7  1]
  [-1 -6 -2 -7  0 -6

In [15]:
factory_sap_sas.play()

. => 0 None False False {}
0 => 7 -4 False False {}
0 => 4 0 False False {}
0 => 8 -1 False False {}
0 => 6 -8 False False {}
0 => 0 -1 False False {}
0 => 9 -4 True False {}


In [16]:
factory_sap_sas.train_policy()
factory_sap_sas.get_policy()

array([2, 2, 1, 4, 4, 2, 0, 0, 2, 3])

In [17]:
factory_sap_sas.play()

. => 9 None False False {}
2 => 8 -1 False False {}
2 => 3 -6 False False {}
4 => 8 -1 False False {}
2 => 6 -6 False False {}
0 => 0 -1 False False {}
2 => 9 -1 True False {}


### Transition SAS, Reward SAS

In [18]:
factory_sas_sas = MDPFactory(MDPTransitionType.SAS, MDPRewardType.SAS, 10, 5, 10, p=0.01, eps=1e-6)
print("Terminate :", factory_sas_sas.terminate_s_flags)
print("\nRewards :\n", factory_sas_sas.rewards)
print("\nTransitions :\n", factory_sas_sas.transitions)

Terminate : [False False False False False False False False False  True]

Rewards :
 [[[-1 -7 -7 -1 -2  0 -4 -6 -3  1]
  [-7 -6 -5  0 -4 -1 -3 -4 -1  1]
  [-8 -1 -2 -8 -8 -2 -2 -2 -3  1]
  [-8 -6 -2 -8 -4 -8 -1  0 -6  1]
  [-7 -7 -5  0  0 -7 -1 -1 -5  1]]

 [[ 0 -7 -5  0 -4 -8 -5 -4 -5  1]
  [ 0 -5 -3 -4 -8 -3 -3 -5 -4  1]
  [-6 -4 -3 -6 -5 -1  0 -1 -3  1]
  [-7 -8 -3 -7 -6  0 -6 -5  0  1]
  [-8 -7 -5 -2 -3 -1  0  0 -7  1]]

 [[-4 -2 -4 -1  0  0 -8 -5 -4  1]
  [-7 -8 -4 -7 -6 -3 -4 -6  0  1]
  [ 0 -6 -5 -2 -2 -6 -3 -5 -2  1]
  [-5  0 -4 -1 -5 -1 -3 -5 -8  1]
  [ 0 -7 -2 -2 -4 -1 -5 -4 -5  1]]

 [[-1 -2 -2 -7 -2 -3 -1 -1  0  1]
  [-3 -5 -6 -4 -6 -2 -3 -2 -5  1]
  [-8  0  0 -5 -3 -3 -3 -2 -8  1]
  [-2 -1 -2 -1 -5 -6 -7 -8  0  1]
  [-5 -7 -3 -1 -6 -6 -6 -5 -6  1]]

 [[-8 -7 -3 -8 -7 -4 -2  0 -3  1]
  [-7 -8 -2 -7 -2 -2 -5 -3 -4  1]
  [-4 -5 -2 -1 -7 -8 -7 -6 -5  1]
  [-5 -4 -1 -4 -1 -6 -4  0 -7  1]
  [-6 -5 -3 -5 -1 -1 -8 -3 -6  1]]

 [[-6 -3 -5 -1 -3 -4  0 -7  0  1]
  [-2 -5 -8 -1 -4 -3

In [19]:
factory_sas_sas.play()

. => 0 None False False {}
0 => 8 -1 False False {}
0 => 2 -1 False False {}
0 => 8 -4 False False {}
0 => 5 -1 False False {}
0 => 0 -4 False False {}
0 => 9 -1 True False {}


In [20]:
factory_sas_sas.train_policy()
factory_sas_sas.get_policy()

array([0, 2, 4, 3, 3, 3, 3, 0, 3, 2])

In [21]:
factory_sas_sas.play()

. => 9 None False False {}
0 => 8 -1 False False {}
3 => 5 0 False False {}
3 => 9 0 True False {}


### Transition SAS, Reward SASR

In [22]:
factory_sas_sasr = MDPFactory(MDPTransitionType.SAS, MDPRewardType.SASR, 10, 5, 10, p=0.01, eps=1e-6)
print("Terminate :", factory_sas_sasr.terminate_s_flags)
print("\nRewards :\n", factory_sas_sasr.rewards)
print("\nTransitions :\n", factory_sas_sasr.transitions)

Terminate : [False False  True False False False False False False False]

Rewards :
 [[[[0.24242424 0.03030303 0.         ... 0.03030303 0.21212121
    0.        ]
   [0.10204082 0.06122449 0.12244898 ... 0.04081633 0.08163265
    0.        ]
   [0.         0.         0.         ... 0.         0.
    1.        ]
   ...
   [0.08888889 0.02222222 0.08888889 ... 0.15555556 0.15555556
    0.        ]
   [0.16326531 0.02040816 0.08163265 ... 0.14285714 0.02040816
    0.        ]
   [0.11764706 0.10294118 0.13235294 ... 0.05882353 0.13235294
    0.        ]]

  [[0.0212766  0.17021277 0.14893617 ... 0.17021277 0.08510638
    0.        ]
   [0.17647059 0.1372549  0.17647059 ... 0.15686275 0.03921569
    0.        ]
   [0.         0.         0.         ... 0.         0.
    1.        ]
   ...
   [0.19148936 0.08510638 0.19148936 ... 0.10638298 0.14893617
    0.        ]
   [0.         0.11428571 0.05714286 ... 0.11428571 0.17142857
    0.        ]
   [0.03571429 0.125      0.08928571 ... 0.10

In [23]:
factory_sas_sasr.play()

. => 0 None False False {}
0 => 8 -5 False False {}
0 => 9 -2 False False {}
0 => 0 0 False False {}
0 => 8 -2 False False {}
0 => 0 -4 False False {}
0 => 3 0 False False {}
0 => 7 -1 False False {}
0 => 4 -7 False False {}
0 => 3 -8 False False {}
0 => 7 -4 False False {}
0 => 8 -6 False False {}
0 => 9 0 False False {}
0 => 8 -7 False False {}
0 => 5 -7 False False {}
0 => 1 -2 False False {}
0 => 7 -1 False False {}
0 => 4 -6 False False {}
0 => 3 -7 False False {}
0 => 0 -4 False False {}
0 => 3 -3 False False {}
0 => 6 0 False False {}
0 => 7 -7 False False {}
0 => 8 -1 False False {}
0 => 2 1 True False {}


In [24]:
factory_sas_sasr.train_policy()
factory_sas_sasr.get_policy()

array([1, 2, 0, 2, 3, 0, 2, 4, 2, 3])

In [25]:
factory_sas_sasr.play()

. => 2 None False False {}
1 => 8 -4 False False {}
2 => 6 -3 False False {}
2 => 0 0 False False {}
1 => 8 -1 False False {}
2 => 0 -4 False False {}
1 => 2 1 True False {}
