In [1]:
import numpy as np
import utils
np.set_printoptions(linewidth=np.inf)

In [2]:
def get_pv(idx, n_positions=500):
    """Decode a 1-based flat state index into ``(p, v)`` grid coordinates.

    The index is first shifted to 0-based and then split so that ``p`` is the
    fastest-varying coordinate, i.e. ``idx == 1 + p + n_positions * v``.
    Only arithmetic operators are used, so ``idx`` may be a scalar or a NumPy
    array (the decoding is then applied element-wise).

    Args:
        idx: 1-based flat state index (scalar or array).
        n_positions: Number of distinct ``p`` values per ``v`` level.
            Defaults to 500, the value this function previously hard-coded.

    Returns:
        Tuple ``(p, v)`` with ``0 <= p < n_positions``.
    """
    zero_based = idx - 1
    v = zero_based // n_positions
    p = zero_based % n_positions
    return p, v


In [23]:
# Load the 'medium' transitions. The code below treats the four columns as
# (state, action, reward, next_state).
data = utils.read_data('medium')
states = data[:, 0]
actions = data[:, 1]
rewards = data[:, 2]
next_states = data[:, 3]

# Index the distinct reward values. `return_inverse` gives, for every row,
# the position of its reward inside the sorted `unique_rewards`, which is
# exactly the reward -> idx mapping below, without a per-row dict lookup.
unique_rewards, reward_idx = np.unique(rewards, return_inverse=True)
reward_to_idx = {reward: idx for idx, reward in enumerate(unique_rewards)}
idx_to_reward = dict(enumerate(unique_rewards))

# get_pv is plain element-wise arithmetic, so decode whole columns at once
# instead of looping over every row in Python.
states_pv = np.column_stack(get_pv(states))            # Shape (N, 2): [p, v]
next_states_pv = np.column_stack(get_pv(next_states))  # Shape (N, 2): [p, v]
actions = actions.reshape(-1, 1)       # Shape (N, 1)
rewards = rewards.reshape(-1, 1)       # Shape (N, 1)

# Columns of data_: [p, v, action, next_p, next_v, reward]
data_ = np.hstack([states_pv, actions, next_states_pv, rewards])

# Rows whose reward is exactly 100_000.
max_reward = data_[data_[:, -1] == 100_000]
# Per-column maximum over all transitions.
col_max = np.max(data_, axis=0)
# Transitions that land on next_p >= 466, ordered by ascending reward.
achieved = data_[data_[:, 3] >= 466]
achieved = achieved[achieved[:, -1].argsort()]

In [29]:
# For each action 1..7, report how many transitions used it and the
# quantiles of the p (column 0) and v (column 1) coordinates it was taken from.
q = [.25, .5, .75, .95, .99]
print(f"Observed state space quantiles, Q = {q}\n")
action_column = data_[:, 2]
for action in range(1, 8):
    print(f"Action = {action}")
    action_rows = data_[action_column == action]
    print(f"Number of observations: {len(action_rows)}")
    p_quants, v_quants = (np.quantile(action_rows[:, col], q) for col in (0, 1))
    print(f"P Quantiles: {list(p_quants)}")
    print(f"V Quantiles: {list(v_quants)}\n")


Observed state space quantiles, Q = [0.25, 0.5, 0.75, 0.95, 0.99]

Action = 1
Number of observations: 11614
P Quantiles: [164.0, 196.0, 221.0, 329.35000000000036, 369.0]
V Quantiles: [38.0, 45.0, 49.0, 57.0, 62.0]

Action = 2
Number of observations: 9080
P Quantiles: [178.0, 203.0, 226.0, 314.0, 399.2099999999991]
V Quantiles: [39.0, 46.0, 51.0, 58.0, 63.0]

Action = 3
Number of observations: 21760
P Quantiles: [159.0, 198.0, 237.0, 343.0499999999993, 415.0]
V Quantiles: [36.0, 46.0, 52.0, 72.0, 86.0]

Action = 4
Number of observations: 13003
P Quantiles: [174.0, 205.0, 238.0, 362.0, 429.0]
V Quantiles: [43.0, 50.0, 57.0, 77.0, 88.0]

Action = 5
Number of observations: 22742
P Quantiles: [167.0, 208.0, 263.0, 397.0, 451.0]
V Quantiles: [48.0, 56.0, 67.0, 86.0, 93.0]

Action = 6
Number of observations: 9394
P Quantiles: [183.0, 206.0, 232.0, 354.0, 430.0]
V Quantiles: [48.0, 53.0, 60.0, 80.0, 89.0]

Action = 7
Number of observations: 12407
P Quantiles: [173.0, 207.0, 257.0, 377.0, 443.0

In [4]:
# Per-column maximum of data_ (columns: p, v, action, next_p, next_v, reward).
col_max

array([4.65e+02, 9.80e+01, 7.00e+00, 4.72e+02, 9.80e+01, 1.00e+05])

In [5]:
# Show every transition in data_ whose reward (last column) equals 100_000.
print(max_reward)

[[4.64e+02 6.30e+01 4.00e+00 4.68e+02 6.20e+01 1.00e+05]
 [4.63e+02 6.00e+01 4.00e+00 4.66e+02 5.90e+01 1.00e+05]
 [4.62e+02 6.30e+01 4.00e+00 4.66e+02 6.20e+01 1.00e+05]
 [4.64e+02 5.50e+01 4.00e+00 4.66e+02 5.50e+01 1.00e+05]
 [4.65e+02 5.90e+01 4.00e+00 4.68e+02 5.80e+01 1.00e+05]
 [4.65e+02 6.30e+01 4.00e+00 4.70e+02 6.30e+01 1.00e+05]
 [4.60e+02 6.60e+01 4.00e+00 4.66e+02 6.60e+01 1.00e+05]]


In [6]:
# Transitions whose reward (last column) exceeds 99,000, ordered by the
# current position p (column 0).
near_max_mask = data_[:, -1] > 99_000
approx_max_reward = data_[near_max_mask]
approx_max_reward = approx_max_reward[np.argsort(approx_max_reward[:, 0])]
approx_max_reward

array([[4.5600e+02, 8.6000e+01, 7.0000e+00, 4.6900e+02, 8.6000e+01, 9.9775e+04],
       [4.5600e+02, 8.6000e+01, 7.0000e+00, 4.6900e+02, 8.6000e+01, 9.9775e+04],
       [4.5600e+02, 8.6000e+01, 7.0000e+00, 4.6900e+02, 8.6000e+01, 9.9775e+04],
       ...,
       [4.6500e+02, 5.5000e+01, 5.0000e+00, 4.6700e+02, 5.5000e+01, 9.9975e+04],
       [4.6500e+02, 6.4000e+01, 5.0000e+00, 4.7000e+02, 6.4000e+01, 9.9975e+04],
       [4.6500e+02, 5.9000e+01, 4.0000e+00, 4.6800e+02, 5.8000e+01, 1.0000e+05]])

In [7]:
# Re-order the same rows by column 3 (the next-state p) to see which
# next positions come with the large rewards.
next_p_order = np.argsort(approx_max_reward[:, 3])
approx_max_reward = approx_max_reward[next_p_order]
approx_max_reward  # ok, so 466 is achieved

array([[4.6100e+02, 6.2000e+01, 5.0000e+00, 4.6500e+02, 6.2000e+01, 9.9975e+04],
       [4.6400e+02, 5.4000e+01, 5.0000e+00, 4.6500e+02, 5.4000e+01, 9.9975e+04],
       [4.6200e+02, 5.9000e+01, 3.0000e+00, 4.6500e+02, 5.8000e+01, 9.9975e+04],
       ...,
       [4.6400e+02, 7.3000e+01, 5.0000e+00, 4.7200e+02, 7.3000e+01, 9.9975e+04],
       [4.6400e+02, 7.3000e+01, 5.0000e+00, 4.7200e+02, 7.3000e+01, 9.9975e+04],
       [4.6400e+02, 7.3000e+01, 5.0000e+00, 4.7200e+02, 7.3000e+01, 9.9975e+04]])

In [8]:
# `achieved` holds the transitions with next-state p (column 3) >= 466,
# sorted by reward (last column); show how many there are and what they look like.
print(achieved.shape)
print(achieved)

(186, 6)
[[4.5600e+02 8.6000e+01 7.0000e+00 4.6900e+02 8.6000e+01 9.9775e+04]
 [4.5600e+02 8.6000e+01 7.0000e+00 4.6900e+02 8.6000e+01 9.9775e+04]
 [4.5600e+02 8.6000e+01 7.0000e+00 4.6900e+02 8.6000e+01 9.9775e+04]
 ...
 [4.6300e+02 6.0000e+01 4.0000e+00 4.6600e+02 5.9000e+01 1.0000e+05]
 [4.6200e+02 6.3000e+01 4.0000e+00 4.6600e+02 6.2000e+01 1.0000e+05]
 [4.6500e+02 5.9000e+01 4.0000e+00 4.6800e+02 5.8000e+01 1.0000e+05]]


In [9]:
# Keep only the transitions that start at position p (column 0) >= 460.
starts_at_or_past_460 = data_[:, 0] >= 460
sa = data_[starts_at_or_past_460]

In [10]:
# Order the rows of `sa` by ascending value of the last column (reward).
reward_order = np.argsort(sa[:, -1])
sorted_array = sa[reward_order]

In [11]:
# Display the rows of `sa` ordered by their last column.
sorted_array

array([[ 4.6200e+02,  5.8000e+01,  6.0000e+00,  4.6500e+02,  5.9000e+01, -1.0000e+02],
       [ 4.6200e+02,  5.0000e+01,  6.0000e+00,  4.6200e+02,  5.1000e+01, -1.0000e+02],
       [ 4.6100e+02,  6.0000e+01,  6.0000e+00,  4.6500e+02,  6.1000e+01, -1.0000e+02],
       [ 4.6000e+02,  4.8000e+01,  2.0000e+00,  4.6000e+02,  4.7000e+01, -1.0000e+02],
       [ 4.6000e+02,  6.2000e+01,  3.0000e+00,  4.6400e+02,  6.1000e+01, -2.5000e+01],
       [ 4.6000e+02,  5.5000e+01,  5.0000e+00,  4.6200e+02,  5.5000e+01, -2.5000e+01],
       [ 4.6200e+02,  5.5000e+01,  5.0000e+00,  4.6400e+02,  5.4000e+01, -2.5000e+01],
       [ 4.6000e+02,  6.0000e+01,  5.0000e+00,  4.6400e+02,  6.0000e+01, -2.5000e+01],
       [ 4.6000e+02,  5.2000e+01,  3.0000e+00,  4.6000e+02,  5.1000e+01, -2.5000e+01],
       [ 4.6200e+02,  5.1000e+01,  5.0000e+00,  4.6200e+02,  5.0000e+01, -2.5000e+01],
       [ 4.6200e+02,  5.0000e+01,  3.0000e+00,  4.6200e+02,  5.0000e+01, -2.5000e+01],
       [ 4.6200e+02,  5.0000e+01,  3.0000e+

In [12]:
# (number of rows, number of columns) of the `sa` subset.
sa.shape

(150, 6)

In [13]:
# Sorted distinct values found in the last column of `sa`.
np.unique(sa[:, -1])

array([-1.0000e+02, -2.5000e+01,  0.0000e+00,  9.9900e+04,  9.9975e+04,  1.0000e+05])

In [None]:
# Reload the 'medium' transitions. The code below treats the four columns as
# (state, action, reward, next_state).
data = utils.read_data('medium')
states = data[:, 0]
actions = data[:, 1]
rewards = data[:, 2]
next_states = data[:, 3]

# get_pv is plain element-wise arithmetic, so decode whole columns at once
# instead of looping over every row in Python.
states_pv = np.column_stack(get_pv(states))            # Shape (N, 2): [p, v]
next_states_pv = np.column_stack(get_pv(next_states))  # Shape (N, 2): [p, v]
actions = actions.reshape(-1, 1)       # Shape (N, 1)
rewards = rewards.reshape(-1, 1)       # Shape (N, 1)

# Columns of data_: [p, v, action, next_p, next_v, reward]
data_ = np.hstack([states_pv, actions, next_states_pv, rewards])

# Next, add self-absorbing states. From the EDA we know that the big reward
# is obtained at p = 466, so for every (p, v) with p in 466..499 and v in
# 0..99, and for every action 1..7, append a transition that stays in the
# same state and yields zero reward.
p_values = np.arange(466, 500)
v_values = np.arange(100)
# Separate name so the observed `actions` column is not temporarily shadowed.
action_values = np.arange(1, 8)

p_grid, v_grid, action_grid = np.meshgrid(p_values, v_values, action_values, indexing='ij')

p_flat = p_grid.ravel()
v_flat = v_grid.ravel()
action_flat = action_grid.ravel()

# Create the new rows by stacking the columns [p, v, action, p, v, 0]
new_rows = np.column_stack((p_flat, v_flat, action_flat, p_flat, v_flat, np.zeros_like(p_flat)))

data_ = np.vstack([data_, new_rows])

# Re-derive the per-column views from the augmented table.
states_pv = data_[:, [0, 1]]
actions = data_[:, 2]
next_states_pv = data_[:, [3, 4]]
rewards = data_[:, -1]

# Index the distinct reward values. `return_inverse` gives, for every row,
# the position of its reward inside the sorted `unique_rewards`, which is
# exactly the reward -> idx mapping below, without a per-row dict lookup.
unique_rewards, reward_idx = np.unique(rewards, return_inverse=True)
reward_to_idx = {reward: idx for idx, reward in enumerate(unique_rewards)}
idx_to_reward = dict(enumerate(unique_rewards))

In [15]:
# Display the transition table data_ (columns: p, v, action, next_p, next_v, reward).
data_

array([[ 200.,   49.,    3.,  200.,   49.,  -25.],
       [ 200.,   49.,    2.,  200.,   48., -100.],
       [ 200.,   48.,    3.,  199.,   48.,  -25.],
       ...,
       [ 499.,   99.,    5.,  499.,   99.,    0.],
       [ 499.,   99.,    6.,  499.,   99.,    0.],
       [ 499.,   99.,    7.,  499.,   99.,    0.]])

In [16]:
# Shape of the raw array returned by utils.read_data('medium').
data.shape

(100000, 4)

In [17]:
# Shape of the decoded transition table data_.
data_.shape

(123800, 6)