In [1]:
import gymnasium as gym
import assembly_game

# Create the "Min2Game" environment by its registered id.
# NOTE(review): presumably the id is registered as a side effect of
# `import assembly_game` above — confirm against the package.
game = gym.make("Min2Game")

# Start a fresh episode; as the last expression of the cell, the returned
# (observation, info) pair is displayed as the cell output.
game.reset()

(array([1, 2, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0]), {})

Let's first move the value from %rdi into the return value (%rax). As you can see, we immediately get a reward of 10 because in one training example the result is already in the correct spot; \
however, we also incur a penalty of 1 for every timestep.

In [2]:
from assembly_game.processor import PROCESSOR_ACTIONS

# Show every action next to the integer index that `game.step()` expects.
[*enumerate(PROCESSOR_ACTIONS)]

[(0, (<Instruction.MOV: 0>, <Operand.RDI: 0>, <Operand.RDI: 0>)),
 (1, (<Instruction.MOV: 0>, <Operand.RDI: 0>, <Operand.RSI: 1>)),
 (2, (<Instruction.MOV: 0>, <Operand.RDI: 0>, <Operand.RDX: 2>)),
 (3, (<Instruction.MOV: 0>, <Operand.RDI: 0>, <Operand.RCX: 3>)),
 (4, (<Instruction.MOV: 0>, <Operand.RDI: 0>, <Operand.RAX: 4>)),
 (5, (<Instruction.MOV: 0>, <Operand.RSI: 1>, <Operand.RDI: 0>)),
 (6, (<Instruction.MOV: 0>, <Operand.RSI: 1>, <Operand.RSI: 1>)),
 (7, (<Instruction.MOV: 0>, <Operand.RSI: 1>, <Operand.RDX: 2>)),
 (8, (<Instruction.MOV: 0>, <Operand.RSI: 1>, <Operand.RCX: 3>)),
 (9, (<Instruction.MOV: 0>, <Operand.RSI: 1>, <Operand.RAX: 4>)),
 (10, (<Instruction.MOV: 0>, <Operand.RDX: 2>, <Operand.RDI: 0>)),
 (11, (<Instruction.MOV: 0>, <Operand.RDX: 2>, <Operand.RSI: 1>)),
 (12, (<Instruction.MOV: 0>, <Operand.RDX: 2>, <Operand.RDX: 2>)),
 (13, (<Instruction.MOV: 0>, <Operand.RDX: 2>, <Operand.RCX: 3>)),
 (14, (<Instruction.MOV: 0>, <Operand.RDX: 2>, <Operand.RAX: 4>)),
 (15,

In [3]:
# Index 4 in PROCESSOR_ACTIONS is (MOV, RDI, RAX): copy %rdi into %rax.
# The call returns (observation, reward, terminated, truncated, info).
game.step(4) # MOV %rdi, %rax

(array([1, 2, 1, 0, 0, 0, 2, 1, 2, 0, 0, 0]),
 1,
 False,
 False,
 {'example_0': 'rdi=1 rsi=2 rax=1 rdx=0 rcx=0 cmp_res=0',
  'example_1': 'rdi=2 rsi=1 rax=2 rdx=0 rcx=0 cmp_res=0'})

Let's now compare that value with the value in %rsi: if %rax happens to be greater, then the value in %rsi must be the minimum.

In [4]:
# Compare %rsi with %rax; the outcome is stored per example in `cmp_res`
# (visible in the returned info dict) for a later conditional move to use.
game.step(34) # CMP %rsi, %rax

(array([ 1,  2,  1,  0,  0, -1,  2,  1,  2,  0,  0,  1]),
 0,
 False,
 False,
 {'example_0': 'rdi=1 rsi=2 rax=1 rdx=0 rcx=0 cmp_res=-1',
  'example_1': 'rdi=2 rsi=1 rax=2 rdx=0 rcx=0 cmp_res=1'})

After the conditional move, all testing examples are "solved": we get a reward of 20 and a penalty for the 3 instructions used.

In [5]:
# Conditional move: %rsi is copied into %rax only in the examples where the
# preceding CMP left cmp_res > 0 (see the info dict in the cell output).
game.step(59) # CMOVG %rsi, %rax

(array([ 1,  2,  1,  0,  0, -1,  2,  1,  1,  0,  0,  1]),
 11,
 False,
 False,
 {'example_0': 'rdi=1 rsi=2 rax=1 rdx=0 rcx=0 cmp_res=-1',
  'example_1': 'rdi=2 rsi=1 rax=1 rdx=0 rcx=0 cmp_res=1'})

Lastly, we need to run the RET instruction to observe `terminated=True`.

In [6]:
# RET ends the program: the third element of the returned tuple
# (`terminated`) becomes True.
game.step(75) # RET

(array([ 1,  2,  1,  0,  0, -1,  2,  1,  1,  0,  0,  1]),
 10,
 True,
 False,
 {'example_0': 'rdi=1 rsi=2 rax=1 rdx=0 rcx=0 cmp_res=-1',
  'example_1': 'rdi=2 rsi=1 rax=1 rdx=0 rcx=0 cmp_res=1'})

# Now let's see how to add a time limit to the environment

After 4 steps, `truncated` is set to True, indicating that the episode has ended due to the time limit.

Using a test Model

In [7]:
from stable_baselines3 import PPO

In [8]:
# Single source of truth for the episode length: the environment's time limit
# and the rollout loop bound both read this constant, so they cannot drift apart.
MAX_STEPS = 20
env = gym.make("Min2Game", max_episode_steps=MAX_STEPS)

In [9]:
# Build a PPO agent with an MLP policy on the time-limited environment and
# train it; `learn` returns the model, which the cell displays as its output.
model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=50_000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 17.4     |
|    ep_rew_mean     | 0.36     |
| time/              |          |
|    fps             | 658      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 17.9        |
|    ep_rew_mean          | 0.3         |
| time/                   |             |
|    fps                  | 544         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013471656 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.32       |
|    explained_variance   | -0.568      |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x7423e39f1100>

In [10]:
# Roll out one episode with the trained policy, printing for every step the
# chosen instruction, the per-example register dump from `info`, and the reward.
state, _ = env.reset()
cumreward = 0
for i in range(MAX_STEPS):
    # NOTE(review): `predict` is called with its defaults, which per the SB3
    # docs samples actions stochastically — pass deterministic=True if a
    # repeatable evaluation is wanted.
    action, _ = model.predict(state)
    state, reward, terminated, truncated, info = env.step(action)
    cumreward += reward
    # `predict` hands back a NumPy value; convert it to a plain int before
    # using it as an index into the action table.
    print(PROCESSOR_ACTIONS[int(action)], info, reward)
    if terminated or truncated:
        # Label the flags so the output shows why the episode stopped.
        print(f"terminated={terminated} truncated={truncated}")
        print(f"Episode finished after {i+1} timesteps")
        break
print(f"total reward {cumreward}")

(<Instruction.MOV: 0>, <Operand.RDI: 0>, <Operand.RCX: 3>) {'example_0': 'rdi=1 rsi=2 rax=0 rdx=0 rcx=1 cmp_res=0', 'example_1': 'rdi=2 rsi=1 rax=0 rdx=0 rcx=2 cmp_res=0'} 0
(<Instruction.MOV: 0>, <Operand.RDI: 0>, <Operand.RAX: 4>) {'example_0': 'rdi=1 rsi=2 rax=1 rdx=0 rcx=1 cmp_res=0', 'example_1': 'rdi=2 rsi=1 rax=2 rdx=0 rcx=2 cmp_res=0'} 1
(<Instruction.MOV: 0>, <Operand.RCX: 3>, <Operand.RDX: 2>) {'example_0': 'rdi=1 rsi=2 rax=1 rdx=1 rcx=1 cmp_res=0', 'example_1': 'rdi=2 rsi=1 rax=2 rdx=2 rcx=2 cmp_res=0'} 0
(<Instruction.CMOVG: 2>, <Operand.RDX: 2>, <Operand.RSI: 1>) {'example_0': 'rdi=1 rsi=2 rax=1 rdx=1 rcx=1 cmp_res=0', 'example_1': 'rdi=2 rsi=1 rax=2 rdx=2 rcx=2 cmp_res=0'} 0
(<Instruction.CMP: 1>, <Operand.RSI: 1>, <Operand.RDI: 0>) {'example_0': 'rdi=1 rsi=2 rax=1 rdx=1 rcx=1 cmp_res=-1', 'example_1': 'rdi=2 rsi=1 rax=2 rdx=2 rcx=2 cmp_res=1'} 0
(<Instruction.CMOVG: 2>, <Operand.RSI: 1>, <Operand.RAX: 4>) {'example_0': 'rdi=1 rsi=2 rax=1 rdx=1 rcx=1 cmp_res=-1', 'example