In [1]:
import gymnasium as gym
import assembly_game

# Build the Min2Game environment (presumably registered with Gymnasium by the `import assembly_game` above — confirm).
game = gym.make("Min2Game")

# reset() returns (observation, info); the observation packs rdi, rsi, rax, cmp_res for each example in turn.
game.reset()

(array([1, 2, 0, 0, 2, 1, 0, 0]), {})

Let's first move the value from %rdi to the return-value register (%rax). As you can see, we immediately get a reward of 10 because in one training example the result is already in the correct spot; \
however, we also incur a penalty of 1 for every timestep.

In [2]:
from assembly_game.processor import PROCESSOR_ACTIONS

# Pair every (instruction, operand, operand) tuple with its index; that index is the integer action passed to game.step().
list(enumerate(PROCESSOR_ACTIONS))

[(0, (<Instruction.MOV: 0>, <Operand.RDI: 0>, <Operand.RDI: 0>)),
 (1, (<Instruction.MOV: 0>, <Operand.RDI: 0>, <Operand.RSI: 1>)),
 (2, (<Instruction.MOV: 0>, <Operand.RDI: 0>, <Operand.RAX: 2>)),
 (3, (<Instruction.MOV: 0>, <Operand.RSI: 1>, <Operand.RDI: 0>)),
 (4, (<Instruction.MOV: 0>, <Operand.RSI: 1>, <Operand.RSI: 1>)),
 (5, (<Instruction.MOV: 0>, <Operand.RSI: 1>, <Operand.RAX: 2>)),
 (6, (<Instruction.MOV: 0>, <Operand.RAX: 2>, <Operand.RDI: 0>)),
 (7, (<Instruction.MOV: 0>, <Operand.RAX: 2>, <Operand.RSI: 1>)),
 (8, (<Instruction.MOV: 0>, <Operand.RAX: 2>, <Operand.RAX: 2>)),
 (9, (<Instruction.CMP: 1>, <Operand.RDI: 0>, <Operand.RDI: 0>)),
 (10, (<Instruction.CMP: 1>, <Operand.RDI: 0>, <Operand.RSI: 1>)),
 (11, (<Instruction.CMP: 1>, <Operand.RDI: 0>, <Operand.RAX: 2>)),
 (12, (<Instruction.CMP: 1>, <Operand.RSI: 1>, <Operand.RDI: 0>)),
 (13, (<Instruction.CMP: 1>, <Operand.RSI: 1>, <Operand.RSI: 1>)),
 (14, (<Instruction.CMP: 1>, <Operand.RSI: 1>, <Operand.RAX: 2>)),
 (15,

In [3]:
# step() returns (observation, reward, terminated, truncated, info).
game.step(2) # MOV %rdi, %rax

(array([1, 2, 1, 0, 2, 1, 2, 0]),
 1,
 False,
 False,
 {'example_0': 'rdi=1 rsi=2 rax=1 cmp_res=0',
  'example_1': 'rdi=2 rsi=1 rax=2 cmp_res=0'})

Let's now compare the value in %rax with the value in %rsi. If %rax happens to be greater, then the value in %rsi must be the minimum.

In [4]:
# The comparison outcome is recorded per example in cmp_res (see the info dict in the output).
game.step(14) # CMP %rsi, %rax

(array([ 1,  2,  1, -1,  2,  1,  2,  1]),
 0,
 False,
 False,
 {'example_0': 'rdi=1 rsi=2 rax=1 cmp_res=-1',
  'example_1': 'rdi=2 rsi=1 rax=2 cmp_res=1'})

After the conditional move, all testing examples are "solved": we get a reward of 20 and a penalty for the 3 instructions used.

In [5]:
# Conditional move: in the output, only example_1 (cmp_res=1) has its rax overwritten with rsi.
game.step(23) # CMOVG %rsi, %rax

(array([ 1,  2,  1, -1,  2,  1,  1,  1]),
 11,
 False,
 False,
 {'example_0': 'rdi=1 rsi=2 rax=1 cmp_res=-1',
  'example_1': 'rdi=2 rsi=1 rax=1 cmp_res=1'})

Lastly, we need to run the RET instruction to observe terminated=True.

In [6]:
# RET ends the episode: the third element of the returned tuple (terminated) is True.
game.step(27) # RET

(array([ 1,  2,  1, -1,  2,  1,  1,  1]),
 10,
 True,
 False,
 {'example_0': 'rdi=1 rsi=2 rax=1 cmp_res=-1',
  'example_1': 'rdi=2 rsi=1 rax=1 cmp_res=1'})

# Now let's see how to add a time limit to the environment

In [7]:
# max_episode_steps caps the episode length: once 4 steps have been taken, step() reports truncated=True.
game = gym.make("Min2Game", max_episode_steps=4)

In [8]:
game.reset()
# Action 0 is MOV %rdi, %rdi in the action table; the output shows the registers unchanged after it.
DUMMY = 0
game.step(DUMMY)

(array([1, 2, 0, 0, 2, 1, 0, 0]),
 0,
 False,
 False,
 {'example_0': 'rdi=1 rsi=2 rax=0 cmp_res=0',
  'example_1': 'rdi=2 rsi=1 rax=0 cmp_res=0'})

After 4 steps, `truncated` is set to True, indicating that the episode has ended due to the time limit.

In [9]:
# Steps 2 and 3 of the 4-step budget; their results are not displayed.
for _ in range(2):
    game.step(DUMMY)

# Step 4 is left as the cell's last expression so its result (truncated=True) is shown.
game.step(DUMMY)

(array([1, 2, 0, 0, 2, 1, 0, 0]),
 0,
 False,
 True,
 {'example_0': 'rdi=1 rsi=2 rax=0 cmp_res=0',
  'example_1': 'rdi=2 rsi=1 rax=0 cmp_res=0'})

Using a test Model

In [10]:
from stable_baselines3 import A2C


In [11]:
# Separate environment instance used for training, with a 20-step episode limit.
env = gym.make("Min2Game", max_episode_steps=20)

In [12]:
# Upper bound on the number of iterations of the evaluation loop further down.
MAX_STEPS = 100

In [13]:
# Train an A2C agent with the "MlpPolicy" policy on `env` for 10,000 timesteps; verbose=1 prints the training log.
model = A2C("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Fetch the environment as held by the model (the log shows it wrapped in Monitor and DummyVecEnv) and reset it.
vec_env = model.get_env()
obs = vec_env.reset()


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 14.3      |
|    ep_rew_mean        | 0.657     |
| time/                 |           |
|    fps                | 145       |
|    iterations         | 100       |
|    time_elapsed       | 3         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -3.09     |
|    explained_variance | -2.39e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | 0.0398    |
|    value_loss         | 0.000811  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 15       |
|    ep_rew_mean        | 0.788    |
| time/                 |          |
|    fps                | 172      |
|    iterations         | 200      |
|    time_elapsed       | 5        |
|    total_timesteps    | 1000     |
| train/             

In [14]:
# Roll out the trained policy for one episode on `env`, the 20-step environment the
# model was trained on. (`game` is the 4-step environment from the time-limit demo;
# using it here would truncate the episode after 4 steps.)
state, _ = env.reset()
cumreward = 0
for i in range(MAX_STEPS):
  action, _ = model.predict(state)
  state, reward, terminated, truncated, info = env.step(action)
  cumreward += reward
  print(PROCESSOR_ACTIONS[action], info, reward)
  # The episode is over when it terminates OR when the time limit truncates it.
  # Gymnasium requires reset() before stepping again in either case, so stop here.
  if terminated or truncated:
    print(f"terminated={terminated} truncated={truncated}")
    print(f"Episode finished after {i+1} timesteps")
    break
print(f"total reward {cumreward}")

(<Instruction.MOV: 0>, <Operand.RSI: 1>, <Operand.RAX: 2>) {'example_0': 'rdi=1 rsi=2 rax=2 cmp_res=0', 'example_1': 'rdi=2 rsi=1 rax=1 cmp_res=0'} 1
(<Instruction.CMP: 1>, <Operand.RAX: 2>, <Operand.RSI: 1>) {'example_0': 'rdi=1 rsi=2 rax=2 cmp_res=0', 'example_1': 'rdi=2 rsi=1 rax=1 cmp_res=0'} 0
(<Instruction.CMOVG: 2>, <Operand.RDI: 0>, <Operand.RAX: 2>) {'example_0': 'rdi=1 rsi=2 rax=2 cmp_res=0', 'example_1': 'rdi=2 rsi=1 rax=1 cmp_res=0'} 0
(<Instruction.CMP: 1>, <Operand.RDI: 0>, <Operand.RAX: 2>) {'example_0': 'rdi=1 rsi=2 rax=2 cmp_res=1', 'example_1': 'rdi=2 rsi=1 rax=1 cmp_res=-1'} 0
(<Instruction.CMOVG: 2>, <Operand.RDI: 0>, <Operand.RAX: 2>) {'example_0': 'rdi=1 rsi=2 rax=1 cmp_res=1', 'example_1': 'rdi=2 rsi=1 rax=1 cmp_res=-1'} 11
(<Instruction.CMOVG: 2>, <Operand.RDI: 0>, <Operand.RAX: 2>) {'example_0': 'rdi=1 rsi=2 rax=1 cmp_res=1', 'example_1': 'rdi=2 rsi=1 rax=1 cmp_res=-1'} 10
(<Instruction.CMP: 1>, <Operand.RDI: 0>, <Operand.RAX: 2>) {'example_0': 'rdi=1 rsi=2 rax