In [1]:
from qiskit_gym.envs import LinearFunctionGym, CliffordGym, PermutationGym
from qiskit_gym.rl import RLSynthesis, PPOConfig, AlphaZeroConfig, BasicPolicyConfig

In [2]:
from qiskit import QuantumCircuit
from qiskit.transpiler import CouplingMap
import numpy as np

# Gym-style RL Environments for Quantum Circuit Synthesis

In [3]:
# Build a 3-qubit line topology and derive a linear-function synthesis environment from it.
cmap_3_line = CouplingMap.from_line(3, bidirectional=True)
env = LinearFunctionGym.from_coupling_map(cmap_3_line)
# Show the gateset; later cells look gates up in this list by action index.
env.config["gateset"]

[('CX', (0, 1)),
 ('CX', (1, 0)),
 ('CX', (1, 2)),
 ('CX', (2, 1)),
 ('SWAP', (0, 1)),
 ('SWAP', (1, 0)),
 ('SWAP', (1, 2)),
 ('SWAP', (2, 1))]

In [4]:
# The displayed result of reset() is an (observation, info) pair.
env.difficulty = 1  # We can set env difficulty
env.reset()  # This resets the env and returns first observation

(array([[1, 1, 0],
        [0, 1, 0],
        [0, 0, 1]], dtype=int8),
 {})

In [5]:
# Example target: a single CX between qubits 0 and 2. The gateset shown above
# only contains gates on the pairs (0, 1) and (1, 2), so this CX is not directly available.
qc = QuantumCircuit(3)
qc.cx(0,2)
qc.draw()

In [6]:
# You can set a custom state like this (in this case from a circuit):
# get_state() turns the circuit into an env state and set_state() installs it.
env.set_state(env.get_state(qc))

In [7]:
# For the CX(0, 2) state set above, this prints a 3x3 binary matrix.
env.render()  # This displays the current state

[[1 0 0]
 [0 1 0]
 [1 0 1]]


In [8]:
# The action space is discrete; here it has 8 actions, matching the 8 gateset entries.
env.action_space  # This tells you the number of possible actions (a discrete space of 8 actions)

Discrete(8)

In [9]:
env.observation_space  # This tells you the type and size of observation space (an N by N binary matrix in this case)

MultiBinary((3, 3))

In [10]:
# You can advance the env by providing an action
# (index 2 is ('CX', (1, 2)) in the gateset listed above).
obs, reward, is_final, _, _ = env.step(2)

# This provides:
# - The observation of the state right after action
# - The reward for that step
# - If we are in a final state
# The last two returned values are not used in this notebook and are discarded.

obs, reward, is_final

(array([[1, 0, 0],
        [0, 1, 0],
        [1, 1, 1]], dtype=int8),
 -0.00390625,
 False)

In [11]:
# One way to do it: SWAP(0, 1), CX(1, 2), SWAP(0, 1).
# Start again from the CX(0, 2) target state and show it before stepping.
env.set_state(env.get_state(qc))
env.render()
for action in (4, 2, 4):
    # Look up which gate this action index stands for.
    gate = env.config['gateset'][action]
    print(f"[{action}] - {gate}")
    obs, reward, is_final, _, _ = env.step(action)
    print(f"[{action}] - Reward: {reward}, Is final: {is_final}")
    env.render()

[[1 0 0]
 [0 1 0]
 [1 0 1]]
[4] - ('SWAP', (0, 1))
[4] - Reward: -0.00390625, Is final: False
[[0 1 0]
 [1 0 0]
 [1 0 1]]
[2] - ('CX', (1, 2))
[2] - Reward: -0.00390625, Is final: False
[[0 1 0]
 [1 0 0]
 [0 0 1]]
[4] - ('SWAP', (0, 1))
[4] - Reward: 1.0, Is final: True
[[1 0 0]
 [0 1 0]
 [0 0 1]]


In [12]:
# Another way to do it: CX(0, 1), CX(1, 2), CX(0, 1), CX(1, 2) -- no SWAPs.
# Start again from the CX(0, 2) target state and show it before stepping.
env.set_state(env.get_state(qc))
env.render()
for action in (0, 2, 0, 2):
    # Look up which gate this action index stands for.
    gate = env.config['gateset'][action]
    print(f"[{action}] - {gate}")
    obs, reward, is_final, _, _ = env.step(action)
    print(f"[{action}] - Reward: {reward}, Is final: {is_final}")
    env.render()

[[1 0 0]
 [0 1 0]
 [1 0 1]]
[0] - ('CX', (0, 1))
[0] - Reward: -0.00390625, Is final: False
[[1 0 0]
 [1 1 0]
 [1 0 1]]
[2] - ('CX', (1, 2))
[2] - Reward: -0.00390625, Is final: False
[[1 0 0]
 [1 1 0]
 [0 1 1]]
[0] - ('CX', (0, 1))
[0] - Reward: -0.00390625, Is final: False
[[1 0 0]
 [0 1 0]
 [0 1 1]]
[2] - ('CX', (1, 2))
[2] - Reward: 1.0, Is final: True
[[1 0 0]
 [0 1 0]
 [0 0 1]]


# Training with TwisteRL

## Permutation

### Setup env

In [13]:
# 3x3 grid topology. With bidirectional=False the resulting gateset (displayed below)
# lists one SWAP per grid edge.
cmap_3x3 = CouplingMap.from_grid(3,3, bidirectional=False)
env = PermutationGym.from_coupling_map(cmap_3x3)
# Bundle the env with default-constructed PPO and policy configs.
rls = RLSynthesis(env, PPOConfig(), BasicPolicyConfig())

rls.env.config["gateset"]

[('SWAP', (0, 1)),
 ('SWAP', (0, 3)),
 ('SWAP', (1, 2)),
 ('SWAP', (1, 4)),
 ('SWAP', (2, 5)),
 ('SWAP', (3, 4)),
 ('SWAP', (3, 6)),
 ('SWAP', (4, 5)),
 ('SWAP', (4, 7)),
 ('SWAP', (5, 8)),
 ('SWAP', (6, 7)),
 ('SWAP', (7, 8))]

### Train the model

In [14]:
# Short demonstration run of 10 iterations. tb_path is where training progress is
# logged (presumably TensorBoard event files -- confirm against the qiskit-gym docs).
rls.learn(num_iterations=10, tb_path="runs/perm_square_3x3/")

[32m2025-09-01 19:19:53.852[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m152[0m - [1m(1/0) {'successes': {'ppo_deterministic': 0.10000000149011612, 'ppo_10': 0.550000011920929}, 'rewards': {'ppo_deterministic': -0.357421875, 'ppo_10': 0.3193359375}, 'difficulty': 1, 'success': 0.10000000149011612, 'reward': -0.357421875} | {'to_rust': 0.008677438, 'eval_ppo_deterministic': 0.004478501, 'eval_ppo_10': 0.009866546, 'collect': 0.014593092, 'data_to_torch': 0.357087315, 'train': 0.354479625, 'total': 0.749832335}[0m
[32m2025-09-01 19:19:54.096[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m152[0m - [1m(1/1) {'successes': {'ppo_deterministic': 0.0, 'ppo_10': 0.550000011920929}, 'rewards': {'ppo_deterministic': -0.5078125, 'ppo_10': 0.3193359375}, 'difficulty': 1, 'success': 0.0, 'reward': -0.5078125} | {'to_rust': 0.00730489, 'eval_ppo_deterministic': 0.004515282, 'eval_ppo_10': 0.010901196, 'collect': 0.016545905, 'data_to_

### Save (or load) config and model

In [15]:
# Uncomment to save the config (JSON) and model (.pt) of the `rls` trained above:
#rls.save("models/perm_square_3x3.json", "models/perm_square_3x3.pt")

# Load a saved config/model pair. NOTE: this rebinds `rls`, so the cells below use the
# loaded model rather than the one trained above; both files are expected to exist on disk.
rls = RLSynthesis.from_config_json("models/perm_square_3x3.json", "models/perm_square_3x3.pt")

### Try it

In [16]:
# Draw a random permutation of the 9 grid qubits as the synthesis target.
# A fixed seed keeps the target identical across "Restart & Run All" runs,
# matching the seeded random_clifford call later in this notebook.
some_perm = np.random.default_rng(42).permutation(9)

# Synthesize a circuit implementing the permutation and draw it on a single line (fold=-1).
qc_perm = rls.synth(some_perm, num_searches=1000, num_mcts_searches=0, deterministic=False)
qc_perm.draw(fold=-1)

In [17]:
# Target given as a circuit: a SWAP between qubits 0 and 8, a pair that does not
# appear in the grid gateset listed above.
qc_perm_input = QuantumCircuit(9)
qc_perm_input.swap(0,8)
qc_perm_input.draw(fold=-1)

In [18]:
# synth() is given a QuantumCircuit here, whereas the earlier cell passed a permutation array.
qc_perm_output = rls.synth(qc_perm_input, num_searches=1000, num_mcts_searches=0, deterministic=False)
qc_perm_output.draw(fold=-1)

## Linear Function

In [19]:
# 5-qubit line topology, with the environment restricted to CX gates only.
# (The variable is named after the actual qubit count, 5, to match the
# "lf_5_line" run and model paths used in this section.)
cmap_5_line = CouplingMap.from_line(5, bidirectional=True)
env = LinearFunctionGym.from_coupling_map(cmap_5_line, basis_gates=["CX"])

# Bundle the env with default-constructed PPO and policy configs, then run a short training demo.
rls = RLSynthesis(env, PPOConfig(), BasicPolicyConfig())
rls.learn(num_iterations=10, tb_path="runs/lf_5_line_ppo/")  # This will track progress in Tensorboard

[32m2025-09-01 19:19:55.469[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m152[0m - [1m(1/0) {'successes': {'ppo_deterministic': 0.11999999731779099, 'ppo_10': 0.7599999904632568}, 'rewards': {'ppo_deterministic': -0.32734376192092896, 'ppo_10': 0.6351562738418579}, 'difficulty': 1, 'success': 0.11999999731779099, 'reward': -0.32734376192092896} | {'to_rust': 0.004224525, 'eval_ppo_deterministic': 0.003354417, 'eval_ppo_10': 0.010295717, 'collect': 0.013939636, 'data_to_torch': 0.011973222, 'train': 0.025935893, 'total': 0.070098516}[0m
[32m2025-09-01 19:19:55.474[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m160[0m - [1m(1/0) Improved, saved checkpoint![0m
[32m2025-09-01 19:19:55.557[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m152[0m - [1m(1/1) {'successes': {'ppo_deterministic': 0.20999999344348907, 'ppo_10': 0.800000011920929}, 'rewards': {'ppo_deterministic': -0.19199219346046448, 

In [20]:
# Uncomment to save the config (JSON) and model (.pt) of the `rls` trained above:
#rls.save("models/lf_5_line.json", "models/lf_5_line.pt")

# Load a saved config/model pair. NOTE: this rebinds `rls`, so the cells below use the
# loaded model rather than the one trained above; both files are expected to exist on disk.
rls = RLSynthesis.from_config_json("models/lf_5_line.json", "models/lf_5_line.pt")

In [21]:
# Target circuit: a single CX from qubit 0 to qubit 4 on 5 qubits.
qc_lf_input = QuantumCircuit(5)
qc_lf_input.cx(0,4)
qc_lf_input.draw(fold=-1)

In [22]:
# Re-synthesize the CX(0, 4) circuit with the loaded model and draw it on one line (fold=-1).
qc_lf_output = rls.synth(qc_lf_input, num_searches=1000, num_mcts_searches=0, deterministic=False)
qc_lf_output.draw(fold=-1)

In [23]:
# Sanity check: the input and synthesized circuits should be equal as LinearFunction objects.
from qiskit.circuit.library.generalized_gates import LinearFunction
LinearFunction(qc_lf_input) == LinearFunction(qc_lf_output)

np.True_

## Clifford

In [24]:
# Clifford synthesis on 3 qubits with a hand-written gateset:
# two-qubit gates on the pairs (0, 1) and (1, 2), while H and S may only be placed on qubit 0.
clifford_gateset = [
    ("CX", [0,1]),
    ("CX", [1,0]),
    ("CX", [1,2]),
    ("CX", [2,1]),
    ("SWAP", [0,1]),
    ("SWAP", [1,2]),
    ("H", [0]),
    ("S", [0]),
]
env = CliffordGym(num_qubits=3, gateset=clifford_gateset)

# Bundle the env with default-constructed PPO and policy configs, then run a short training demo.
rls = RLSynthesis(env, PPOConfig(), BasicPolicyConfig())
rls.learn(num_iterations=10, tb_path="runs/clifford_3q_custom/")

[32m2025-09-01 19:19:56.971[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m152[0m - [1m(1/0) {'successes': {'ppo_deterministic': 0.12999999523162842, 'ppo_10': 0.699999988079071}, 'rewards': {'ppo_deterministic': -0.31230467557907104, 'ppo_10': 0.544921875}, 'difficulty': 1, 'success': 0.12999999523162842, 'reward': -0.31230467557907104} | {'to_rust': 0.004441711, 'eval_ppo_deterministic': 0.00365014, 'eval_ppo_10': 0.009906731, 'collect': 0.014079772, 'data_to_torch': 0.012533491, 'train': 0.025506229, 'total': 0.070566164}[0m
[32m2025-09-01 19:19:57.057[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m152[0m - [1m(1/1) {'successes': {'ppo_deterministic': 0.14000000059604645, 'ppo_10': 0.7300000190734863}, 'rewards': {'ppo_deterministic': -0.2972656190395355, 'ppo_10': 0.590039074420929}, 'difficulty': 1, 'success': 0.14000000059604645, 'reward': -0.2972656190395355} | {'to_rust': 0.004478244, 'eval_ppo_deterministic': 0.0

In [25]:
# Uncomment to save the config (JSON) and model (.pt) of the `rls` trained above:
#rls.save("models/clifford_3q_custom.json", "models/clifford_3q_custom.pt")

# Load a saved config/model pair. NOTE: this rebinds `rls`, so the cells below use the
# loaded model rather than the one trained above; both files are expected to exist on disk.
rls = RLSynthesis.from_config_json("models/clifford_3q_custom.json", "models/clifford_3q_custom.pt")

In [26]:
from qiskit.quantum_info import random_clifford, Clifford

In [27]:
# Target: H on qubit 2, whereas the custom gateset defined above only lists H on qubit 0.
qc_clifford_in = QuantumCircuit(3)
qc_clifford_in.h(2)

# Synthesize with the loaded model and draw the result on one line (fold=-1).
qc_clifford_out = rls.synth(qc_clifford_in, num_searches=1000, num_mcts_searches=0, deterministic=False)
qc_clifford_out.draw(fold=-1)

In [28]:
# Seeded random 3-qubit Clifford as the target; synth() is given the Clifford object directly.
some_clifford = random_clifford(3, seed=42)
qc_rand_clifford_out = rls.synth(some_clifford, num_searches=1000, num_mcts_searches=0, deterministic=False)
qc_rand_clifford_out.draw(fold=-1)

In [29]:
# Equivalent up to phase: compare the tableaux of the target and of the synthesized
# circuit with the last column dropped ([:, :-1]).
np.array_equal(some_clifford.tableau[:,:-1], Clifford(qc_rand_clifford_out).tableau[:,:-1])

True