In [None]:
import numpy as np

from qiskit import(
  QuantumCircuit,
  execute,
  Aer)
import qutip as qt
import matplotlib.pyplot as plt

In [None]:
# Simulate a small reference circuit and print the statevector it prepares

# Backend used for the simulation (the statevector simulator, not the qasm one)
simulator = Aer.get_backend('statevector_simulator')

# Two-qubit circuit; the two classical bits are never measured into in this cell
circuit = QuantumCircuit(2, 2)

# Gate sequence: H on qubit 0, CNOT (control 0, target 1), T on qubit 0, H on qubit 1
circuit.h(0)
circuit.cx(0,1)
circuit.t(0)
circuit.h(1)

# Execute the circuit on the statevector simulator
job = execute(circuit, simulator, shots=10)

# Grab the final statevector from the job result
result = job.result()
out_state = result.get_statevector()
print(out_state)

# Last expression of the cell, so the notebook displays the circuit diagram
circuit.draw()

In [None]:
class level2():
    """Gym-style environment: assemble a 2-qubit circuit one gate at a time.

    The observation is an array of ``max_circuit_length`` slots. A slot holds
    ``-1`` while empty and otherwise the index of the gate placed at that
    position. After every gate the circuit is simulated and the reward is
    ``1 - trace_distance(current_state, target_state)``.

    Attributes (set in ``__init__``):
        max_circuit_length: number of gate slots in the observation.
        action_space: number of distinct actions (gates), see ``_GATES``.
        observation_space: number of distinct encoded observations,
            ``(1 + action_space) ** max_circuit_length``.
        target_state: amplitudes of the state the agent should prepare.
    """

    # Action index -> (QuantumCircuit method name, qubit arguments).
    _GATES = (
        ('x', (0,)),
        ('h', (0,)),
        ('z', (0,)),
        ('t', (0,)),
        ('cx', (0, 1)),
        ('x', (1,)),
        ('h', (1,)),
        ('z', (1,)),
        ('t', (1,)),
        ('cx', (1, 0)),
    )

    def __init__(self):
        self.simulator = Aer.get_backend('statevector_simulator')

        self.max_circuit_length = 6
        self.action_space = 10
        # Each slot is either empty (-1) or one of `action_space` gates.
        self.observation_space = (1+self.action_space)**self.max_circuit_length

        # NOTE(review): these amplitudes look like h(1), t(1), h(1) applied to |00>
        # (actions 6, 8, 6), not the h(0), cx(0,1), t(0), h(1) circuit used
        # elsewhere in the notebook -- confirm which target is intended.
        # np.complex128 replaces the np.complex_ alias, which NumPy 2.0 removed.
        self.target_state = np.array([0.85355339+0.35355339j, 0. +0.j, 0.14644661-0.35355339j,0.+0.j], dtype=np.complex128)

        self.target_state_vector = qt.Qobj(self.target_state)

        # Start with an empty circuit so step() is usable right after construction.
        self.reset()

    def step(self, action):
        """Append the gate selected by ``action`` and score the resulting state.

        Args:
            action: integer in ``[0, action_space)``; see ``_GATES`` for the mapping.

        Returns:
            ``(state, reward, done, info)``. ``state`` is a copy of the slot
            array. ``reward`` is ``1 - trace distance`` to the target, or ``-1``
            when the circuit is already full. ``done`` is True when the circuit
            is already full or the simulated state is within ``1e-5``
            (Euclidean norm) of the target amplitudes.

        Raises:
            ValueError: if ``action`` is not a valid gate index.
        """
        # Validate before touching any counters so a bad call leaves the env intact.
        if action not in range(self.action_space):
            raise ValueError(
                "action must be an integer in [0, %d), got %r" % (self.action_space, action))

        done = False
        reward = 0

        self.current_circuit_length += 1

        if( self.current_circuit_length > self.max_circuit_length ):
            done = True
            reward = -1
            return self.state.copy(), reward, done, {'reason':'circuit too long'}

        # Record the gate in the first free slot
        self.state[self.current_circuit_length-1] = action

        # Still within max_circuit_length: append the gate to the circuit
        gate_name, qubits = self._GATES[int(action)]
        getattr(self.circuit, gate_name)(*qubits)

        # Simulate the circuit built so far
        job = execute(self.circuit, self.simulator, shots=1)
        result = job.result()
        # np.asarray: a no-op for ndarrays; assumed to also convert a statevector
        # object exposing the array protocol -- TODO confirm for the installed qiskit.
        current_quantum_state = np.asarray(result.get_statevector(0))

        self.tracedistance = qt.metrics.tracedist(qt.Qobj(current_quantum_state), (self.target_state_vector))

        reward = 1-self.tracedistance

        # Problem solved? (compares raw amplitudes, so a global phase counts as a mismatch)
        # TODO: Change reward such that shorter circuit gives higher reward
        if np.linalg.norm(current_quantum_state - self.target_state ) < 1e-5:
            done = True

        return self.state.copy(), reward, done, {}

    def reset(self):
        """Clear the circuit and return a copy of the all-empty observation."""
        self.current_circuit_length = 0
        self.state = -np.ones(self.max_circuit_length) # every slot empty: [-1 -1 -1 -1 -1 -1]

        # Create a fresh Quantum Circuit acting on the q register
        self.circuit = QuantumCircuit(2, 2)
        return self.state.copy()

    def encode(self, state):
        """Map a slot array to a unique integer in ``[0, observation_space)``.

        Each slot takes ``action_space + 1`` values (-1 for empty, then the
        gate indices), so the slots are read as digits of a number in that
        base with slot 0 as the least significant digit.
        """
        base = self.action_space + 1
        return int(sum((int(s) + 1) * base**i for i, s in enumerate(state)))

    def render(self):
        """Print a text drawing of the circuit built so far."""
        print(self.circuit.draw(output='text'))
        return

    def close(self):
        """Nothing to release; present for Gym-style API compatibility."""
        return

In [None]:
# Smoke test: replay a fixed gate sequence through a fresh environment
env = level2()
initial_state = env.reset()

# In level2.step, actions 1, 4, 3 add h(0), cx(0,1), t(0)
for warmup_action in (1, 4, 3):
    env.step(warmup_action)

# Action 6 adds h(1); kept as the cell's last expression so its result is displayed
env.step(6)



### Q-learning

In [None]:
env = level2()
state = env.reset() # start from the empty circuit (reset takes no random draw)

print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

# np.random.seed(0)

# Hyperparameters
alpha = 0.8            # learning rate
gamma = 1              # discount factor
epsilon = 1            # initial exploration rate
epsilon_decay = 0.9995 # multiplicative decay applied after every episode
min_epsilon = 0.01     # floor for the exploration rate

# For plotting metrics
all_epochs = []   # number of steps taken in each episode
all_rewards = []  # summed reward of each episode

# Q(s, a) = "Expected total reward to get, if in state 's' I take action 'a'"
q_table = np.zeros([env.observation_space, env.action_space])


for i in range(10000):
    state = env.reset()

    epochs, reward = 0, 0
    total_reward = 0
    done = False

    while not done:

        # Encode before stepping: the environment may update the state array in place
        state_as_number = env.encode(state)

        if np.random.uniform() < epsilon:
            action = np.random.randint(env.action_space) # Explore action space
        else:
            action = np.argmax(q_table[state_as_number]) # Exploit learned values

        next_state, reward, done, info = env.step(action)

        total_reward += reward

        next_state_as_number = env.encode(next_state)

        old_value = q_table[state_as_number, action]
        # A terminal transition has no future return. Bootstrapping here is harmful:
        # on "circuit too long" the env returns the unchanged state, so Q[s, a]
        # would be updated from max Q[s, .] itself.
        next_max = 0 if done else np.max(q_table[next_state_as_number])

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state_as_number, action] = new_value

        state = next_state
        epochs += 1

    all_epochs.append(epochs)
    all_rewards.append(total_reward)

    # Decay exploration, but never below the floor
    epsilon = max(epsilon * epsilon_decay, min_epsilon)

    if i % 100 == 0:
        print(f"Episode: {i}")
        print("Current exploration rate: %.2f"%epsilon)

print("Training finished.\n")

In [None]:
# Learning curve: mean and standard deviation of the episode reward,
# computed over 100 consecutive chunks of the training run.
# array_split also copes with a number of episodes that is not a multiple of 100.
rewards = np.array_split(np.array(all_rewards), 100)

reward_mean = [ np.mean(r) for r in rewards ]

reward_std = [ np.std(r) for r in rewards ]

X = np.arange(len(rewards))
# yerr/fmt are passed by keyword: the 4th positional argument of errorbar is xerr,
# so a positional '.' would be taken as the x error instead of the marker format.
plt.errorbar(X, reward_mean, yerr=reward_std, fmt='.')
plt.xlabel("Chunk of training episodes")
plt.ylabel("Total reward per episode (mean ± std)")
plt.title("Q-learning progress");

In [None]:
# Roll out the greedy policy from the learned Q-table and count solved episodes
total_epochs, total_penalties = 0, 0
episodes = 1
wins = 0

for _ in range(episodes):
    state = env.reset()
    state_as_num = env.encode(state)
    epochs, reward = 0, 0
    done = False

    print("")
    print("Starting in state ")
    print(state, state_as_num)

    # At least one step is always taken, since done starts out False
    while True:
        action = np.argmax(q_table[state_as_num])
        state, reward, done, info = env.step(action)
        state_as_num = env.encode(state)

        print("Took action: %d"%action)
        print("New state and num: ")
        print(state, state_as_num)

        epochs += 1
        if done:
            break

    # The last reward is -1 when the episode ended because the circuit got too long
    wins += int(reward > 0)
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Wins: {wins}")
