# Bit flipping game with DQN solver

This notebook implements a DQN solver for the bit-flipping game described in [**Hindsight Experience Replay**](https://arxiv.org/abs/1707.01495).

**Reference**:

1. Marcin Andrychowicz, Filip Wolski, Alex Ray, Jonas Schneider, Rachel Fong, Peter Welinder, Bob McGrew, Josh Tobin, Pieter Abbeel, Wojciech Zaremba. *Hindsight Experience Replay*. arXiv:1707.01495, 2017.


In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from bitflipping import bitflipping as bf
from DQN import DQN

# Default size (width, height) for every figure drawn in this notebook.
plt.rc('figure', figsize=[15, 20])
%matplotlib inline

## Set up the bit flipping game environment

In [2]:
# Number of bits in the game. The same value sizes the network input
# placeholder (2*n) and bounds the number of moves per test episode.
n = 4
bf_env = bf(n)

# NOTE(review): these two length-2 arrays are not referenced by any other
# cell visible in this notebook, and their length differs from n -- confirm
# whether they are still needed.
init_state = np.array([0, 1])
goal = np.ones(2)

## Build the DQN neural network

In [3]:
# Clear the default graph before building the network, so re-running this
# cell starts from an empty graph.
tf.reset_default_graph()

# Passed to DQN below; presumably the hidden-layer widths -- confirm in DQN.py.
hid = [256]

# Network input: one row per sample with 2*n columns (the test cell feeds the
# current state concatenated with the goal).
x = tf.placeholder(tf.float32, shape=[None, 2 * n])
# Second placeholder handed to train_Q: one column per sample.
y = tf.placeholder(tf.float32, shape=[None, 1])

agent = DQN(
    x,
    hid,
    n,
    discount=0.98,
    eps_0=1,
    eps_t=0.02,
    eps_timesteps=800,
    tau=0.95,
    replay_buffer_size=1e5,
    batch_size=128,
)

In [4]:
# Train the agent. train_Q returns two values, kept here as `losses`
# (plotted below) and `success_all`.
losses, success_all = agent.train_Q(
    x,
    y,
    timesteps=5000,
    iteration=32,
    learning_start=200,
    learning_freq=20,
    update_freq=100,
)

Timestep 220: loss is 0.0314
Timestep 240: loss is 0.0291
Timestep 260: loss is 0.0171
Timestep 280: loss is 0.0267
Timestep 300: loss is 0.0219
Timestep 320: loss is 0.0159
Timestep 340: loss is 0.0232
Timestep 360: loss is 0.0158
Timestep 380: loss is 0.0184
Timestep 400: loss is 0.0203
Timestep 420: loss is 0.0175
Timestep 440: loss is 0.0245
Timestep 460: loss is 0.0265
Timestep 480: loss is 0.0159
Timestep 500: loss is 0.00947
Timestep 520: loss is 0.0141
Timestep 540: loss is 0.0225
Timestep 560: loss is 0.0309
Timestep 580: loss is 0.0226
Timestep 600: loss is 0.0149
Timestep 620: loss is 0.0179
Timestep 640: loss is 0.0237
Timestep 660: loss is 0.0237
Timestep 680: loss is 0.0142
Timestep 700: loss is 0.0148
Timestep 720: loss is 0.0181
Timestep 740: loss is 0.00897
Timestep 760: loss is 0.012
Timestep 780: loss is 0.0308
Timestep 800: loss is 0.0211
Timestep 820: loss is 0.0258
Timestep 840: loss is 0.0164
Timestep 860: loss is 0.0219
Timestep 880: loss is 0.029
Timestep 900: 

KeyboardInterrupt: 

In [None]:
# Plot the loss values returned by agent.train_Q. The figure carries a title
# and axis labels so it can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_title('DQN training loss')
ax.set_xlabel('Index in losses')
ax.set_ylabel('Loss')
plt.show()

## Test DQN

In [None]:
# Number of evaluation games to play. The success rate printed at the end is
# computed from this value, so it stays correct if the count is changed.
N_TEST_EPISODES = 100

with tf.Session() as sess:
    # Restore the network variables from the checkpoint
    # (presumably written by agent.train_Q -- confirm in DQN.py).
    saver = tf.train.Saver()
    saver.restore(sess, '/tmp/model.ckpt')

    success = 0
    for episode in range(N_TEST_EPISODES):
        bf_env.reset()

        # The agent gets at most n moves per game.
        for step in range(n):
            # Network input: current state and goal side by side in one row.
            X = np.concatenate(
                (bf_env.state.reshape((1, -1)), bf_env.goal.reshape((1, -1))),
                axis=1,
            )
            Q = sess.run(agent.targetModel, feed_dict={x: X})
            # Greedy policy: take the action with the largest Q-value.
            action = np.argmax(Q)
            bf_env.update_state(action)
            # A reward of 0 is treated as having reached the goal.
            if bf_env.reward(bf_env.state) == 0:
                print('Success! state:{0}\t Goal state:{1}'.format(bf_env.state, bf_env.goal))
                success += 1
                break
        else:
            # Runs only when the move budget was used up without a `break`,
            # i.e. the goal was never reached in this game.
            print('Fail! state:{0}\t Goal state:{1}'.format(bf_env.state, bf_env.goal))

    print('Success rate {:g}%'.format(100.0 * success / N_TEST_EPISODES))

In [None]:
# Scratch array: a single row of six integers (shape (1, 6)).
a = np.array([[1, 2, 3, 2, 1, 3]])

In [None]:
# Display the dimensions of the scratch array `a`.
np.shape(a)

In [None]:
# With no axis given, argmax works on the flattened array and returns the
# index of the first occurrence of the maximum.
s = a.argmax()