# 6. Deep Q-Learning

Input: the policy $\pi$ <br>
Initialize $Q(s,a)$ arbitrarily <br>
Repeat (for each episode): <br>
&emsp;    Initialize s <br>
&emsp;    Repeat (for each step of episode): <br>
&emsp;&emsp;        A <-- action given by $\pi$ for s <br>
&emsp;&emsp;        Taken action A; observe reward, R, and next state, S' <br>
&emsp;&emsp;        $Q(s,a)$ <-- $Q(s,a) + \alpha [R + \gamma  amax(Q(s',:)) - Q(s,a)]$ <br>
&emsp;&emsp;        S <-- S' <br>
&emsp;    until S is terminal <br>

## Tensorflow implementation for Q-Learning using a NN
For big state-action-spaces the required memory for Q can exceeed the available RAM.
In such cases the lookup table for Q can be approximated by some function e.g. a NN which maps the current state to a corresponding Q-value.
With a NN the values for Q can't just simply be updated, instead the algorithm has to figure out how to update the weights of the NN. This is done by using backpropagation using the loss sum(q_target - Q)^2

In [1]:
import gym
import numpy as np
import tensorflow as tf
from collections import deque
import time
np.random.seed(42)

#env_name = 'Taxi-v2'
env_name = 'FrozenLake-v0'

env = gym.make(env_name)
state_space = env.observation_space.n
action_space = env.action_space.n

gamma = 0.999
epsilon = 1.0 # amount of exploration
epsilon_decay = 0.99 # exploration decay
num_games = 1000 

reward_list = deque(maxlen=100)

tf.reset_default_graph()
inputs1 = tf.placeholder(shape=[1,state_space], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([state_space, action_space], 0, 0.01))
Qout = tf.matmul(inputs1, W)
predict = tf.argmax(Qout, 1)

nextQ = tf.placeholder(shape=[1,action_space], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for game in range(num_games):
        state = env.reset()
        epsilon *= epsilon_decay
        done = False
        episode_reward = 0
        
        while(not done):
            action, target_Q = sess.run([predict, Qout], feed_dict={inputs1: np.identity(state_space)[state:state+1]})
            if np.random.rand(1) < epsilon:
                action[0] = env.action_space.sample()
            state_next, reward, done, _ = env.step(action[0])
            episode_reward += reward
            Q1 = sess.run(Qout, feed_dict={inputs1:np.identity(state_space)[state_next:state_next+1]})
            
            target_Q[0, action[0]] = reward + gamma*np.max(Q1)
            _, W1 = sess.run([updateModel, W], feed_dict={inputs1:np.identity(state_space)[state:state+1], nextQ:target_Q})
            state = state_next
            
            
            if(done):
                reward_list.append(episode_reward)
                if(game%50 == 0):
                    print('avg reward: ', np.mean(reward_list))
                break



  from ._conv import register_converters as _register_converters


avg reward:  0.0
avg reward:  0.0196078431372549
avg reward:  0.01
avg reward:  0.06
avg reward:  0.09
avg reward:  0.12
avg reward:  0.31
avg reward:  0.51
avg reward:  0.64
avg reward:  0.65
avg reward:  0.61
avg reward:  0.66
avg reward:  0.74
avg reward:  0.77
avg reward:  0.71
avg reward:  0.66
avg reward:  0.62
avg reward:  0.67
avg reward:  0.75
avg reward:  0.74


## Keras implementation for Q-Learning using NN
For some reason Keras behaves differently than Tensorflow even though Tensorflow is used as backend

In [42]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from collections import deque
from keras.initializers import RandomUniform
np.random.seed(42)

#env_name = 'Taxi-v2'
env_name = 'FrozenLake-v0'

env = gym.make(env_name)
state_space = env.observation_space.n
action_space = env.action_space.n

gamma = 0.999
epsilon = 1.0 # amount of exploration
epsilon_decay = 0.999 # exploration decay
num_games = 5000 

reward_list = deque(maxlen=100)

def create_model():
    model = Sequential()
    model.add(Dense(10, input_dim=state_space, activation='relu'))
    model.add(Dense(action_space, activation='linear'))
    model.compile(loss='mse',optimizer=SGD(lr=0.1))
    return model

model = create_model()

def state_to_Qvalue(state, model):
    Qvalue = model.predict(state)
    return Qvalue

for game in range(num_games):
    state = env.reset()
    state = np.identity(state_space)[state:state+1] # transforms state into 1-hot-encoding
    epsilon *= epsilon_decay
    done = False
    episode_reward = 0

    while(not done):
        target_Q = state_to_Qvalue(state, model)
        if np.random.rand(1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(target_Q)

        state_next, reward, done, _ = env.step(action)
        state_next = np.identity(state_space)[state_next:state_next+1]
        episode_reward += reward
        Q1 = state_to_Qvalue(state_next, model)

        target_Q[0, action] = reward + gamma*np.max(Q1)
        model.train_on_batch(state, target_Q)
        #model.fit(state, target_Q, verbose=0, epochs=1)

        state = state_next
        if(done):
            reward_list.append(episode_reward)
            if(game%200 == 0):
                print('episode: ', game, 'avg reward: ', np.mean(reward_list))



episode:  0 avg reward:  0.0
episode:  200 avg reward:  0.0
episode:  400 avg reward:  0.01
episode:  600 avg reward:  0.0
episode:  800 avg reward:  0.02
episode:  1000 avg reward:  0.0
episode:  1200 avg reward:  0.02
episode:  1400 avg reward:  0.05
episode:  1600 avg reward:  0.06
episode:  1800 avg reward:  0.07
episode:  2000 avg reward:  0.05
episode:  2200 avg reward:  0.06
episode:  2400 avg reward:  0.04
episode:  2600 avg reward:  0.05
episode:  2800 avg reward:  0.04
episode:  3000 avg reward:  0.03
episode:  3200 avg reward:  0.01
episode:  3400 avg reward:  0.01
episode:  3600 avg reward:  0.02
episode:  3800 avg reward:  0.04
episode:  4000 avg reward:  0.02
episode:  4200 avg reward:  0.01
episode:  4400 avg reward:  0.06
episode:  4600 avg reward:  0.1
episode:  4800 avg reward:  0.02


# Adding a frozen target policy
In most cases training with Gradient descent instead of the simple tabular update rule, is more unstable and therefore less efficient. Two extensions to improve the efficiency are Experience Replay and Freezing target Networks. In the approach above where the same networks influence the target and the Q-prediction, an effect similar to a dog hunting its tail can occur. 
The weights of the network which decides the next action get updated to better fit the target but at the same time the target also moves. To avoid this problem, a seperate target network has fixed weights which only get updated every once in a while.

In [43]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from collections import deque
from keras.initializers import RandomUniform
np.random.seed(42)

#env_name = 'Taxi-v2'
env_name = 'FrozenLake-v0'

env = gym.make(env_name)
state_space = env.observation_space.n
action_space = env.action_space.n

gamma = 0.999
epsilon = 1.0 # amount of exploration
epsilon_decay = 0.999 # exploration decay
num_games = 5000 

reward_list = deque(maxlen=100)

def create_model():
    model = Sequential()
    model.add(Dense(10, input_dim=state_space, activation='relu'))
    model.add(Dense(action_space, activation='linear'))
    model.compile(loss='mse',optimizer=SGD(lr=0.1))
    return model

def target_model_update():
    target_model.set_weights(model.get_weights())

model = create_model()
target_model = create_model()

def state_to_Qvalue(state, model):
    Qvalue = model.predict(state)
    return Qvalue

for game in range(num_games):
    state = env.reset()
    state = np.identity(state_space)[state:state+1] # transforms state into 1-hot-encoding
    epsilon *= epsilon_decay
    done = False
    episode_reward = 0

    while(not done):
        target_Q = state_to_Qvalue(state, model)
        if np.random.rand(1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(target_Q)

        state_next, reward, done, _ = env.step(action)
        state_next = np.identity(state_space)[state_next:state_next+1]
        episode_reward += reward
        Q1 = state_to_Qvalue(state_next, target_model)

        target_Q[0, action] = reward + gamma*np.max(Q1)
        #model.fit(state, target_Q, verbose=0, epochs=1)
        model.train_on_batch(state, target_Q)

        state = state_next
        if(done):
            target_model_update()
            reward_list.append(episode_reward)
            if(game%200 == 0):
                print('episode: ', game, 'avg reward: ', np.mean(reward_list))



episode:  0 avg reward:  0.0
episode:  200 avg reward:  0.02
episode:  400 avg reward:  0.04
episode:  600 avg reward:  0.01
episode:  800 avg reward:  0.01
episode:  1000 avg reward:  0.04
episode:  1200 avg reward:  0.01
episode:  1400 avg reward:  0.02
episode:  1600 avg reward:  0.07
episode:  1800 avg reward:  0.01
episode:  2000 avg reward:  0.03
episode:  2200 avg reward:  0.03
episode:  2400 avg reward:  0.05
episode:  2600 avg reward:  0.05
episode:  2800 avg reward:  0.02
episode:  3000 avg reward:  0.09
episode:  3200 avg reward:  0.09
episode:  3400 avg reward:  0.06
episode:  3600 avg reward:  0.08
episode:  3800 avg reward:  0.04
episode:  4000 avg reward:  0.09
episode:  4200 avg reward:  0.01
episode:  4400 avg reward:  0.0
episode:  4600 avg reward:  0.01
episode:  4800 avg reward:  0.02


# Adding memory replay
It is very inefficient to use a played sample only once, memory replay saves a certain number of (s, a, r, s') in a list and reuses samples of this memory for batch training. This can break temporal correlations and speed up training 

In [41]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from collections import deque
from keras.initializers import RandomUniform
import random
import time
np.random.seed(42)

#env_name = 'Taxi-v2'
env_name = 'FrozenLake-v0'

env = gym.make(env_name)
state_space = env.observation_space.n
action_space = env.action_space.n

gamma = 0.999
epsilon = 1.0 # amount of exploration
epsilon_decay = 0.99 # exploration decay
num_games = 1000 
memory = deque(maxlen=100)
minibatch_size = 20

reward_list = deque(maxlen=100)

def replay():
    minibatch = random.sample(memory, minibatch_size)
    minibatch = np.array(minibatch)
    not_done_indices = np.where(minibatch[:, 4] == False)
    
    state = np.vstack(minibatch[:, 0])
    action = np.vstack(minibatch[:, 1])

    y = np.copy(minibatch[:, 2])
    
    # If minibatch contains any non-terminal states, use separate update rule for those states
    if len(not_done_indices[0]) > 0:
        predict_sprime = model.predict(np.vstack(minibatch[:, 3]))
        predict_sprime_target = target_model.predict(np.vstack(minibatch[:, 3]))

        y[not_done_indices] += np.multiply(gamma, \
                predict_sprime_target[not_done_indices, \
                np.argmax(predict_sprime[not_done_indices, :][0], axis=1)][0])

    target = model.predict(state)
    actions = np.array(minibatch[:, 1], dtype=int)
    target[range(minibatch_size), actions] = y
    model.fit(state, target, epochs=1, verbose=0)

def create_model():
    model = Sequential()
    model.add(Dense(10, input_dim=state_space, activation='relu'))
    model.add(Dense(action_space, activation='linear'))
    model.compile(loss='mse',optimizer=SGD(lr=0.1))
    return model

def target_model_update():
    target_model.set_weights(model.get_weights())

def add_memory(s, a, r, s_prime, done):
    memory.append((s, a, r, s_prime, done))
    
model = create_model()
target_model = create_model()

def state_to_Qvalue(state, model):
    Qvalue = model.predict(state)
    return Qvalue

for game in range(num_games):
    state = env.reset()
    state = np.identity(state_space)[state:state+1] # transforms state into 1-hot-encoding
    epsilon *= epsilon_decay
    done = False
    episode_reward = 0

    while(not done):
        target_Q = state_to_Qvalue(state, model)
        if np.random.rand(1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(target_Q)

        state_next, reward, done, _ = env.step(action)
        state_next = np.identity(state_space)[state_next:state_next+1]
        add_memory(state, action, reward, state_next, done)
        episode_reward += reward
        
        if len(memory) > minibatch_size:
            replay()
        
        state = state_next
        if(done):
            target_model_update()
            reward_list.append(episode_reward)
            if(game%50 == 0):
                print('avg reward: ', np.mean(reward_list))

avg reward:  0.0
avg reward:  0.0196078431372549
avg reward:  0.05
avg reward:  0.11
avg reward:  0.13
avg reward:  0.25
avg reward:  0.37
avg reward:  0.4
avg reward:  0.38
avg reward:  0.49
avg reward:  0.61
avg reward:  0.58
avg reward:  0.56
avg reward:  0.53
avg reward:  0.55
avg reward:  0.53
avg reward:  0.57
avg reward:  0.64
avg reward:  0.66
avg reward:  0.64


In [54]:
state[0]

0.0027473787777125835

In [50]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from collections import deque
from keras.initializers import RandomUniform
import random
import time
np.random.seed(42)

#env_name = 'Taxi-v2'
env_name = 'BipedalWalker-v2'
episode_length=2000


env = gym.make(env_name)
state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]

gamma = 0.999
epsilon = 1.0 # amount of exploration
epsilon_decay = 0.99 # exploration decay
num_games = 1000 
memory = deque(maxlen=100)
minibatch_size = 20

reward_list = deque(maxlen=100)

def replay():
    minibatch = random.sample(memory, minibatch_size)
    minibatch = np.array(minibatch)
    not_done_indices = np.where(minibatch[:, 4] == False)
    
    state = np.vstack(minibatch[:, 0])
    action = np.vstack(minibatch[:, 1])

    y = np.copy(minibatch[:, 2])
    
    # If minibatch contains any non-terminal states, use separate update rule for those states
    if len(not_done_indices[0]) > 0:
        predict_sprime = model.predict(np.vstack(minibatch[:, 3]))
        predict_sprime_target = target_model.predict(np.vstack(minibatch[:, 3]))

        y[not_done_indices] += np.multiply(gamma, \
                predict_sprime_target[not_done_indices, \
                np.argmax(predict_sprime[not_done_indices, :][0], axis=1)][0])

    target = model.predict(state)
    actions = np.array(minibatch[:, 1], dtype=int)
    target[range(minibatch_size), actions] = y
    model.fit(state, target, epochs=1, verbose=0)

def create_model():
    model = Sequential()
    model.add(Dense(10, input_dim=state_space, activation='relu'))
    model.add(Dense(action_space, activation='linear'))
    model.compile(loss='mse',optimizer=SGD(lr=0.1))
    return model

def target_model_update():
    target_model.set_weights(model.get_weights())

def add_memory(s, a, r, s_prime, done):
    memory.append((s, a, r, s_prime, done))
    
model = create_model()
target_model = create_model()

def state_to_Qvalue(state, model):
    Qvalue = model.predict(state)
    return Qvalue

for game in range(num_games):
    state = env.reset()
    #state = np.identity(state_space)[state:state+1] # transforms state into 1-hot-encoding
    epsilon *= epsilon_decay
    done = False
    episode_reward = 0

    for i in range(episode_length):
        target_Q = state_to_Qvalue(state, model)
        if np.random.rand(1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(target_Q)

        state_next, reward, done, _ = env.step(action)
        state_next = np.identity(state_space)[state_next:state_next+1]
        add_memory(state, action, reward, state_next, done)
        episode_reward += reward
        
        if len(memory) > minibatch_size:
            replay()
        
        state = state_next
        if(done):
            target_model_update()
            reward_list.append(episode_reward)
            if(game%50 == 0):
                print('avg reward: ', np.mean(reward_list))
            break

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


ValueError: Error when checking : expected dense_102_input to have shape (24,) but got array with shape (1,)