# 5. Q-Learning

In contrast to Sarsa which is an on-policy method (it learns to improve the policy while following it), Q-learning is off-policy (it improves Q independent of the policy being followed).
The algorithm is nearly identical to Sarsa; the only difference is the update rule: <br>
Sarsa: $Q(s,a)$ <-- $Q(s,a) + \alpha [R + \gamma  Q(s',a') - Q(s,a)]$ <br> 
Q-learning: $Q(s,a)$ <-- $Q(s,a) + \alpha [R + \gamma  amax(Q(s',:)) - Q(s,a)]$ <br> 
Here amax(Q(s',:)) is the largest action value available in the next state s', regardless of which action the policy actually takes there.
In Sutton & Barto, Sarsa performs better on the cliff-walking task, but in the "Taxi-v2" and "FrozenLake" environments Q-Learning outperforms Sarsa.

Input: the policy $\pi$ <br>
Initialize $Q(s,a)$ arbitrarily <br>
Repeat (for each episode): <br>
&emsp;    Initialize s <br>
&emsp;    Repeat (for each step of episode): <br>
&emsp;&emsp;        A <-- action given by $\pi$ for s <br>
&emsp;&emsp;        Take action A; observe reward, R, and next state, S' <br>
&emsp;&emsp;        $Q(s,a)$ <-- $Q(s,a) + \alpha [R + \gamma  amax(Q(s',:)) - Q(s,a)]$ <br>
&emsp;&emsp;        S <-- S' <br>
&emsp;    until S is terminal <br>

## Example tabular Q-Learning

In [None]:
import gym
import numpy as np
from collections import deque

# --- Hyperparameters ---------------------------------------------------------
alpha = 0.85           # step size of the tabular update
gamma = 0.999          # discount applied to the next state's best value
epsilon = 1.0          # amount of exploration
epsilon_decay = 0.99   # exploration decay
num_games = 1500       # number of training episodes

# --- Environment -------------------------------------------------------------
env_name = 'Taxi-v2'
env = gym.make(env_name)
state_space = env.observation_space.n
action_space = env.action_space.n

# --- Learner state -----------------------------------------------------------
# Q-table indexed as q[state, action], initialised to zeros.
q = np.zeros((state_space, action_space))
# Keeps at most the 100 most recent episode returns for the running average.
reward_list = deque(maxlen=100)

def choose_action(q, state, epsilon):
    """Pick an action for `state` with an epsilon-greedy policy.

    Args:
        q: 2-D array of action values, indexed as ``q[state, action]``.
        state: Row index into ``q``.
        epsilon: Probability of exploring, i.e. of returning a uniformly
            random action instead of the greedy one.

    Returns:
        The index of the chosen action.
    """
    # Take the number of actions from the table itself so the function does
    # not silently depend on a module-level `action_space` variable.
    num_actions = q.shape[1]
    if np.random.uniform() < epsilon:
        return np.random.choice(num_actions)  # exploration
    return np.argmax(q[state, :])  # exploitation

# Tabular Q-learning: play `num_games` episodes, updating `q` after every step.
for game in range(num_games):
    state = env.reset()
    # The first action of the episode is drawn before epsilon is decayed.
    action = choose_action(q, state, epsilon)
    epsilon *= epsilon_decay
    done = False
    episode_reward = 0

    while not done:
        state_next, reward, done, _ = env.step(action)
        episode_reward += reward
        # Draw the next behaviour action *before* touching the table: when
        # state_next == state the update below could change the greedy choice.
        action_next = choose_action(q, state_next, epsilon)
        # Off-policy target: bootstrap from the best action value in the next
        # state, independent of the action that will actually be taken.
        td_target = reward + gamma * np.amax(q[state_next, :])
        q[state, action] = q[state, action] + alpha * (td_target - q[state, action])
        state = state_next
        action = action_next

    # The while loop only exits once the episode is done.
    reward_list.append(episode_reward)
    if game % 100 == 0:
        print('avg reward: ', np.mean(reward_list))



## Q-Learning using a NN
For big state-action spaces the memory required for Q can exceed the available RAM.
In such cases the lookup table for Q can be approximated by some function e.g. a NN which maps the current state to a corresponding Q-value.
With a NN the values for Q can't simply be updated directly; instead the algorithm has to figure out how to update the weights of the NN. This is done with backpropagation on the loss $\sum (Q_{target} - Q)^2$.

In [None]:
import gym
import numpy as np
import tensorflow as tf
from collections import deque
import time

# --- Hyperparameters ---------------------------------------------------------
gamma = 0.999          # discount applied to the next state's best value
epsilon = 1.0          # amount of exploration
epsilon_decay = 0.99   # exploration decay
num_games = 1500       # number of training episodes

# --- Environment -------------------------------------------------------------
env_name = 'Taxi-v2'
env = gym.make(env_name)
state_space = env.observation_space.n
action_space = env.action_space.n

# Keeps at most the 100 most recent episode returns for the running average.
reward_list = deque(maxlen=100)

# --- Graph -------------------------------------------------------------------
# A single weight matrix maps a one-hot state row to one Q-value per action.
tf.reset_default_graph()
inputs1 = tf.placeholder(dtype=tf.float32, shape=[1, state_space])
W = tf.Variable(tf.random_uniform([state_space, action_space], 0, 0.01))
Qout = tf.matmul(inputs1, W)
predict = tf.argmax(Qout, 1)  # greedy action for the fed state

# Training op: sum of squared differences between the fed target Q-values and
# the network output, minimised with plain gradient descent.
nextQ = tf.placeholder(dtype=tf.float32, shape=[1, action_space])
squared_error = tf.square(nextQ - Qout)
loss = tf.reduce_sum(squared_error)
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)
init = tf.global_variables_initializer()

# Rows of the identity matrix are the one-hot encodings of the states. Build it
# once instead of allocating a state_space x state_space matrix three times
# per environment step.
one_hot_states = np.identity(state_space)

with tf.Session() as sess:
    sess.run(init)
    for game in range(num_games):
        state = env.reset()
        epsilon *= epsilon_decay
        done = False
        episode_reward = 0

        while not done:
            # Slicing (instead of indexing) keeps the [1, state_space] shape
            # the placeholder expects.
            state_vec = one_hot_states[state:state + 1]
            action, target_Q = sess.run([predict, Qout], feed_dict={inputs1: state_vec})
            # Epsilon-greedy: occasionally replace the greedy action.
            if np.random.rand(1) < epsilon:
                action[0] = env.action_space.sample()
            state_next, reward, done, _ = env.step(action[0])
            episode_reward += reward
            Q1 = sess.run(Qout, feed_dict={inputs1: one_hot_states[state_next:state_next + 1]})

            # Only the entry of the taken action gets a new target; the other
            # entries equal the current prediction and contribute zero loss.
            target_Q[0, action[0]] = reward + gamma * np.max(Q1)
            sess.run(updateModel, feed_dict={inputs1: state_vec, nextQ: target_Q})
            state = state_next

        # The while loop only exits once the episode is done.
        reward_list.append(episode_reward)
        if game % 50 == 0:
            print('avg reward: ', np.mean(reward_list))



In most cases, training with gradient descent instead of the simple tabular update rule is less stable and therefore less efficient.
Two extensions that improve the efficiency are experience replay and frozen target networks.

## Keras implementation for Q-Learning using NN

In [14]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from collections import deque
from keras.initializers import RandomUniform

# --- Hyperparameters ---------------------------------------------------------
gamma = 0.999          # discount applied to the next state's best value
epsilon = 1.0          # amount of exploration
epsilon_decay = 0.99   # exploration decay
num_games = 1500       # number of training episodes

# --- Environment -------------------------------------------------------------
env_name = 'Taxi-v2'
env = gym.make(env_name)
state_space = env.observation_space.n
action_space = env.action_space.n

# Keeps at most the 100 most recent episode returns for the running average.
reward_list = deque(maxlen=100)
# Kernel initializer handed to the network layer: uniform in [0, 0.01].
init = RandomUniform(minval=0.0, maxval=0.01)

def create_model():
    """Build and compile the Q-network.

    The network is a single linear, bias-free Dense layer mapping a
    `state_space`-wide input to `action_space` outputs. It reads
    `state_space`, `action_space` and `init` from module scope.

    Returns:
        The compiled Keras model (MSE loss, SGD with lr=0.1).
    """
    # Alternative architecture (not used): a Dense(10, relu) hidden layer
    # followed by a linear Dense(action_space) output layer.
    q_layer = Dense(
        action_space,
        input_dim=state_space,
        activation='linear',
        kernel_initializer=init,
        use_bias=False,
    )
    net = Sequential()
    net.add(q_layer)
    net.compile(loss='mse', optimizer=SGD(lr=0.1))
    return net

# Q-network used both to predict Q-values and as the model being trained.
model = create_model()

def state_to_Qvalue(state, model):
    """Return whatever `model.predict` yields for `state`.

    Args:
        state: Input passed unchanged to the model.
        model: Object exposing a ``predict`` method.
    """
    return model.predict(state)

# Rows of the identity matrix are the one-hot encodings of the states. Build it
# once instead of allocating a state_space x state_space matrix twice per
# environment step.
one_hot_states = np.identity(state_space)

for game in range(num_games):
    state_index = env.reset()
    # Slicing (instead of indexing) keeps a leading batch axis of size 1.
    state = one_hot_states[state_index:state_index + 1]
    epsilon *= epsilon_decay
    done = False
    episode_reward = 0

    while not done:
        target_Q = state_to_Qvalue(state, model)
        # Epsilon-greedy action selection.
        if np.random.rand(1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(target_Q)

        next_index, reward, done, _ = env.step(action)
        state_next = one_hot_states[next_index:next_index + 1]
        episode_reward += reward
        Q1 = state_to_Qvalue(state_next, model)

        # Only the entry of the taken action gets a new target; the remaining
        # entries stay at the current prediction.
        target_Q[0, action] = reward + gamma * np.max(Q1)
        model.fit(state, target_Q, verbose=0, epochs=1)

        state = state_next

    # The while loop only exits once the episode is done.
    reward_list.append(episode_reward)
    if game % 50 == 0:
        print('avg reward: ', np.mean(reward_list))



avg reward:  -821.0
avg reward:  -662.6470588235294
avg reward:  -559.14
avg reward:  -385.13
avg reward:  -288.81
avg reward:  -242.89
avg reward:  -203.66
avg reward:  -184.24
avg reward:  -168.93
avg reward:  -156.68
avg reward:  -149.23
avg reward:  -139.01
avg reward:  -138.23
avg reward:  -128.56
avg reward:  -125.64
avg reward:  -119.51
avg reward:  -105.7
avg reward:  -99.09
avg reward:  -97.78
avg reward:  -106.69
avg reward:  -86.45
avg reward:  -71.27
avg reward:  -77.04
avg reward:  -70.01
avg reward:  -60.96
avg reward:  -57.75
avg reward:  -52.83
avg reward:  -43.54
avg reward:  -40.29
avg reward:  -37.93


# Adding a frozen target network

In [16]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from collections import deque
from keras.initializers import RandomUniform

# --- Hyperparameters ---------------------------------------------------------
gamma = 0.999          # discount applied to the next state's best value
epsilon = 1.0          # amount of exploration
epsilon_decay = 0.99   # exploration decay
num_games = 1500       # number of training episodes

# --- Environment -------------------------------------------------------------
env_name = 'Taxi-v2'
env = gym.make(env_name)
state_space = env.observation_space.n
action_space = env.action_space.n

# Keeps at most the 100 most recent episode returns for the running average.
reward_list = deque(maxlen=100)
# Kernel initializer handed to the network layer: uniform in [0, 0.01].
init = RandomUniform(minval=0.0, maxval=0.01)

def create_model():
    """Build and compile a Q-network.

    The network is a single linear Dense layer mapping a `state_space`-wide
    input to `action_space` outputs. It reads `state_space`, `action_space`
    and `init` from module scope.

    Returns:
        The compiled Keras model (MSE loss, SGD with lr=0.1).
    """
    # Alternative architecture (not used): a Dense(10, relu) hidden layer
    # followed by a linear Dense(action_space) output layer.
    q_layer = Dense(
        action_space,
        input_dim=state_space,
        activation='linear',
        kernel_initializer=init,
    )
    net = Sequential()
    net.add(q_layer)
    net.compile(loss='mse', optimizer=SGD(lr=0.1))
    return net

def target_model_update():
    """Overwrite the weights of `target_model` with the current weights of `model`."""
    online_weights = model.get_weights()
    target_model.set_weights(online_weights)

# Online network: selects actions and is trained on every step.
model = create_model()
# Target network: supplies the bootstrap values for the training targets.
target_model = create_model()
# Each create_model() call draws its own random initial weights, so start the
# target network from the online network's weights; otherwise the targets of
# the first episode come from an unrelated initialisation.
target_model.set_weights(model.get_weights())

def state_to_Qvalue(state, model):
    """Return whatever `model.predict` yields for `state`.

    Args:
        state: Input passed unchanged to the model.
        model: Object exposing a ``predict`` method (here either the online
            or the target network).
    """
    return model.predict(state)

# Rows of the identity matrix are the one-hot encodings of the states. Build it
# once instead of allocating a state_space x state_space matrix twice per
# environment step.
one_hot_states = np.identity(state_space)

for game in range(num_games):
    state_index = env.reset()
    # Slicing (instead of indexing) keeps a leading batch axis of size 1.
    state = one_hot_states[state_index:state_index + 1]
    epsilon *= epsilon_decay
    done = False
    episode_reward = 0

    while not done:
        target_Q = state_to_Qvalue(state, model)
        # Epsilon-greedy action selection with the online network.
        if np.random.rand(1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(target_Q)

        next_index, reward, done, _ = env.step(action)
        state_next = one_hot_states[next_index:next_index + 1]
        episode_reward += reward
        # Bootstrap from the frozen target network, not the online one.
        Q1 = state_to_Qvalue(state_next, target_model)

        # Only the entry of the taken action gets a new target; the remaining
        # entries stay at the current prediction.
        target_Q[0, action] = reward + gamma * np.max(Q1)
        model.fit(state, target_Q, verbose=0, epochs=1)

        state = state_next

    # The while loop only exits once the episode is done: refresh the frozen
    # target network once per episode, then log.
    target_model_update()
    reward_list.append(episode_reward)
    if game % 50 == 0:
        print('avg reward: ', np.mean(reward_list))



avg reward:  -884.0
avg reward:  -650.5882352941177
avg reward:  -558.61
avg reward:  -416.74
avg reward:  -328.3
avg reward:  -272.69
avg reward:  -236.33
avg reward:  -208.78
avg reward:  -179.42
avg reward:  -161.08
avg reward:  -156.77
avg reward:  -154.71
avg reward:  -154.42
avg reward:  -156.79
avg reward:  -160.03
avg reward:  -164.95
avg reward:  -167.91
avg reward:  -167.69


KeyboardInterrupt: 

# Adding memory replay

In [None]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from collections import deque
from keras.initializers import RandomUniform
import random

# --- Hyperparameters ---------------------------------------------------------
gamma = 0.999          # discount applied to the bootstrapped next-state value
epsilon = 1.0          # amount of exploration
epsilon_decay = 0.99   # exploration decay
num_games = 1500       # number of training episodes
minibatch_size = 30    # transitions sampled per replay step

# --- Environment -------------------------------------------------------------
env_name = 'Taxi-v2'
env = gym.make(env_name)
state_space = env.observation_space.n
action_space = env.action_space.n

# --- Buffers -----------------------------------------------------------------
# Replay buffer: keeps at most the 500 most recently appended transitions.
memory = deque(maxlen=500)
# Keeps at most the 100 most recent episode returns for the running average.
reward_list = deque(maxlen=100)
# Kernel initializer handed to the network layer: uniform in [0, 0.01].
init = RandomUniform(minval=0.0, maxval=0.01)

def replay():
    """Train `model` on one random minibatch drawn from `memory`.

    Samples `minibatch_size` transitions ``(s, a, r, s_prime, done)`` and fits
    the online network towards these targets for the taken actions:

    * ``r`` if the transition ended the episode,
    * ``r + gamma * Q_target(s', argmax_a Q_online(s', a))`` otherwise, i.e.
      the action is chosen by `model` and evaluated by `target_model`
      (a Double-DQN style target).

    Reads `memory`, `minibatch_size`, `gamma`, `model` and `target_model`
    from module scope. Returns nothing.
    """
    minibatch = random.sample(memory, minibatch_size)
    # Unpack column-wise. Wrapping the raw tuples in one np.array would build
    # a ragged object array, which NumPy >= 1.24 rejects with a ValueError.
    states, actions, rewards, next_states, dones = zip(*minibatch)

    state = np.vstack(states)            # one row per sampled state
    state_next = np.vstack(next_states)  # one row per sampled next state
    actions = np.asarray(actions, dtype=int)
    y = np.asarray(rewards, dtype=float)
    not_done = ~np.asarray(dones, dtype=bool)

    # Terminal transitions keep y = r; all others add the discounted
    # bootstrap value of the next state.
    if not_done.any():
        predict_sprime = model.predict(state_next)
        predict_sprime_target = target_model.predict(state_next)

        best_next_action = np.argmax(predict_sprime[not_done], axis=1)
        y[not_done] += gamma * predict_sprime_target[not_done][
            np.arange(best_next_action.size), best_next_action
        ]

    # Start from the current predictions so only the taken actions produce a
    # non-zero error.
    target = model.predict(state)
    target[np.arange(minibatch_size), actions] = y
    model.fit(state, target, epochs=1, verbose=0)

def create_model():
    """Build and compile a Q-network.

    The network is a single linear Dense layer mapping a `state_space`-wide
    input to `action_space` outputs. It reads `state_space`, `action_space`
    and `init` from module scope.

    Returns:
        The compiled Keras model (MSE loss, SGD with lr=0.1).
    """
    # Alternative architecture (not used): a Dense(10, relu) hidden layer
    # followed by a linear Dense(action_space) output layer.
    q_layer = Dense(
        action_space,
        input_dim=state_space,
        activation='linear',
        kernel_initializer=init,
    )
    net = Sequential()
    net.add(q_layer)
    net.compile(loss='mse', optimizer=SGD(lr=0.1))
    return net

def target_model_update():
    """Overwrite the weights of `target_model` with the current weights of `model`."""
    online_weights = model.get_weights()
    target_model.set_weights(online_weights)

def add_memory(s, a, r, s_prime, done):
    """Append one transition to the module-level replay buffer `memory`.

    Args:
        s: State before the step.
        a: Action taken.
        r: Reward received.
        s_prime: State after the step.
        done: Flag returned by the environment step for this transition.
    """
    transition = (s, a, r, s_prime, done)
    memory.append(transition)
    
# Online network: selects actions and is trained by replay().
model = create_model()
# Target network: evaluates next-state values inside replay().
target_model = create_model()
# Each create_model() call draws its own random initial weights, so start the
# target network from the online network's weights; otherwise the targets of
# the first episode come from an unrelated initialisation.
target_model.set_weights(model.get_weights())

def state_to_Qvalue(state, model):
    """Return whatever `model.predict` yields for `state`.

    Args:
        state: Input passed unchanged to the model.
        model: Object exposing a ``predict`` method.
    """
    return model.predict(state)

# Rows of the identity matrix are the one-hot encodings of the states. Build it
# once instead of allocating a state_space x state_space matrix twice per
# environment step.
one_hot_states = np.identity(state_space)

for game in range(num_games):
    state_index = env.reset()
    # Slicing (instead of indexing) keeps a leading batch axis of size 1.
    state = one_hot_states[state_index:state_index + 1]
    epsilon *= epsilon_decay
    done = False
    episode_reward = 0

    while not done:
        # Epsilon-greedy action selection with the online network.
        q_values = state_to_Qvalue(state, model)
        if np.random.rand(1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_values)

        next_index, reward, done, _ = env.step(action)
        state_next = one_hot_states[next_index:next_index + 1]
        add_memory(state, action, reward, state_next, done)
        episode_reward += reward

        # All learning happens in replay(); no per-step target is computed
        # here because nothing would consume it.
        if len(memory) > minibatch_size:
            replay()

        state = state_next

    # The while loop only exits once the episode is done: refresh the frozen
    # target network once per episode, then log.
    target_model_update()
    reward_list.append(episode_reward)
    if game % 50 == 0:
        print('avg reward: ', np.mean(reward_list))



avg reward:  -866.0
avg reward:  -645.9411764705883
avg reward:  -547.33
avg reward:  -410.26
avg reward:  -329.78
