In [None]:
#Taken from - https://learning.oreilly.com/library/view/hands-on-machine-learning/9781492032632/ch18.html#mdp_diagram

In [11]:
import numpy as np

In [12]:
transition_probabilities = [[[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]],
                           [[0.0, 1.0, 0.0], None, [0.0, 0.0, 1.0]],
                           [None, [0.8, 0.1, 0.1], None]]
rewards = [[[+10, 0, 0], [0, 0, 0], [0, 0, 0]],
           [[0, 0, 0], [0, 0, 0], [0, 0, -50]],
           [[0, 0, 0], [+40, 0, 0], [0, 0, 0]]]
possible_actions = [[0,1,2], [0,2], [1]]

In [16]:
Q_values = np.full((3,3), -np.inf)
for state, action in enumerate(possible_actions):
    Q_values[state, action] = 0.0

In [27]:
gamma = 0.9

for iteration in range(50):
    Q_prev = Q_values.copy()
    for s in range(3):
        for a in possible_actions[s]:
            Q_values[s,a] = np.sum(transition_probabilities[s][a][sp]*(rewards[s][a][sp]+gamma*np.max(Q_prev[sp])) for sp in range(3))

  Q_values[s,a] = np.sum(transition_probabilities[s][a][sp]*(rewards[s][a][sp]+gamma*np.max(Q_prev[sp])) for sp in range(3))


In [29]:
Q_values[0]

array([18.91891892, 17.02702703, 13.62162162])

In [28]:
Q_values

array([[18.91891892, 17.02702703, 13.62162162],
       [ 0.        ,        -inf, -4.87971488],
       [       -inf, 50.13365013,        -inf]])

In [31]:
np.argmax(Q_values, axis = 1)

array([0, 0, 1], dtype=int64)

In [46]:
def step(state, action):
    probas = transition_probabilities[state][action]
    next_state = np.random.choice([0,1,2], p = probas)
    reward = rewards[state][action][next_state]
    return next_state, reward

In [47]:
def exploration_policy(state):
    return np.random.choice(possible_actions[state])

In [52]:
Q_values = np.full((3,3), -np.inf)
for state, action in enumerate(possible_actions):
    Q_values[state, action] = 0.0

In [53]:
alpha0 = 0.05
decay = 0.005
gamma = 0.90
state = 0

for iteration in range(10000):
    action = exploration_policy(state)
    next_state, reward = step(state, action)
    next_value = np.max(Q_values[next_state])
    alpha = alpha0/(1+iteration*decay)
    Q_values[state,action] *= 1 - alpha
    Q_values[state, action] += alpha*(reward + gamma*next_value)
    state = next_state

In [51]:
Q_values

array([[19.56960729, 17.95631982, 14.27309556],
       [ 0.        ,        -inf, -4.14235746],
       [       -inf, 51.23008388,        -inf]])

DQN Algorithm

In [63]:
import gym
import tensorflow as tf

Using TensorFlow backend.


In [68]:
env = gym.make("CartPole-v0")
input_shape = [4]
n_outputs = 2

In [69]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation = "elu", input_shape = input_shape),
    tf.keras.layers.Dense(32, activation = "elu"),
    tf.keras.layers.Dense(n_outputs)
])

In [79]:
def epsilon_greedy_policy(state, epsilon = 0):
    if np.random.rand()<epsilon:
        return np.random.randint(2)
    else:
        Q_values = model.predict(state[np.newaxis])
        return np.argmax(Q_values[0])

In [80]:
from collections import deque
replay_buffer = deque(maxlen=2000)

In [89]:
def sample_experiences(batch_size):
    indices = np.random.randint(len(replay_buffer), size = batch_size)
    batch = [replay_buffer[index] for index in indices]
    states, actions, rewards, next_state, dones = [
        np.array([experience[field_index] for experience in batch])
    for field_index in range(5)]
    return states, actions, rewards, next_states, dones

In [90]:
def play_one_step(env, state, epsilon):
    action = epsilon_greedy_policy(state, epsilon)
    next_state, reward, done, info = env.step(action)
    replay_buffer.append((state, action, reward, next_state, done))
    return next_state, reward, done, info

In [None]:
batch_size = 32
discount_factor = 0.95
optimizer = keras.optimizers.Adam(lr=1e-3)
loss_fn = keras.losses.mean_squared_error

def training_step(batch_size):
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones = experiences
    