## Introduction .. 

This notebook mimics the imeplmentation of Siraj's notebook hereunder .. 
https://github.com/llSourcell/deep_q_learning/blob/master/03_PlayingAgent.ipynb

In [35]:
from keras.models import Sequential      # One layer after the other
from keras.layers import Dense, Flatten  # Dense layers are fully connected layers, Flatten layers flatten out multidimensional inputs
from collections import deque            # For storing moves 

import numpy as np
import gym  

import random 

## warnings
import warnings
warnings.filterwarnings("ignore")

In [36]:
env = gym.make('MountainCar-v0')          # Choose game (any in the gym should work)

In [37]:
env

<TimeLimit<MountainCarEnv<MountainCar-v0>>>

## Building the model: 

Create network using Keras..
Input is two consecutive game states, output is Q-values of the possible moves.


In [38]:
model = Sequential()
model.add(Dense(20, input_shape=(2,) + env.observation_space.shape, init='uniform', activation='relu'))
model.add(Flatten()) # Flatten input so as to have no problems with processing
model.add(Dense(18, init='uniform', activation='relu'))
model.add(Dense(10, init='uniform', activation='relu'))
model.add(Dense(env.action_space.n, init='uniform', activation='linear'))    # Same number of outputs as possible actions

In [39]:
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [40]:
## Init vars ..

D = deque()                                # Register where the actions will be stored
observetime = 500                          # Number of timesteps we will be acting on the game and observing results
epsilon = 0.7                              # Probability of doing a random move
gamma = 0.9                                # Discounted future reward. How much we care about steps further in time
mb_size = 50                               # Learning minibatch size

## 1. Observation phase:
Knowing what each action does (Observing)

In [41]:
observation = env.reset()                     # Game begins
obs = np.expand_dims(observation, axis=0)     # (Formatting issues) Making the observation the first element of a batch of inputs 
state = np.stack((obs, obs), axis=1)
done = False

print obs
print state

[[-0.59319545  0.        ]]
[[[-0.59319545  0.        ]
  [-0.59319545  0.        ]]]


In [42]:
for t in range(observetime):
    if np.random.rand() <= epsilon:
        action = np.random.randint(0, env.action_space.n, size=1)[0]
    else: #Q_learning
        Q = model.predict(state)          # Q-values predictions
        action = np.argmax(Q)             # Move with highest Q-value is the chosen one
        
    observation_new, reward, done, info = env.step(action)     # See state of the game, reward... after performing the action
    obs_new = np.expand_dims(observation_new, axis=0)          # (Formatting issues)
    state_new = np.append(np.expand_dims(obs_new, axis=0), state[:, :1, :], axis=1)     # Update the input with the new state of the game
    D.append((state, action, reward, state_new, done))         # 'Remember' action and consequence
    state = state_new         # Update state
    if done:
        env.reset()           # Restart game if it's finished
        obs = np.expand_dims(observation, axis=0)     # (Formatting issues) Making the observation the first element of a batch of inputs 
        state = np.stack((obs, obs), axis=1)
        
state

array([[[-0.52951826, -0.00276455],
        [-0.5267537 , -0.00378822]]])

## 2. Experience Reply:
Learning from the observations

In [43]:
minibatch = random.sample(D, mb_size)
minibatch

[(array([[[-0.50946858,  0.01168701],
          [-0.52115558,  0.01070533]]]),
  1,
  -1.0,
  array([[[-0.49788751,  0.01158106],
          [-0.50946858,  0.01168701]]]),
  False),
 (array([[[-0.45920599,  0.01058325],
          [-0.46978924,  0.01098507]]]),
  0,
  -1.0,
  array([[[-0.45010269,  0.0091033 ],
          [-0.45920599,  0.01058325]]]),
  False),
 (array([[[-0.3260864 ,  0.00365553],
          [-0.32974194,  0.00602888]]]),
  0,
  -1.0,
  array([[[-0.32482704,  0.00125937],
          [-0.3260864 ,  0.00365553]]]),
  False),
 (array([[[-0.52511013,  0.00976419],
          [-0.53487432,  0.00867964]]]),
  2,
  -1.0,
  array([[[-0.51433461,  0.01077552],
          [-0.52511013,  0.00976419]]]),
  False),
 (array([[[ -5.02947025e-01,   1.46344345e-04],
          [ -5.03093369e-01,  -6.99962086e-04]]]),
  1,
  -1.0,
  array([[[ -5.02955470e-01,  -8.44471655e-06],
          [ -5.02947025e-01,   1.46344345e-04]]]),
  False),
 (array([[[-0.5094851 ,  0.00061807],
          [-0.510

In [44]:
inputs_shape = (mb_size,) + state.shape[1:]
inputs_shape

(50, 2, 2)

In [45]:
inputs = np.zeros(inputs_shape)

In [46]:
targets = np.zeros((mb_size, env.action_space.n))
np.shape(targets)

(50, 3)

In [47]:
## selecting from D

for i in range(0, mb_size):
    state = minibatch[i][0]
    action = minibatch[i][1]
    reward = minibatch[i][2]
    state_new = minibatch[i][3]
    done = minibatch[i][4]
    
    # Build Bellman equation for the Q function
    inputs[i:i+1] = np.expand_dims(state, axis=0)
    targets[i] = model.predict(state)
    Q_sa = model.predict(state_new)
    
    if done:
        targets[i, action] = reward
    else:
        targets[i, action] = reward + gamma * np.max(Q_sa)
        
    # Train network to output the Q function
    model.train_on_batch(inputs, targets)
print "Learning is done"

Learning is done


## 3. Play!

In [48]:
observation = env.reset()
obs = np.expand_dims(observation, axis=0)
state = np.stack((obs, obs), axis=1)
done = False
tot_reward = 0.0

while not done:
    env.render()                    # Uncomment to see game running
    Q = model.predict(state)        
    action = np.argmax(Q)         
    observation, reward, done, info = env.step(action)
    obs = np.expand_dims(observation, axis=0)
    state = np.append(np.expand_dims(obs, axis=0), state[:, :1, :], axis=1)    
    tot_reward += reward
print('Game ended! Total reward: {}'.format(reward))

Game ended! Total reward: -1.0
