The OpenAI Gym (https://gym.openai.com) provides many different environments and games in which to train a learning agent. The task here is to develop one such agent. We will create a neural network that, given the state of the game (actually, two consecutive states), outputs a quality value (Q-value) for each possible next move. The move with the highest Q-value is chosen and performed in the game. This theoretical formalism is taken from https://www.nervanasys.com/demystifying-deep-reinforcement-learning/

In [214]:
# INITIALIZATION: libraries, parameters, network...

from keras.models import Sequential      # One layer after the other
from keras.layers import Dense, Flatten  # Dense layers are fully connected layers, Flatten layers flatten out multidimensional inputs
from keras.optimizers import adam
from collections import deque            # For storing moves 
from tqdm import tqdm_notebook as tqdm
import numpy as np
import time
import gym                                # To train our network
env = gym.make('MountainCar-v0')          # Choose game (any in the gym should work)

import random     # For sampling batches from the observations


# Create network. Input is two consecutive game states, output is Q-values of the possible moves.
# The input shape is (2,) + observation shape, i.e. two stacked observations.
model = Sequential()
# Keras 2 names the weight initializer argument `kernel_initializer`
# (`init` is the legacy Keras 1 spelling).
# This Dense layer acts on the last axis, so each of the two observations
# is mapped to 24 units.
model.add(Dense(24, input_shape=(2,) + env.observation_space.shape,
                kernel_initializer='uniform', activation='relu'))
model.add(Flatten())       # Merge the two per-observation feature vectors into a single vector
model.add(Dense(48, kernel_initializer='uniform', activation='relu'))
model.add(Dense(24, kernel_initializer='uniform', activation='relu'))
# Linear output: one unbounded Q-value per possible action
model.add(Dense(env.action_space.n, kernel_initializer='uniform', activation='linear'))

# Q-values are regressed with mean squared error; a classification
# 'accuracy' metric carries no meaning here, so none is requested.
model.compile(loss='mse', optimizer=adam(lr=0.005))
model.summary()

# Hyperparameters
maxEpisodes = 1000        # Number of training episodes (outer training loop)
memory_size = 100000      # Capacity of the replay memory holding past experiences
steps = 500               # Time steps performed within each training episode
epsilon = 1.0             # Starting probability of doing a random move
eps_min = 0.01            # Lower bound for the random-move probability
eps_decay = 0.995         # Factor the random-move probability is multiplied by at each step
gamma = 0.9               # Discount applied to future reward: how much we care about steps further in time
mb_size = 64              # Number of experiences sampled per learning minibatch
pretrain_length = 1000    # Random steps taken up front to pre-populate the memory

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_113 (Dense)            (None, 2, 24)             72        
_________________________________________________________________
flatten_29 (Flatten)         (None, 48)                0         
_________________________________________________________________
dense_114 (Dense)            (None, 48)                2352      
_________________________________________________________________
dense_115 (Dense)            (None, 24)                1176      
_________________________________________________________________
dense_116 (Dense)            (None, 3)                 75        
Total params: 3,675
Trainable params: 3,675
Non-trainable params: 0
_________________________________________________________________




#### Creating Circular memory class

In [198]:
from collections import deque

class Memory:
    """Fixed-capacity experience store backed by a ``deque``.

    Once ``max_size`` items are stored, appending a new one makes the
    deque drop its oldest entry.
    """

    def __init__(self, max_size=1000):
        # deque(maxlen=...) discards items from the opposite end when full
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the store."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen at random.

        Raises ``ValueError`` (from ``np.random.choice``) when more items
        are requested than are currently stored.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)
        picked = []
        for pos in chosen:
            picked.append(self.buffer[pos])
        return picked

#### Pre-populating memory

In [215]:
# Start a fresh game and build the initial two-observation state
observation = env.reset()
obs = np.expand_dims(observation, axis=0)     # Add a leading batch axis
state = np.stack((obs, obs), axis=1)          # No history yet: the first observation is used twice
done = False
memory = Memory(max_size=memory_size)

# Fill the memory with experiences produced by random actions
for ii in range(pretrain_length):

    # Take a random action and observe the outcome
    action = env.action_space.sample()
    observation_new, reward, done, _ = env.step(action)
    obs_new = np.expand_dims(observation_new, axis=0)
    if done:
        # The episode is over, so the stored next state is all zeros
        next_state = np.zeros(state.shape)
    else:
        # Newest observation first, followed by the previously newest one
        next_state = np.concatenate(
            (np.expand_dims(obs_new, axis=0), state[:, :1, :]), axis=1)

    # Store the experience
    memory.add((state, action, reward, next_state, done))

    if done:
        # Restart the game and rebuild the initial state
        observation = env.reset()
        obs = np.expand_dims(observation, axis=0)
        state = np.stack((obs, obs), axis=1)
    else:
        state = next_state

#### Training model

In [218]:
# Train the network: act epsilon-greedily, store each experience in the
# replay memory, and fit the network on a random minibatch after every step.
pbar = tqdm(total=maxEpisodes)  # progress bar over episodes
for _ in range(maxEpisodes):
    observation = env.reset()                     # Game begins
    obs = np.expand_dims(observation, axis=0)     # Add a leading batch axis
    state = np.stack((obs, obs), axis=1)          # First observation is used twice
    done = False
    # NOTE(review): the exploration rate restarts from `epsilon` at every
    # episode and only decays within it -- confirm this schedule is intended.
    eps = epsilon
    pbar.update(1)
    for step in range(steps):
        # --- Acting ---
        eps = max(eps_min, eps * eps_decay)
        if np.random.rand() <= eps:
            # Explore: random move
            action = np.random.randint(0, env.action_space.n, size=1)[0]
        else:
            # Exploit: move with the highest predicted Q-value
            Q = model.predict(state)
            action = np.argmax(Q)
        observation_new, reward, done, info = env.step(action)
        obs_new = np.expand_dims(observation_new, axis=0)
        if done:
            # The episode is over, so the stored next state is all zeros
            next_state = np.zeros(state.shape)
        else:
            # Newest observation first, followed by the previously newest one
            next_state = np.append(np.expand_dims(obs_new, axis=0), state[:, :1, :], axis=1)

        memory.add((state, action, reward, next_state, done))

        if done:
            # Restart the game and keep going until `steps` is exhausted
            observation = env.reset()
            obs = np.expand_dims(observation, axis=0)
            state = np.stack((obs, obs), axis=1)
        else:
            state = next_state

        # --- Learning ---
        # Sample a minibatch from memory; [0] strips the leading batch axis
        # that every stored state carries.
        batch = memory.sample(mb_size)
        states = np.array([each[0][0] for each in batch])
        actions = np.array([each[1] for each in batch], dtype=int)
        rewards = np.array([each[2] for each in batch])
        next_states = np.array([each[3][0] for each in batch])
        dones = np.array([each[4] for each in batch], dtype=bool)

        # Bellman targets. Start from the network's own predictions so that
        # only the Q-value of the action actually taken is changed; the other
        # actions then contribute zero error to the loss.
        targets = model.predict(states)
        Q_sa = model.predict(next_states)
        # No bootstrapped future value for transitions that ended an episode
        future = np.where(dones, 0.0, gamma * np.max(Q_sa, axis=1))
        targets[np.arange(len(batch)), actions] = rewards + future

        # Train network to output the Q function
        model.train_on_batch(states, targets)
pbar.close()  # closing progress bar
print('Learning Finished')

HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))

Learning Finished


In [231]:
# THIRD STEP: Play!
# Run one game, always choosing the move with the highest predicted Q-value,
# and report the accumulated reward.
env = gym.make('MountainCar-v0')
try:
    observation = env.reset()
    obs = np.expand_dims(observation, axis=0)     # Add a leading batch axis
    state = np.stack((obs, obs), axis=1)          # First observation is used twice
    done = False
    tot_reward = 0.0
    while not done:
        env.render()                    # Comment out to play without rendering
        Q = model.predict(state)
        action = np.argmax(Q)
        observation, reward, done, info = env.step(action)
        obs = np.expand_dims(observation, axis=0)
        # Newest observation first, followed by the previously newest one
        state = np.append(np.expand_dims(obs, axis=0), state[:, :1, :], axis=1)
        tot_reward += reward
    print('Game ended! Total reward: {}'.format(tot_reward))
finally:
    # Close the environment even if prediction or stepping fails
    env.close()



[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Game ended! Total reward: -200.0
