The OpenAI Gym (https://gym.openai.com) provides us with a lot of different examples and games in which to train a learning agent. The task is to develop one of such agents. We will create a neural network that, given the state of the game (actually, two consecutive states), it outputs a family of quality values (Q-values) for each next possible move. The move with higher Q-value is chosen and performed in the game. This theoretical formalism was taken from https://www.nervanasys.com/demystifying-deep-reinforcement-learning/

In [1]:
# INITIALIZATION: libraries, parameters, network...

from keras.models import Sequential      # One layer after the other
from keras.layers import Dense, Flatten  # Dense layers are fully connected layers, Flatten layers flatten out multidimensional inputs
from keras.optimizers import adam
from collections import deque            # For storing moves 
from tqdm import tqdm_notebook as tqdm
import numpy as np
import time
import gym                                # To train our network
env = gym.make('MountainCar-v0')          # Choose game (any in the gym should work)

import random     # For sampling batches from the observations


# Create network. Input is two consecutive game states, output is Q-values of the possible moves.
model = Sequential()
model.add(Dense(24, input_shape=(2,) + env.observation_space.shape, init='uniform', activation='relu'))
#model.add(Flatten())       # Flatten input so as to have no problems with processing
model.add(Dense(48, init='uniform', activation='relu'))
model.add(Dense(24, init='uniform', activation='relu'))
model.add(Dense(env.action_space.n, init='uniform', activation='linear'))    # Same number of outputs as possible actions

model.compile(loss='mse', optimizer=adam(lr=0.005), metrics=['accuracy'])
model.summary()

# Parameters
maxEpisodes = 1000                               
memory_size=100000                         # Register where the actions will be stored
steps = 500                            # Number of timesteps we will be acting on the game 
epsilon = 1.0                              # Probability of doing a random move
eps_min = 0.01
eps_decay = 0.995
gamma = 0.9                                # Discounted future reward. How much we care about steps further in time
mb_size = 64                               # Learning minibatch size
pretrain_length=1000                     # training steps done for pre-populating memory
#observeSteps = 10                           #number of times to observe before learning

Using TensorFlow backend.


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 2, 24)             72        
_________________________________________________________________
dense_2 (Dense)              (None, 2, 48)             1200      
_________________________________________________________________
dense_3 (Dense)              (None, 2, 24)             1176      
_________________________________________________________________
dense_4 (Dense)              (None, 2, 3)              75        
Total params: 2,523
Trainable params: 2,523
Non-trainable params: 0
_________________________________________________________________




#### Creating Circular memory class

In [10]:
from collections import deque

class Memory():
    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
            
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), 
                               size=batch_size, 
                               replace=False)
        return [self.buffer[ii] for ii in idx]

#### Pre-populating memory

In [11]:
# Initialize the simulation
observation = env.reset()                     # Game begins
obs = np.expand_dims(observation, axis=0)     # (Formatting issues) Making the observation the first element of a batch of inputs 
state = np.stack((obs, obs), axis=1)
done = False
memory = Memory(max_size=memory_size)

# Make a bunch of random actions and store the experiences
for ii in range(pretrain_length):

    # Make a random action
    action = env.action_space.sample()
    observation_new, reward, done, _ = env.step(action)
    obs_new = np.expand_dims(observation_new, axis=0)          # (Formatting issues)
    next_state = np.append(np.expand_dims(obs_new, axis=0), state[:, :1, :], axis=1) 
    if done:
        # The simulation fails so no next state
        next_state = np.zeros(state.shape)
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        observation=env.reset()           # Restart game if it's finished
        obs = np.expand_dims(observation, axis=0)     # (Formatting issues) Making the observation the first element of a batch of inputs 
        state = np.stack((obs, obs), axis=1)
    else:
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        state = next_state

#### Trainning model

In [13]:
pbar = tqdm(total = maxEpisodes)#progress bar setup
for _ in range(maxEpisodes):    
    observation = env.reset()                     # Game begins
    obs = np.expand_dims(observation, axis=0)     # (Formatting issues) Making the observation the first element of a batch of inputs 
    state = np.stack((obs, obs), axis=1)
    done = False
    eps=epsilon
    pbar.update(1)#updating progress bar
    ###pbar1 = tqdm(total = observetime)#progress bar setup
    for step in range(steps):
        ###pbar1.update(1)#updating progress bar
        #observing
        eps*=eps_decay
        eps=max(eps_min,eps)
        if np.random.rand() <= eps:
            action = np.random.randint(0, env.action_space.n, size=1)[0]
        else:
            Q = model.predict(state)          # Q-values predictions
            action = np.argmax(Q)             # Move with highest Q-value is the chosen one
        observation_new, reward, done, info = env.step(action)     # See state of the game, reward... after performing the action
        obs_new = np.expand_dims(observation_new, axis=0)          # (Formatting issues)
        next_state = np.append(np.expand_dims(obs_new, axis=0), state[:, :1, :], axis=1)     # Update the input with the new state of the game
        if done:
            # The simulation fails so no next state
            next_state = np.zeros(state.shape)
            # Add experience to memory
            memory.add((state, action, reward, next_state, done))

            observation=env.reset()           # Restart game if it's finished
            obs = np.expand_dims(observation, axis=0)     # (Formatting issues) Making the observation the first element of a batch of inputs 
            state = np.stack((obs, obs), axis=1)
        else:
            # Add experience to memory
            memory.add((state, action, reward, next_state, done))
            state = next_state
        # learning
        # Sample mini-batch from memory
        batch = memory.sample(mb_size)
        states = np.array([each[0][0] for each in batch])
        actions = np.array([each[1] for each in batch])
        rewards = np.array([each[2] for each in batch])
        next_states = np.array([each[3][0] for each in batch])
        dones = np.array([each[4] for each in batch])
        # Build Bellman equation for the Q function
        #targets = np.array([model.predict(state) for state in states ])
        targets = model.predict(states)
        Q_sa = model.predict(next_states)
        for i in range(len(targets)):
            if dones[i]:
                targets[i] = rewards[i]
            else:
                targets[i] = rewards[i]+gamma*np.max(Q_sa[i])
        # Train network to output the Q function
        model.train_on_batch(states, targets)
    ###pbar1.close()
pbar.close()#closing progress bar
print('Learning Finished')

KeyboardInterrupt: 

In [14]:
# THIRD STEP: Play!
env = gym.make('MountainCar-v0') 
observation = env.reset()
obs = np.expand_dims(observation, axis=0)
state = np.stack((obs, obs), axis=1)
done = False
tot_reward = 0.0
while not done:
    #env.render()                    # Uncomment to see game running
    Q = model.predict(state)        
    action = np.argmax(Q)         
    observation, reward, done, info = env.step(action)
    obs = np.expand_dims(observation, axis=0)
    state = np.append(np.expand_dims(obs, axis=0), state[:, :1, :], axis=1)    
    tot_reward += reward
print('Game ended! Total reward: {}'.format(tot_reward))
env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Game ended! Total reward: -200.0


In [2]:
import gym
import numpy as np
import random
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

from collections import deque

class DQN:
    def __init__(self, env):
        self.env     = env
        self.memory  = deque(maxlen=2000)
        
        self.gamma = 0.85
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.005
        self.tau = .125

        self.model        = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        model   = Sequential()
        state_shape  = self.env.observation_space.shape
        model.add(Dense(24, input_dim=state_shape[0], activation="relu"))
        model.add(Dense(48, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(self.env.action_space.n))
        model.compile(loss="mean_squared_error",
            optimizer=Adam(lr=self.learning_rate))
        return model

    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        batch_size = 32
        if len(self.memory) < batch_size: 
            return

        samples = random.sample(self.memory, batch_size)
        for sample in samples:
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state)
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state)[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(state, target, epochs=1, verbose=0)

    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)

    def save_model(self, fn):
        self.model.save(fn)

def main():
    env     = gym.make("MountainCar-v0")
    gamma   = 0.9
    epsilon = .95

    trials  = 1000
    trial_len = 500

    # updateTargetNetwork = 1000
    dqn_agent = DQN(env=env)
    steps = []
    for trial in range(trials):
        cur_state = env.reset().reshape(1,2)
        tot_reward = 0
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            new_state, reward, done, _ = env.step(action)

            # reward = reward if not done else -20
            new_state = new_state.reshape(1,2)
            dqn_agent.remember(cur_state, action, reward, new_state, done)
            
            dqn_agent.replay()       # internally iterates default (prediction) model
            dqn_agent.target_train() # iterates target model
            tot_reward += reward
            cur_state = new_state
            if done:
                break
        if tot_reward <= -199:
            print("Failed to complete in trial {}".format(trial))
            if step % 10 == 0:
                dqn_agent.save_model("trial-{}.model".format(trial))
        else:
            print("Completed in {} trials".format(trial))
            dqn_agent.save_model("success.model")
            break

In [3]:
if __name__ == "__main__":
    main()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Failed to complete in trial 0
Failed to complete in trial 1
Failed to complete in trial 2
Failed to complete in trial 3
Failed to complete in trial 4
Failed to complete in trial 5
Failed to complete in trial 6
Failed to complete in trial 7
Failed to complete in trial 8
Failed to complete in trial 9
Failed to complete in trial 10
Failed to complete in trial 11
Failed to complete in trial 12
Failed to complete in trial 13
Failed to complete in trial 14
Failed to complete in trial 15
Failed to complete in trial 16
Failed to complete in trial 17
Failed to complete in trial 18
Failed to complete in trial 19
Failed to complete in trial 20
Failed to complete in trial 21
Failed to complete in trial 22
Failed to complete in trial 23
Failed to complete in trial 24
Failed to complete in trial 25
Failed to complete in trial 26
Failed to complete in trial 27
Failed to complete in trial 28
Fai

In [13]:
from keras.models import load_model
env = gym.make('MountainCar-v0') 
state = env.reset().reshape(1,2)
model2 = load_model('success.model')
done = False
tot_reward = 0.0
while not done:
    #env.render()                    # Uncomment to see game running
    Q = model2.predict(state)        
    action = np.argmax(Q)         
    state, reward, done, info = env.step(action)
    state = state.reshape(1,2)  
    tot_reward += reward
print('Game ended! Total reward: {}'.format(tot_reward))
env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Game ended! Total reward: -200.0
