In [2]:
%pylab inline
import json
import math
import os

import gym
import keras
import matplotlib.pyplot as plt
import numpy as np
import scipy
import theano
from keras.callbacks import ModelCheckpoint
from keras.layers.core import Dense
from keras.models import Sequential
from keras.models import model_from_json
from keras.optimizers import sgd

Populating the interactive namespace from numpy and matplotlib


# Experience replay class for DQN

In [3]:
#EXPERIENCE REPLAY

class ExperienceReplay(object):
    """Bounded replay memory that builds Q-learning training batches.

    Every stored entry has the form
    ``[[state_t, action_t, reward_t, state_tp1], game_over]``.
    ``get_batch`` reads ``state_t.shape[1]``, so states must be 2-D arrays
    with a single row, i.e. shape ``(1, env_dim)``.
    """

    def __init__(self, max_memory=100, discount=.9):
        """Create an empty memory.

        Args:
            max_memory: maximum number of experiences kept; when exceeded,
                the oldest experience is dropped.
            discount: discount factor (gamma) applied to the bootstrapped
                value of the next state.
        """
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def remember(self, states, game_over):
        """Append one experience to the memory.

        Args:
            states: ``[state_t, action_t, reward_t, state_tp1]``.
            game_over: truthy when ``state_tp1`` ended the episode.
        """
        self.memory.append([states, game_over])
        # Drop the oldest experience once the capacity is exceeded.
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, model, batch_size=10, target_model=None):
        """Sample experiences and build the (inputs, targets) training pair.

        Experiences are drawn uniformly at random, with replacement. For each
        sampled experience the target row equals ``model``'s current
        prediction for ``state_t``, except for the entry of the action that
        was taken, which is set to ``reward_t`` for terminal transitions and
        to ``reward_t + discount * max_a Q(state_tp1, a)`` otherwise. Actions
        that were not taken therefore contribute no error.

        Args:
            model: object exposing ``output_shape`` and ``predict``; used for
                the current Q-values of ``state_t``.
            batch_size: maximum number of experiences to sample; the batch is
                shrunk to the memory length when the memory is smaller.
            target_model: optional object exposing ``predict``, used to
                evaluate ``state_tp1``. When ``None`` (the default), ``model``
                is used for that as well.

        Returns:
            ``(inputs, targets)``: float arrays of shape ``(n, env_dim)`` and
            ``(n, num_actions)`` with ``n = min(len(memory), batch_size)``.

        Raises:
            IndexError: if the memory is empty.
        """
        if not self.memory:
            raise IndexError("cannot sample a batch from an empty replay memory")

        bootstrap_model = model if target_model is None else target_model

        len_memory = len(self.memory)
        # Number of actions in the action space.
        num_actions = model.output_shape[-1]
        # Observation size, read from the first stored state_t.
        env_dim = self.memory[0][0][0].shape[1]
        n_samples = min(len_memory, batch_size)

        inputs = np.zeros((n_samples, env_dim))
        targets = np.zeros((n_samples, num_actions))
        if n_samples == 0:
            # Nothing requested: return empty arrays without calling predict.
            return inputs, targets

        next_states = np.zeros((n_samples, env_dim))
        actions = np.zeros(n_samples, dtype=int)
        rewards = np.zeros(n_samples)
        terminal = np.zeros(n_samples, dtype=bool)

        sampled = np.random.randint(0, len_memory, size=n_samples)
        for i, idx in enumerate(sampled):
            state_t, action_t, reward_t, state_tp1 = self.memory[idx][0]
            inputs[i:i+1] = state_t
            next_states[i:i+1] = state_tp1
            actions[i] = action_t
            rewards[i] = reward_t
            terminal[i] = bool(self.memory[idx][1])

        # One batched forward pass per network instead of two per sample.
        targets[:] = model.predict(inputs)
        max_next_q = np.max(bootstrap_model.predict(next_states), axis=1)

        # Only the entry of the action actually taken is overwritten.
        # np.where keeps terminal targets equal to the plain reward even if
        # the bootstrapped value is not finite.
        rows = np.arange(n_samples)
        targets[rows, actions] = np.where(
            terminal, rewards, rewards + self.discount * max_next_q)
        return inputs, targets

# CartPole on OpenAI Gym

In [5]:
# Train a DQN agent on CartPole-v0 with epsilon-greedy exploration and
# experience replay, recording the run with the gym monitor.

# Define the environment
env = gym.make('CartPole-v0')

# PARAMETERS

# Exploration parameter: probability of taking a random action
epsilon = .95

# Multiplicative decay applied to epsilon every `epsilon_decay_period` epochs
decay_rate = 0.85
epsilon_decay_period = 400

# Lower bound for epsilon
epsilon_min = 0.05

# Number of possible actions
num_actions = env.action_space.n

# Number of training epochs: one epoch is one game, it ends when you lose
epoch = 5

# Length of memory
max_memory = 400000

# Number of hidden units per layer
hidden_size = 200

# Size of batch for training
batch_size = 32

# Accumulated reward over an epoch
acc_reward = 0

# Size of an observation vector
observation_shape = env.observation_space.shape[0]

# The target network weights are refreshed every `target_update_period` steps
target_update_period = 50

# Step counter used for the target refresh (reset at the start of each epoch)
C = 0

# Adam optimizer instance (shared by both compile() calls below)
Adam = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0, clipvalue=1)


def build_q_network(input_dim, hidden_size, num_actions, optimizer):
    """Return a compiled MLP that outputs one Q-value per action.

    Three fully connected ReLU layers of `hidden_size` units are followed by
    a linear output layer of `num_actions` units; the loss is mean squared
    error.
    """
    network = Sequential()
    network.add(Dense(hidden_size, input_dim=input_dim, activation='relu'))
    network.add(Dense(hidden_size, activation='relu'))
    network.add(Dense(hidden_size, activation='relu'))
    network.add(Dense(num_actions))
    network.compile(optimizer=optimizer, loss='mean_squared_error')
    return network


# Current (online) network and target network share the same architecture
model = build_q_network(observation_shape, hidden_size, num_actions, Adam)
target_model = build_q_network(observation_shape, hidden_size, num_actions, Adam)

# If you want to continue training from a previous model, uncomment the lines below
#model.load_weights("model_cartpole")
#target_model.load_weights("model_cartpole")

# Initialize experience replay object
exp_replay = ExperienceReplay(max_memory=max_memory)

# Start recording the training run; a video is kept for every 50th episode
env.monitor.start('test', force=True, video_callable=lambda count: count % 50 == 0)

try:
    for e in range(epoch):
        # Per-epoch accumulators
        loss = 0.
        acc_reward = 0
        C = 0

        # Reset the environment and get the first observation as a (1, obs) row
        input_t = env.reset().reshape((1, observation_shape))

        # The game starts, so it is not over yet
        game_over = False

        # Decay epsilon, never going below epsilon_min
        if (e + 1) % epsilon_decay_period == 0:
            epsilon = max(epsilon * decay_rate, epsilon_min)

        while not game_over:
            # The current state becomes the previous state of this transition
            input_tm1 = input_t

            # Choose the next action with an epsilon-greedy policy
            if np.random.rand() <= epsilon:
                # exploration
                action = np.random.randint(0, num_actions, size=1)[0]
            else:
                # exploitation
                q = model.predict(input_tm1)
                action = np.argmax(q[0])

            # Apply the action, get the reward and the new state
            input_t, reward, game_over, _info = env.step(action)
            input_t = input_t.reshape((1, observation_shape))

            acc_reward += reward

            # Store the experience
            exp_replay.remember([input_tm1, action, reward, input_t], game_over)

            # Refresh the target network by cloning the current network's
            # weights through a checkpoint file on disk
            if C % target_update_period == 0:
                model.save_weights("model_cartpole_TARGET", overwrite=True)
                with open("model_cartpole_TARGET.json", "w") as outfile:
                    json.dump(model.to_json(), outfile)
                target_model.load_weights("model_cartpole_TARGET")

            C += 1

            # NOTE(review): the batch is built from `model`; `target_model`
            # is kept in sync above but is not passed to get_batch here.
            inputs, targets = exp_replay.get_batch(model, batch_size=batch_size)

            loss += model.train_on_batch(inputs, targets)

        # Report the last epoch index (epoch - 1) instead of a fixed 999
        print("Epoch {:03d}/{:03d} | Loss {:.4f} | Accumulated reward {:.4f}".format(
            e, epoch - 1, loss, acc_reward))
finally:
    # Always stop the monitor, even when training fails or is interrupted
    env.monitor.close()

[2016-12-19 15:27:26,588] Making new env: CartPole-v0
[2016-12-19 15:27:26,594] Clearing 6 monitor files from previous run (because force=True was provided)
[2016-12-19 15:27:26,852] Starting new video recorder writing to /Users/Syzygy/Desktop/MVA/Reinforcement_Learning/Projet/DQN/Deep_Q_Network/Deep_Q_Network/test/openaigym.video.0.4650.video000000.mp4


Epoch 000/999 | Loss 12.2688 | Accumulated reward 23.0000
Epoch 001/999 | Loss 11.9030 | Accumulated reward 22.0000
Epoch 002/999 | Loss 23.0123 | Accumulated reward 44.0000
Epoch 003/999 | Loss 5.6196 | Accumulated reward 10.0000


[2016-12-19 15:27:31,252] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/Syzygy/Desktop/MVA/Reinforcement_Learning/Projet/DQN/Deep_Q_Network/Deep_Q_Network/test')


Epoch 004/999 | Loss 11.4141 | Accumulated reward 17.0000


In [52]:
# Persist the trained network for the playing cell: the weights go to
# "model_cartpole" and the architecture to "model_cartpole.json".
model.save_weights("model_cartpole", overwrite=True)
with open("model_cartpole.json", "w") as architecture_file:
    # to_json() already returns a string; json.dump wraps it once more, and
    # the playing cell undoes that with json.load before model_from_json.
    architecture_json = model.to_json()
    json.dump(architecture_json, architecture_file)

In [59]:
# PLAYING PART
# Reload the saved network and let it play greedily while the monitor records.

# Number of games to play
n_episodes = 1000

with open("model_cartpole.json", "r") as jfile:
    model = model_from_json(json.load(jfile))
model.load_weights("model_cartpole")
model.compile("sgd", "mse")

# Define the environment
env = gym.make('CartPole-v0')

# Observation size taken from the environment instead of a hard-coded 4
observation_shape = env.observation_space.shape[0]

# Start recording. This is the same 'test' directory the training cell uses,
# and force=True clears the monitor files already present there.
env.monitor.start('test', force=True)

try:
    for e in range(n_episodes):
        # Reset the environment and get the first observation as a (1, obs) row
        input_t = env.reset().reshape((1, observation_shape))

        # The game starts, so it is not over yet
        game_over = False

        while not game_over:
            env.render()

            # Greedy action: the one with the largest predicted Q-value
            q = model.predict(input_t)
            action = np.argmax(q[0])

            # Apply the action, get the reward and the new state
            input_t, reward, game_over, _info = env.step(action)
            input_t = input_t.reshape((1, observation_shape))
finally:
    # Always stop the monitor, even when playing fails or is interrupted
    env.monitor.close()

[2016-12-01 17:59:02,346] Making new env: CartPole-v0
[2016-12-01 17:59:02,353] Starting new video recorder writing to /Users/Syzygy/Desktop/MVA/Reinforcement Learning/DQN/test/openaigym.video.9.49841.video000000.mp4
[2016-12-01 17:59:04,292] Starting new video recorder writing to /Users/Syzygy/Desktop/MVA/Reinforcement Learning/DQN/test/openaigym.video.9.49841.video000001.mp4
[2016-12-01 17:59:07,156] Starting new video recorder writing to /Users/Syzygy/Desktop/MVA/Reinforcement Learning/DQN/test/openaigym.video.9.49841.video000008.mp4
[2016-12-01 17:59:12,061] Starting new video recorder writing to /Users/Syzygy/Desktop/MVA/Reinforcement Learning/DQN/test/openaigym.video.9.49841.video000027.mp4
[2016-12-01 17:59:20,295] Starting new video recorder writing to /Users/Syzygy/Desktop/MVA/Reinforcement Learning/DQN/test/openaigym.video.9.49841.video000064.mp4
[2016-12-01 17:59:39,596] Starting new video recorder writing to /Users/Syzygy/Desktop/MVA/Reinforcement Learning/DQN/test/openaigy

In [7]:
#UPLOADING RESULTS

# The API key is read from the OPENAI_GYM_API_KEY environment variable so it
# is never stored in the notebook; a missing variable raises KeyError.
# Replace the placeholder path with the monitor output directory.
gym.upload('/.../...', api_key=os.environ['OPENAI_GYM_API_KEY'])

[2016-12-02 10:11:25,857] [CartPole-v0] Uploading 150 episodes of training data
[2016-12-02 10:11:28,027] [CartPole-v0] Uploading videos of 6 training episodes (43709 bytes)
[2016-12-02 10:11:29,263] [CartPole-v0] Creating evaluation object from /Users/Syzygy/Desktop/MVA/Reinforcement Learning/DQN/test with learning curve and training video
[2016-12-02 10:11:29,937] 
****************************************************
You successfully uploaded your evaluation on CartPole-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_gWrrMzaKS0GYcrRAUYsQWQ

****************************************************
