In [1]:
import numpy as np
import tensorflow as tf

from experience_replay import ReplayBuffer
from deep_q_network import DeepQNetwork

  return f(*args, **kwds)
  return f(*args, **kwds)


In [7]:
class RLAgent:
    """Q-learning agent driven by an agent network and a separate target network.

    Typical usage: construct the agent, call ``set_parameters`` (creates and
    pre-fills the replay buffer, sets the epsilon schedule), then call
    ``train``.
    """

    def __init__(
            self,
            environment,
            num_actions,
            agent_network,
            target_network):
        """Store the environment and both networks on the instance.

        Args:
            environment: object exposing ``reset()`` and ``step(action)``;
                the first three items returned by ``step`` are used as
                (next state, reward, done).
            num_actions: number of discrete actions; random actions are
                drawn from ``range(num_actions)``.
            agent_network: network that is trained and used to act; must
                provide ``vars``, ``get_greedy_action`` and ``train``.
            target_network: network used to build training targets; must
                provide ``vars``, ``get_greedy_action`` and ``get_q_values``.
        """
        self.env = environment
        self.num_actions = num_actions
        self.agent_net = agent_network
        self.target_net = target_network

    def set_parameters(
            self,
            replay_buffer_size=100000,
            replay_initial_size=10000,
            start_epsilon=1.0,
            final_epsilon=0.01,
            annealing_steps=100000,
            discount_factor=0.99,
            max_episode_length=1000):
        """Create the replay buffer, pre-fill it, and set training hyperparameters.

        Args:
            replay_buffer_size: value passed to the ``ReplayBuffer`` constructor.
            replay_initial_size: random-policy transitions are collected until
                ``rep_buffer.stored_in_buffer`` reaches this value.
            start_epsilon: initial exploration probability.
            final_epsilon: lower bound for the exploration probability.
            annealing_steps: number of environment steps over which epsilon
                decreases linearly from ``start_epsilon`` to ``final_epsilon``.
            discount_factor: gamma used when computing training targets.
            max_episode_length: maximum number of steps per training episode.
        """
        # create replay buffer
        self.rep_buffer = ReplayBuffer(replay_buffer_size)
        # fill replay buffer with experience generated by random policy
        s = self.env.reset()
        while self.rep_buffer.stored_in_buffer < replay_initial_size:
            a = np.random.randint(self.num_actions)
            s_, r, done = self.env.step(a)[:3]
            self.rep_buffer.push_transition((s, a, r, s_, done))
            # start a fresh episode once the current one has terminated
            s = self.env.reset() if done else s_
        # define epsilon decay schedule (linear, applied once per training step)
        self.eps = start_epsilon
        self.final_eps = final_epsilon
        self.delta_eps = (start_epsilon - final_epsilon) / annealing_steps
        # set discount factor and maximum length of the episode
        self.gamma = discount_factor
        self.max_ep_length = max_episode_length

    def update_target_net(self):
        """Copy every agent-network variable into the matching target variable.

        Variables are paired by position in ``agent_net.vars`` and
        ``target_net.vars``. Uses ``self.sess``, which is created by ``train``.
        """
        # NOTE(review): new assign ops are built on every call; in TF1 graph
        # mode this presumably grows the graph over time -- confirm and cache
        # the ops if it matters.
        update_ops = [
            v_trgt.assign(v_agnt)
            for v_agnt, v_trgt in zip(self.agent_net.vars, self.target_net.vars)
        ]
        self.sess.run(update_ops)

    def _select_action(self, state):
        """Return an epsilon-greedy action for a single state."""
        if np.random.rand() < self.eps:
            return np.random.randint(self.num_actions)
        # NOTE(review): the network is queried with a one-element batch; whether
        # it returns a scalar or a length-1 array is not visible from here.
        return self.agent_net.get_greedy_action(self.sess, [state])

    def _train_agent_on_batch(self, batch_size):
        """Sample a batch from the replay buffer and run one agent update.

        Returns whatever ``agent_net.train`` returns.
        """
        batch = self.rep_buffer.get_batch(batch_size)
        next_actions = self.target_net.get_greedy_action(self.sess, batch.s_)
        next_q_values = self.target_net.get_q_values(
            self.sess, batch.s_, next_actions)
        # terminal transitions (done == 1) contribute the immediate reward only
        targets = batch.r + self.gamma * next_q_values * (1 - batch.done)
        return self.agent_net.train(self.sess, batch.s, batch.a, targets)

    def train(
            self,
            batch_size,
            agent_update_frequency=4,
            target_update_frequency=5000,
            max_num_episodes=50000):
        """Run the training loop for ``max_num_episodes`` episodes.

        ``set_parameters`` must have been called first: this method reads the
        replay buffer, epsilon schedule, gamma and episode length it sets.
        A new ``tf.Session`` is created and all global variables are
        initialised on every call. Per-episode total rewards are stored in
        ``self.rewards_history``; a summary is printed every 500 episodes.

        Args:
            batch_size: number of transitions requested from the replay
                buffer for each agent update.
            agent_update_frequency: the agent network is updated every this
                many environment steps.
            target_update_frequency: the target network is synchronised with
                the agent network every this many environment steps
                (including step 0).
            max_num_episodes: number of episodes to run.
        """
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        episode_num = 0
        self.rewards_history = []
        global_time_step = 0
        while episode_num < max_num_episodes:

            s = self.env.reset()
            episode_reward = 0
            for _ in range(self.max_ep_length):
                # pick action epsilon-greedily
                a = self._select_action(s)
                # make a step in the agent's own environment (not a global one)
                s_, r, done = self.env.step(a)[:3]
                episode_reward += r
                # save transition into replay buffer
                self.rep_buffer.push_transition((s, a, r, s_, done))

                # update agent weights
                if global_time_step % agent_update_frequency == 0:
                    self._train_agent_on_batch(batch_size)

                # update target weights
                if global_time_step % target_update_frequency == 0:
                    self.update_target_net()

                # decay epsilon
                self.eps = max(self.final_eps, self.eps - self.delta_eps)

                global_time_step += 1

                if done:
                    break
                s = s_

            episode_num += 1
            self.rewards_history.append(episode_reward)
            if episode_num % 500 == 0:
                avg_reward = np.mean(self.rewards_history[-500:])
                print("Average reward over 500 episodes: {}".format(avg_reward))
                print("Epsilon: {}".format(self.eps))
                print("-------------------------")


In [8]:
# Build the Snake environment on a 6x6 grid; `env` is handed to RLAgent below.
from snake import Snake
env = Snake(grid_size=(6, 6))
# Size of the discrete action space, passed to both networks and the agent.
# NOTE(review): the meaning of the three actions is defined in snake.py -- confirm there.
num_actions = 3

In [9]:
# Start from an empty default graph so re-running this cell does not stack
# a second copy of the networks on top of the previous one.
tf.reset_default_graph()
# Agent (online) network: the one that is trained and used to pick actions.
# NOTE(review): state_shape presumably mirrors the 6x6 grid with one channel,
# and each `convs` entry is presumably one conv-layer spec -- confirm against
# deep_q_network.py.
agent_net = DeepQNetwork(
    num_actions,
    state_shape=[6, 6, 1],
    convs=[[16, 2, 1], [32, 1, 1]],
    scope="agent")
# Target network: built with the same arguments under a different scope; its
# variables are overwritten from the agent network by RLAgent.update_target_net.
target_net = DeepQNetwork(
    num_actions,
    state_shape=[6, 6, 1],
    convs=[[16, 2, 1], [32, 1, 1]],
    scope="target")

# Wire the environment and both networks into the agent.
agent = RLAgent(env, num_actions, agent_net, target_net)

In [10]:
# Use the default hyperparameters. This call also creates the replay buffer and
# pre-fills it with 10000 transitions collected under a uniformly random policy.
agent.set_parameters()

In [11]:
# Train with batches of 32 transitions; all other arguments keep their defaults
# (up to 50000 episodes). The run logged below was interrupted manually.
agent.train(32)

Average reward over 500 episodes: -0.864
Epsilon: 0.9711711999999793
-------------------------
Average reward over 500 episodes: -0.874
Epsilon: 0.9409464999999575
-------------------------
Average reward over 500 episodes: -0.83
Epsilon: 0.9074151999999334
-------------------------
Average reward over 500 episodes: -0.83
Epsilon: 0.873586899999909
-------------------------
Average reward over 500 episodes: -0.794
Epsilon: 0.837392499999883
-------------------------
Average reward over 500 episodes: -0.8
Epsilon: 0.7978617999998545
-------------------------
Average reward over 500 episodes: -0.718
Epsilon: 0.7502724999998203
-------------------------
Average reward over 500 episodes: -0.728
Epsilon: 0.7028217999997861
-------------------------
Average reward over 500 episodes: -0.7
Epsilon: 0.6425802999997428
-------------------------
Average reward over 500 episodes: -0.702
Epsilon: 0.5821011999996992
-------------------------
Average reward over 500 episodes: -0.554
Epsilon: 0.514741

KeyboardInterrupt: 