In [1]:
import tensorflow as tf
import gym
import numpy as np

In [8]:
class PolicyAgent:
    """Policy-gradient (REINFORCE-style) agent built on a small dense network.

    The agent records one episode at a time via `store`, then `train` performs
    a single gradient step on that episode using discounted, normalised
    returns as per-step weights for the log-probability of the action taken.

    The network is built in TensorFlow 1.x graph mode in the default graph and
    is evaluated through the agent's own session, `self.sess`.
    """

    def __init__(self, num_acts, num_features, num_units=10, learning_rate=0.01, decay=0.95):
        """Build the network and start a session with freshly initialised variables.

        Args:
            num_acts: Number of discrete actions (size of the output layer).
            num_features: Length of a single flattened observation vector.
            num_units: Width of each of the two hidden layers.
            learning_rate: Learning rate passed to the Adam optimizer.
            decay: Discount factor applied per step when accumulating returns.
        """
        self.n_actions = num_acts
        self.n_features = num_features
        self.learning_rate = learning_rate
        self.decay = decay

        # Per-episode buffers; filled by `store` and cleared by `train`.
        self.ep_obs, self.ep_acts, self.ep_rew = [], [], []

        self._build(num_units)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def _build(self, num_units):
        """Create placeholders, the policy network, the loss and the train op.

        NOTE: the order in which layers are created determines their
        auto-generated variable names, which existing checkpoints rely on.
        Keep it unchanged when editing.
        """
        # Inputs: observations, actions taken, per-step return weights, and a
        # flag that switches dropout on (training) or off (acting).
        self.obs = tf.placeholder(tf.float32, (None, self.n_features))
        self.acts = tf.placeholder(tf.int32, (None,))
        self.rew = tf.placeholder(tf.float32, (None,))
        self.training = tf.placeholder(tf.bool)

        # Two ReLU hidden layers, dropout (library-default rate), linear logits.
        h1 = tf.layers.dense(self.obs, num_units, activation=tf.nn.relu,
                             kernel_initializer=tf.random_normal_initializer())
        h2 = tf.layers.dense(h1, num_units, activation=tf.nn.relu,
                             kernel_initializer=tf.random_normal_initializer())
        dropout = tf.layers.dropout(h2, training=self.training)
        out = tf.layers.dense(dropout, self.n_actions, activation=None,
                              kernel_initializer=tf.random_normal_initializer())

        self.probabilities = tf.nn.softmax(out)

        # Cross-entropy against the action actually taken is -log pi(a|s);
        # weighting it by the return gives the policy-gradient surrogate loss.
        neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=out, labels=self.acts)
        loss = tf.reduce_mean(neg_log_prob * self.rew)

        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)

    def choose_action(self, obs):
        """Sample an action index from the policy for a single observation.

        Args:
            obs: One observation; it is reshaped to `(-1, n_features)`.

        Returns:
            The sampled action index.
        """
        prob_weights = self.sess.run(self.probabilities, feed_dict={
            self.obs: np.array(obs).reshape(-1, self.n_features),
            self.training: False})
        # The softmax comes back as float32; re-normalise in float64 so that
        # rounding error cannot trip np.random.choice's sum-to-one check.
        probs = prob_weights.ravel().astype(np.float64)
        probs /= probs.sum()
        action = np.random.choice(prob_weights.shape[1], p=probs)
        return action

    def store(self, obs, action, reward):
        """Append one (observation, action, reward) step to the episode buffers."""
        self.ep_obs.append(obs)
        self.ep_acts.append(action)
        self.ep_rew.append(reward)

    def process_rewards(self):
        """Turn the buffered rewards into discounted, normalised returns.

        Returns:
            A float64 array with one entry per stored step. Entry `i` is the
            discounted sum of rewards from step `i` onward, shifted to zero
            mean and (when the spread is non-zero) scaled to unit standard
            deviation. An empty buffer yields an empty array.
        """
        # Force a float dtype: integer rewards would otherwise produce an
        # integer buffer that truncates returns and rejects in-place centring.
        rewards = np.asarray(self.ep_rew, dtype=np.float64)
        discounted_rewards = np.zeros_like(rewards)
        if rewards.size == 0:
            return discounted_rewards

        # Accumulate from the last step backwards so each entry includes the
        # decayed sum of everything after it.
        run_sum = 0.0
        for i in reversed(range(rewards.size)):
            run_sum = run_sum * self.decay + rewards[i]
            discounted_rewards[i] = run_sum

        discounted_rewards -= discounted_rewards.mean()
        std = discounted_rewards.std()
        if std != 0:
            discounted_rewards /= std
        return discounted_rewards

    def train(self):
        """Run one policy-gradient update on the buffered episode, then clear it.

        Returns:
            The discounted, normalised returns used as weights for the update
            (an empty array when no steps were stored).
        """
        if not self.ep_rew:
            # An empty batch would make the mean loss NaN and corrupt the
            # weights, so skip the update entirely.
            return np.zeros(0, dtype=np.float64)

        discounted_rewards = self.process_rewards()

        self.sess.run(self.train_op, feed_dict={
            self.obs: np.array(self.ep_obs).reshape(-1, self.n_features),
            self.acts: np.array(self.ep_acts),
            self.rew: np.array(discounted_rewards),
            self.training: True})

        self.ep_obs, self.ep_acts, self.ep_rew = [], [], []

        return discounted_rewards

In [9]:
# Create the environment and size the agent from its action/observation spaces.
env = gym.make('Breakout-ram-v0')
# Alternative environment:
# env = gym.make('CartPole-v1')
agent = PolicyAgent(
    num_acts=env.action_space.n,
    num_features=env.observation_space.shape[0],
    num_units=30,
    learning_rate=0.001,
    decay=0.99,
)

In [10]:
# Upper bound on the number of training episodes; large enough that training
# effectively runs until it is stopped manually.
max_episodes = 10 ** 12
# max_steps = 100000000

In [5]:
# Checkpoint helper for the agent's session. Created after the agent is
# constructed so that the network's variables already exist in the graph.
saver = tf.train.Saver()

In [6]:
# Load previously saved weights into the agent's session.
# NOTE(review): this path differs from the one in the recorded output below
# and from the training loop's save path ('./breakout_models/model.ckpt') —
# confirm which checkpoint directory is intended.
saver.restore(agent.sess,'./.breakout/model.ckpt')

INFO:tensorflow:Restoring parameters from ./breakout_models/model.ckpt


In [9]:
# Manually checkpoint the current weights.
# NOTE(review): the training loop saves to './breakout_models/model.ckpt'
# rather than this directory — confirm which location is intended.
save_path = saver.save(agent.sess, './.breakout/model.ckpt')

In [11]:
# Training loop: play episodes, update the policy after each one, and every
# 100 episodes print the mean raw episode score and checkpoint the model.
avg = []  # raw (undiscounted) episode scores since the last report
for episode_num in range(max_episodes):
    # Cast observations to float before differencing: Atari RAM observations
    # are unsigned bytes (uint8 per the Gym Atari docs), so subtracting the raw
    # arrays wraps modulo 256 instead of producing negative values.
    obs_prev = np.asarray(env.reset(), dtype=np.float32)
    # One random warm-up step so there are two frames to difference.
    obs, reward, done, _ = env.step(env.action_space.sample())
    obs = np.asarray(obs, dtype=np.float32)
    episode_score = reward

    while not done:
        # The network's input is the change between consecutive observations.
        observation = obs_prev - obs

        env.render()
        action = agent.choose_action(observation)
        obs_next, reward, done, _ = env.step(action)
        agent.store(observation, action, reward)
        episode_score += reward
        obs_prev, obs = obs, np.asarray(obs_next, dtype=np.float32)

    if not agent.ep_rew:
        # The episode ended on the warm-up step; nothing was recorded to learn from.
        continue

    agent.train()
    # Track the raw score: the normalised returns that train() returns are
    # mean-centred, so their sum is always ~0 and says nothing about progress.
    avg.append(episode_score)
    if len(avg) == 100:
        mean = np.mean(avg)
        print(mean, end=', ')
        save_path = saver.save(agent.sess, './breakout_models/model.ckpt')
        avg = []

KeyboardInterrupt: 