In [1]:
import tensorflow as tf
import gym
import numpy as np

In [2]:
class PolicyAgent:
    """Policy-gradient agent with a two-hidden-layer dense softmax policy.

    The agent buffers one episode of (observation, action, reward) triples
    via `store`, and `train` performs a single Adam step on the mean of
    ``-log pi(action | observation) * normalized_discounted_return``.

    Args:
        num_acts: number of discrete actions (width of the output layer).
        num_features: length of a single flattened observation.
        num_units: width of each of the two hidden tanh layers.
        learning_rate: learning rate handed to the Adam optimizer.
        decay: per-step discount factor applied to future rewards.
    """

    def __init__(self, num_acts, num_features, num_units=10, learning_rate=0.01, decay=0.95):
        self.n_actions = num_acts
        self.n_features = num_features
        self.learning_rate = learning_rate
        self.decay = decay

        # Per-episode buffers; filled by `store`, emptied by `train`.
        self.ep_obs, self.ep_acts, self.ep_rew = [], [], []

        # The graph must exist before the initializer op is created and run.
        self._build(num_units)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def _build(self, num_units):
        """Create placeholders, the policy network, the loss and the train op."""
        # Input Info
        self.obs = tf.placeholder(tf.float32, (None, self.n_features))
        self.acts = tf.placeholder(tf.int32, (None,))
        self.rew = tf.placeholder(tf.float32, (None,))

        # Model Layers
        h1 = tf.layers.dense(self.obs, num_units, activation=tf.nn.tanh,
                             kernel_initializer=tf.random_normal_initializer())
        h2 = tf.layers.dense(h1, num_units, activation=tf.nn.tanh,
                             kernel_initializer=tf.random_normal_initializer())
        logits = tf.layers.dense(h2, self.n_actions, activation=None,
                                 kernel_initializer=tf.random_normal_initializer())

        # Action distribution used for sampling in `choose_action`.
        self.probabilities = tf.nn.softmax(logits)

        # The cross-entropy op applies softmax internally, so it must receive
        # the raw logits; feeding it `self.probabilities` would softmax twice
        # and the result would no longer be -log pi(a|s).
        neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.acts)
        loss = tf.reduce_mean(neg_log_prob * self.rew)

        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)

    def choose_action(self, obs):
        """Sample an action index from the policy's distribution for one observation.

        `obs` is reshaped to (-1, n_features); the flattened probabilities are
        used as sampling weights, so a single observation is expected.
        """
        prob_weights = self.sess.run(
            self.probabilities,
            feed_dict={self.obs: np.array(obs).reshape(-1, self.n_features)})
        action = np.random.choice(prob_weights.shape[1], p=prob_weights.ravel())
        return action

    def store(self, obs, action, reward):
        """Append one transition to the current episode's buffers."""
        self.ep_obs.append(obs)
        self.ep_acts.append(action)
        self.ep_rew.append(reward)

    def process_rewards(self):
        """Return the discounted, standardized returns for the buffered episode.

        Each entry is the decay-discounted sum of rewards from that step to
        the end of the episode. The mean is subtracted and, when the standard
        deviation is non-zero, the result is divided by it.

        Returns:
            float64 numpy array with one entry per stored step (empty when no
            steps are stored).
        """
        # Force a float dtype: with integer rewards, zeros_like on the raw list
        # would produce an int array, truncating returns and making the
        # in-place normalization below fail.
        rewards = np.asarray(self.ep_rew, dtype=np.float64)
        discounted_rewards = np.zeros_like(rewards)
        if rewards.size == 0:
            # Nothing to normalize; avoids mean/std of an empty array.
            return discounted_rewards

        run_sum = 0.0
        for i in reversed(range(rewards.size)):
            run_sum = run_sum * self.decay + rewards[i]
            discounted_rewards[i] = run_sum

        discounted_rewards -= discounted_rewards.mean()
        std = discounted_rewards.std()
        if std != 0:
            discounted_rewards /= std
        return discounted_rewards

    def train(self):
        """Run one optimizer step on the buffered episode, then clear the buffers.

        Returns:
            The normalized discounted returns used as per-step weights; its
            length equals the number of steps in the episode.
        """
        discounted_rewards = self.process_rewards()

        if len(self.ep_obs) == 0:
            # An empty batch would give a NaN mean loss; skip the update.
            return discounted_rewards

        self.sess.run(self.train_op, feed_dict={
            self.obs: np.array(self.ep_obs).reshape(-1, self.n_features),
            self.acts: np.array(self.ep_acts),
            self.rew: np.array(discounted_rewards),
        })

        self.ep_obs, self.ep_acts, self.ep_rew = [], [], []

        return discounted_rewards

In [3]:
# Create the CartPole environment and size the agent's input/output layers
# from the environment's observation and action spaces.
env = gym.make('CartPole-v1')
agent = PolicyAgent(
    num_acts=env.action_space.n,
    num_features=env.observation_space.shape[0],
    num_units=30,
    learning_rate=0.0001,
    decay=0.99,
)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [4]:
# Checkpoint helper, constructed with default arguments; the cells below use it
# to restore into and save from agent.sess at './cart_models/model.ckpt'.
saver = tf.train.Saver()

In [5]:
# Load previously saved parameters from the checkpoint into the agent's session.
# NOTE(review): presumably this raises when the checkpoint does not exist yet
# (e.g. the very first run) — confirm, and skip this cell in that case.
saver.restore(agent.sess,'./cart_models/model.ckpt')

INFO:tensorflow:Restoring parameters from ./cart_models/model.ckpt


In [8]:
# Write the session's current parameters to the checkpoint path and keep the
# value returned by saver.save in save_path.
# NOTE(review): when cells run top to bottom this executes right after the
# restore, so it re-saves what was just loaded; it looks like a manual
# "save now" cell — confirm whether it belongs after the training loop.
save_path = saver.save(agent.sess, './cart_models/model.ckpt')

In [6]:
# Upper bound on the number of training episodes (10^12).
max_episodes = 10 ** 12
#max_steps = 100000000

In [None]:
# Training loop: play one episode, update the policy once from it, and every
# 1000 episodes print the mean episode length and write a checkpoint.
avg = []
for episode_num in range(max_episodes):
    obs_prev = env.reset()

    # Roll out a single episode, buffering each transition in the agent.
    # (Call env.render() at the top of this loop to watch the agent play.)
    while True:
        action = agent.choose_action(obs_prev)
        obs, reward, done, _ = env.step(action)
        agent.store(obs_prev, action, reward)
        if done:
            break
        obs_prev = obs

    # One returned weight per step, so its length is the episode length.
    reward_list = agent.train()
    avg.append(len(reward_list))
    if len(avg) != 1000:
        continue

    # A full window of 1000 episodes: report, reset the window, checkpoint.
    mean = np.mean(avg)
    print(mean, end=', ')
    avg = []
    save_path = saver.save(agent.sess, './cart_models/model.ckpt')

471.084, 463.221, 472.047, 479.964, 478.497, 473.127, 475.979, 473.948, 459.202, 474.59, 480.84, 473.335, 472.37, 481.456, 471.217, 476.657, 473.651, 468.482, 479.783, 477.35, 470.839, 470.046, 476.89, 478.128, 475.543, 475.005, 476.844, 480.17, 475.758, 464.278, 472.1, 471.247, 458.989, 467.127, 469.829, 466.555, 471.182, 474.007, 463.562, 474.723, 474.725, 478.3, 476.114, 474.222, 468.155, 465.88, 468.904, 474.245, 476.65, 478.465, 465.469, 468.902, 473.315, 468.092, 469.177, 463.575, 474.976, 475.831, 468.76, 468.734, 471.056, 475.028, 469.857, 469.509, 468.93, 467.985, 468.784, 471.849, 462.678, 465.925, 465.072, 462.802, 467.008, 468.273, 468.598, 467.77, 460.865, 465.588, 467.497, 471.979, 474.678, 464.586, 473.796, 459.602, 460.055, 469.174, 468.553, 469.489, 472.462, 473.702, 462.946, 474.98, 481.765, 480.228, 479.385, 477.874, 477.398, 464.908, 466.715, 471.328, 477.469, 473.031, 476.471, 474.126, 472.917, 480.76, 475.531, 471.402, 464.657, 462.896, 466.984, 467.595, 471.535, 