In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

# 状态多臂赌博机
引入状态变量：状态表示当前面对的是哪一台赌博机，不同赌博机的各个摇臂对应不同的回报阈值。

In [None]:
class ContextBandit:
    """Contextual multi-armed bandit backed by a fixed threshold table.

    Each row of ``bandits`` is one bandit (the state) and each column is one
    arm (the action). Pulling an arm compares a standard-normal draw with the
    table entry for the current state and the chosen arm.
    """

    def __init__(self):
        # Hard-coded test table: rows are bandits, columns are arms.
        self.bandits = np.array([[0.8,0,2.3,-5],[0.1,-5,3.2,2.2],[-5,3.5,5,5]])
        self.bandit_nums, self.action_nums = self.bandits.shape

    def getBandit(self):
        """Pick a bandit uniformly at random, store it as the current state and return its index."""
        self.state = np.random.randint(0, self.bandit_nums)
        return self.state

    def pullArm(self, action):
        """Pull arm ``action`` of the current bandit.

        Returns 1 when a standard-normal draw is greater than the table entry
        for (current state, action), and -1 otherwise.
        """
        threshold = self.bandits[self.state, action]
        draw = np.random.randn(1)
        return 1 if draw > threshold else -1

# 状态的Agent建模
首先对有限的状态输入做 one-hot 编码；全连接层的权重初始化为 1，每个输出节点对应一个行动（action）；损失函数针对所选行动对应的输出进行优化。

In [4]:
class StateAgent:
    """Single-layer policy for the contextual bandit (TensorFlow 1.x graph).

    Args:
        lr: learning rate passed to the gradient-descent optimizer.
        state_size: depth of the one-hot state encoding.
        action_size: number of output units, one per action.
    """

    def __init__(self, lr, state_size, action_size):
        # Forward pass: one-hot encode the state index and map it through a
        # sigmoid dense layer whose kernel starts at all ones.
        self.state = tf.placeholder(shape=[1], dtype=tf.int32)
        one_hot_state = tf.one_hot(self.state, state_size)
        action_scores = tf.layers.dense(
            one_hot_state,
            action_size,
            activation=tf.sigmoid,
            kernel_initializer=tf.ones_initializer())
        self.out = tf.reshape(action_scores, shape=[-1])
        # Recommended action: index of the largest output.
        self.chosen_action = tf.argmax(self.out, 0)

        # Training: take the output of the action that was played and scale
        # its negative log by the received reward.
        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
        self.response_weight = tf.slice(self.out, self.action_holder, size=[1])
        self.loss = -(tf.log(self.response_weight) * self.reward_holder)
        sgd = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = sgd.minimize(self.loss)

# 训练学习

In [None]:
# Train the StateAgent on the contextual bandit with an epsilon-greedy policy.
tf.reset_default_graph()
cBandit = ContextBandit()
agent = StateAgent(lr=0.001, state_size=cBandit.bandit_nums, action_size=cBandit.action_nums)
total_episodes = 10000
# Running sum of rewards for every (bandit, arm) pair.
total_reward = np.zeros([cBandit.bandit_nums, cBandit.action_nums])
# Probability of taking a random (exploratory) action.
e = 0.1

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        s = cBandit.getBandit()

        # Epsilon-greedy choice. The draw must be uniform on [0, 1) so that the
        # random branch fires with probability e; a standard-normal draw
        # (np.random.randn) is below 0.1 roughly 54% of the time.
        if np.random.rand(1) < e:
            action = np.random.randint(0, cBandit.action_nums)
        else:
            action = sess.run(agent.chosen_action, feed_dict={agent.state: [s]})

        reward = cBandit.pullArm(action)
        sess.run(agent.update, feed_dict={agent.action_holder: [action], agent.reward_holder: [reward], agent.state: [s]})
        total_reward[s, action] += reward

In [None]:
# For every bandit, report whether the arm with the highest accumulated reward
# is also the arm with the smallest entry in the bandit table.
for s in range(cBandit.bandit_nums):
    learned_arm = np.argmax(total_reward[s])
    lowest_threshold_arm = np.argmin(cBandit.bandits[s])
    print (learned_arm == lowest_threshold_arm)

# 打折函数
根据回报序列，从最后一步向前递推，计算每一步的累积折扣回报（折扣因子为 gamma）。

In [134]:
gamma = 0.99
def discount_rewards(r):
    """Compute the discounted cumulative reward for every step of a sequence.

    Walks the rewards from the last step to the first and applies
    ``out[t] = r[t] + gamma * out[t + 1]`` using the module-level ``gamma``.

    Args:
        r: 1-D sequence of per-step rewards (list or array; int, float or
            object dtype holding numbers).

    Returns:
        A float64 array of the same length as ``r``. An empty input yields an
        empty array.
    """
    # Accumulate in float64 regardless of the input dtype: allocating the
    # result with the input's dtype would truncate the discounted values for
    # integer rewards.
    rewards = np.asarray(r, dtype=np.float64)
    discount_r = np.zeros(rewards.shape, dtype=np.float64)
    add_term = 0.0
    for idx in reversed(range(rewards.size)):
        add_term = add_term * gamma + rewards[idx]
        discount_r[idx] = add_term
    return discount_r

# 序列Agent建模
输入和输出变量多添加一个时间维度，行为策略输出相比之前需要做相应的改变。网络中加入隐层以提高建模能力。

In [208]:
class SeqAgent:
    """Two-layer softmax policy network trained on whole episodes (TensorFlow 1.x graph).

    Args:
        lr: learning rate for both Adam optimizers.
        state_size: number of features in one observation.
        hidden_size: number of units in the hidden ReLU layer.
        action_size: number of actions (softmax outputs).

    Attributes exposed to the training loop: ``state_holder``, ``out``,
    ``chosen_action``, ``action_holder``, ``reward_holder``, ``loss``,
    ``update2`` (direct Adam step on the loss), ``gradients``,
    ``gradient_holders`` and ``update`` (Adam step on externally fed gradients).
    """

    def __init__(self, lr, state_size, hidden_size, action_size):
        # Policy network: states -> hidden ReLU layer -> softmax over actions.
        # The layers are built without a bias term. tf.layers.dense needs
        # use_bias=False for that; bias_initializer=None still creates a bias
        # variable (unlike slim's biases_initializer=None).
        self.state_holder = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
        hidden = tf.layers.dense(self.state_holder, hidden_size, activation=tf.nn.relu, use_bias=False, kernel_initializer=tf.contrib.layers.xavier_initializer())
        self.out = tf.layers.dense(hidden, action_size, activation=tf.nn.softmax, use_bias=False, kernel_initializer=tf.contrib.layers.xavier_initializer())
        self.chosen_action = tf.argmax(self.out, 1)

        self.action_holder = tf.placeholder(dtype=tf.int32, shape=[None])
        self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[None])
        # Flatten the [batch, action] output and pick, for every row, the
        # probability of the action that was taken: row * n_actions + action.
        indices = tf.range(0, tf.shape(self.out)[0])*tf.shape(self.out)[1] + self.action_holder
        self.response_weight = tf.gather(tf.reshape(self.out, [-1]), indices)
        self.loss = -tf.reduce_mean(tf.log(self.response_weight)*self.reward_holder)

        # Direct update: one Adam step on the loss.
        optimize = tf.train.AdamOptimizer(learning_rate=lr)
        self.update2 = optimize.minimize(self.loss)

        # Batched policy-gradient update: gradients are computed per episode,
        # accumulated outside the graph and fed back through one placeholder
        # per trainable variable.
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for ix,var in enumerate(tvars):
            placeholder = tf.placeholder(dtype=tf.float32, name=str(ix)+"holder")
            self.gradient_holders.append(placeholder)

        self.gradients = tf.gradients(self.loss, tvars)

        opt = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = opt.apply_gradients(zip(self.gradient_holders,tvars))

In [266]:
import gym
# Train SeqAgent on CartPole-v0, updating the network once per finished episode.
env = gym.make('CartPole-v0')

tf.reset_default_graph()
agent = SeqAgent(lr=0.01, state_size=4, hidden_size=8, action_size=2)

total_episode = 5000
total_reward = []
total_length = []

# Upper bound on the number of steps simulated per episode.
max_exp = 2000

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)

    for i in range(total_episode):
        s = env.reset()

        reward = 0
        exp_history = []
        # Roll out one episode.
        for j in range(max_exp):
            # Sample the action from the policy's softmax output instead of
            # always taking the argmax: the log-probability loss assumes the
            # actions were drawn from the policy, and greedy selection never
            # explores.
            a_dist = sess.run(agent.out, feed_dict={agent.state_holder: [s]})
            a = np.random.choice(a_dist.shape[1], p=a_dist[0])
            s1, r, done, _ = env.step(a)
            # Store the action as a scalar so the action column can be fed to
            # the 1-D int32 placeholder.
            exp_history.append([s, a, r, s1])
            s = s1
            reward += r

            if done:
                # Episode finished: turn the history into per-step training data.
                # The rows mix arrays and scalars, so an object array is
                # requested explicitly.
                exp_history = np.array(exp_history, dtype=object)
                # Replace the immediate rewards with discounted returns.
                exp_history[:,2] = discount_rewards(exp_history[:,2])
                feed_dict = {agent.reward_holder: exp_history[:,2],
                            agent.action_holder: exp_history[:,1],
                            agent.state_holder: np.vstack(exp_history[:,0])}

                # Per-episode update; the accumulated-gradient path
                # (agent.gradients / agent.update) is not used here.
                sess.run(agent.update2, feed_dict)

                total_reward.append(reward)
                total_length.append(j)
                # Mean reward over the last 100 episodes and the final step index.
                print("%s %s" % (np.mean(total_reward[-100:]), j))
                break
                
                                                                      
                
            

[2017-06-15 08:57:14,308] Making new env: CartPole-v0


# 对比

In [267]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

# Python 2/3 compatibility: keep the built-in xrange where it exists and fall
# back to range where it does not. Only the NameError raised by the missing
# built-in is caught, so unrelated errors are not swallowed.
try:
    xrange = xrange
except NameError:
    xrange = range
# Create the CartPole-v0 environment for the comparison run.
env = gym.make('CartPole-v0')

[2017-06-15 09:08:02,929] Making new env: CartPole-v0


In [268]:
gamma = 0.99

def discount_rewards(r):
    """Take a 1-D sequence of rewards and compute the discounted reward.

    Walks the rewards from the last step to the first and applies
    ``out[t] = r[t] + gamma * out[t + 1]`` using the module-level ``gamma``.

    Args:
        r: 1-D sequence of per-step rewards (list or array; int, float or
            object dtype holding numbers).

    Returns:
        A float64 array of the same length as ``r``. An empty input yields an
        empty array.
    """
    # Store the result as float64 regardless of the input dtype: allocating it
    # with the input's dtype would truncate the values for integer rewards.
    rewards = np.asarray(r, dtype=np.float64)
    discounted_r = np.zeros(rewards.shape, dtype=np.float64)
    running_add = 0.0
    # range is used directly so the function does not depend on an xrange
    # alias being defined elsewhere.
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

In [269]:
class agent():
    """Reference policy-gradient agent built with slim (TensorFlow 1.x graph).

    Args:
        lr: learning rate for the Adam optimizer.
        s_size: number of features in one observation.
        a_size: number of actions (softmax outputs).
        h_size: number of units in the hidden ReLU layer.
    """

    def __init__(self, lr, s_size,a_size,h_size):
        # Feed-forward part: the agent takes a state and produces a softmax
        # distribution over actions. Both layers are built without biases.
        self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        hidden_layer = slim.fully_connected(
            self.state_in, h_size,
            biases_initializer=None, activation_fn=tf.nn.relu)
        self.output = slim.fully_connected(
            hidden_layer, a_size,
            activation_fn=tf.nn.softmax, biases_initializer=None)
        self.chosen_action = tf.argmax(self.output, 1)

        # Training procedure: the rewards and the actions that were taken are
        # fed in to compute the loss.
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)

        # Index into the flattened output: row * number_of_actions + action.
        n_rows = tf.shape(self.output)[0]
        n_actions = tf.shape(self.output)[1]
        self.indexes = tf.range(0, n_rows) * n_actions + self.action_holder
        flat_output = tf.reshape(self.output, [-1])
        self.responsible_outputs = tf.gather(flat_output, self.indexes)

        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.reward_holder)

        # One placeholder per trainable variable, through which externally
        # accumulated gradients are fed to the optimizer.
        trainable = tf.trainable_variables()
        self.gradient_holders = [
            tf.placeholder(tf.float32, name=str(idx) + '_holder')
            for idx, _ in enumerate(trainable)
        ]

        self.gradients = tf.gradients(self.loss, trainable)

        adam = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = adam.apply_gradients(zip(self.gradient_holders, trainable))

In [270]:
tf.reset_default_graph() #Clear the Tensorflow graph.

myAgent = agent(lr=1e-2,s_size=4,a_size=2,h_size=8) #Load the agent.

total_episodes = 5000 #Set total number of episodes to train agent on.
max_ep = 999 #Upper bound on the number of steps simulated per episode.
update_frequency = 5 #Apply the accumulated gradients every this many episodes.

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    total_reward = []
    total_lenght = []

    # Gradient accumulators: fetch the trainable variables to get arrays of
    # the right shapes, then zero them.
    gradBuffer = sess.run(tf.trainable_variables())
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0

    while i < total_episodes:
        s = env.reset()
        running_reward = 0
        ep_history = []
        for j in range(max_ep):
            #Probabilistically pick an action given our network outputs.
            #The action index is sampled directly; sampling a probability
            #value and searching for it afterwards always resolves to the
            #first index when two probabilities are equal.
            a_dist = sess.run(myAgent.output,feed_dict={myAgent.state_in:[s]})
            a = np.random.choice(a_dist.shape[1],p=a_dist[0])

            s1,r,d,_ = env.step(a) #Advance the environment by one step.
            ep_history.append([s,a,r,s1])
            s = s1
            running_reward += r
            if d == True:
                #Update the network.
                #The rows mix arrays and scalars, so an object array is
                #requested explicitly.
                ep_history = np.array(ep_history, dtype=object)
                ep_history[:,2] = discount_rewards(ep_history[:,2])
                feed_dict={myAgent.reward_holder:ep_history[:,2],
                        myAgent.action_holder:ep_history[:,1],myAgent.state_in:np.vstack(ep_history[:,0])}
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                for idx,grad in enumerate(grads):
                    gradBuffer[idx] += grad

                if i % update_frequency == 0 and i != 0:
                    #Apply the accumulated gradients, then reset the buffer.
                    feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                    _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    for ix,grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad * 0

                total_reward.append(running_reward)
                total_lenght.append(j)
                break

        #Update our running tally of scores.
        if i % 100 == 0:
            print(np.mean(total_reward[-100:]))
        i += 1

13.0
23.17
25.32
31.41
42.54
63.52
111.03
154.08
174.01
175.05
188.69


KeyboardInterrupt: 