In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

# 基于状态的多臂赌博机

In [None]:
class ContextBandit:
    """Contextual multi-armed bandit backed by a fixed table of arm thresholds.

    Each row of ``bandits`` is one bandit (the state) and each column is one
    arm. Pulling an arm draws a standard-normal sample and pays +1 when the
    sample is greater than that arm's threshold, otherwise -1, so arms with
    lower thresholds pay out more often.
    """

    def __init__(self):
        # Demo thresholds: one row per bandit, one column per arm.
        self.bandits = np.array([
            [0.8, 0, 2.3, -5],
            [0.1, -5, 3.2, 2.2],
            [-5, 3.5, 5, 5],
        ])
        self.bandit_nums, self.action_nums = self.bandits.shape

    def getBandit(self):
        """Select a bandit uniformly at random, store it as ``self.state`` and return its index."""
        self.state = np.random.randint(0, self.bandit_nums)
        return self.state

    def pullArm(self, action):
        """Pull arm ``action`` of the current bandit and return the reward (+1 or -1).

        ``getBandit`` must have been called first so that ``self.state`` exists.
        """
        threshold = self.bandits[self.state, action]
        sample = np.random.randn(1)
        return 1 if sample > threshold else -1

# 基于神经网络的Agent建模

In [None]:
class Agent:
    """Single-layer policy network for the contextual bandit (TF1 graph mode).

    The integer state is one-hot encoded and passed through one dense layer
    with a sigmoid activation and an all-ones kernel initializer, giving one
    weight per action. ``chosen_action`` is the index of the largest weight.

    Args:
        lr: learning rate for the gradient-descent optimizer.
        state_size: number of distinct states (depth of the one-hot encoding).
        action_size: number of actions (width of the dense layer).
    """

    def __init__(self, lr, state_size, action_size):
        # Forward pass: state -> one-hot -> dense(sigmoid) -> flat vector of action weights.
        self.state = tf.placeholder(shape=[1], dtype=tf.int32)
        one_hot_state = tf.one_hot(self.state, state_size)
        action_weights = tf.layers.dense(
            one_hot_state,
            action_size,
            activation=tf.sigmoid,
            kernel_initializer=tf.ones_initializer(),
        )
        self.out = tf.reshape(action_weights, shape=[-1])
        self.chosen_action = tf.argmax(self.out, 0)

        # Training: feed the action that was taken and the reward received.
        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
        # Pick out the single weight belonging to the taken action.
        self.response_weight = tf.slice(self.out, self.action_holder, size=[1])
        # Loss is -log(weight of the taken action) * reward.
        self.loss = -(tf.log(self.response_weight) * self.reward_holder)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)

# 训练学习

In [None]:
# Train the contextual-bandit agent with an epsilon-greedy policy.
tf.reset_default_graph()
cBandit = ContextBandit()
agent = Agent(lr=0.001, state_size=cBandit.bandit_nums, action_size=cBandit.action_nums)
total_episodes = 10000
# Running sum of rewards collected for every (bandit, arm) pair.
total_reward = np.zeros([cBandit.bandit_nums, cBandit.action_nums])
# Probability of taking a random (exploratory) action.
e = 0.1

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        s = cBandit.getBandit()

        # Epsilon-greedy: the draw must be uniform on [0, 1) so that exploration
        # happens with probability e. A standard-normal draw (randn) falls below
        # 0.1 more than half of the time.
        if np.random.rand() < e:
            action = np.random.randint(0, cBandit.action_nums)
        else:
            action = sess.run(agent.chosen_action, feed_dict={agent.state: [s]})

        reward = cBandit.pullArm(action)
        # One gradient step on the (state, action, reward) sample just observed.
        sess.run(
            agent.update,
            feed_dict={
                agent.action_holder: [action],
                agent.reward_holder: [reward],
                agent.state: [s],
            },
        )
        total_reward[s, action] += reward

In [None]:
# For every bandit, print whether the arm with the highest accumulated reward
# is also the arm with the lowest threshold in the bandit table.
for s, thresholds in enumerate(cBandit.bandits):
    learned_best = np.argmax(total_reward[s])
    print (learned_best == np.argmin(thresholds))

In [1]:
# Per-step discount factor used when accumulating future rewards.
gamma = 0.9
def discount(r):
    """Return the discounted cumulative return for every step of a reward sequence.

    The sequence is walked backwards so that
    ``out[t] = r[t] + gamma * out[t + 1]``, with the value past the end taken
    as 0. ``gamma`` is read from the module-level variable.

    Args:
        r: 1-D sequence of per-step rewards (list or NumPy array of numbers).

    Returns:
        A float64 NumPy array with the same length as ``r``.
    """
    # Always work in float64: with an integer input, zeros_like(r) would give an
    # integer buffer and every discounted value would be truncated on store.
    rewards = np.asarray(r, dtype=np.float64)
    discount_r = np.zeros_like(rewards)
    running = 0.0
    for idx in reversed(range(rewards.size)):
        running = running * gamma + rewards[idx]
        discount_r[idx] = running
    return discount_r

In [2]:
class SeqAgent:
    """Two-layer policy-gradient network for episodic tasks (TF1 graph mode).

    Forward pass: states -> dense(relu, hidden_size) -> dense(softmax,
    action_size). The loss is the negative mean of
    ``log(pi(action | state)) * reward`` over the fed batch.

    Gradients are exposed in two halves so that a caller can accumulate them
    across episodes: ``gradients`` computes d(loss)/d(var) for every trainable
    variable, and ``update`` applies whatever is fed into ``gradient_holders``.

    Args:
        lr: learning rate for the Adam optimizer.
        state_size: length of one state vector.
        hidden_size: number of units in the hidden layer.
        action_size: number of discrete actions.
    """

    def __init__(self, lr, state_size, hidden_size, action_size):
        # Forward pass.
        self.state_holder = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
        hidden = tf.layers.dense(self.state_holder, hidden_size, activation=tf.nn.relu)
        # Softmax (not sigmoid): the loss below treats the selected output as
        # the probability of the action, so the outputs of one row must form a
        # normalized distribution over actions.
        self.out = tf.layers.dense(hidden, action_size, activation=tf.nn.softmax)
        self.chosen_action = tf.argmax(self.out, 1)

        # Training inputs: the action taken and the (discounted) reward per step.
        self.action_holder = tf.placeholder(dtype=tf.int32, shape=[None])
        self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[None])
        # Flat index of out[row, action] inside the row-major flattened output.
        indices = tf.range(0, tf.shape(self.out)[0]) * tf.shape(self.out)[1] + self.action_holder
        self.response_weight = tf.gather(tf.reshape(self.out, [-1]), indices)
        self.loss = -tf.reduce_mean(tf.log(self.response_weight) * self.reward_holder)

        # One placeholder per trainable variable, used to feed externally
        # accumulated gradients into the optimizer.
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for ix, var in enumerate(tvars):
            placeholder = tf.placeholder(dtype=tf.float32, name=str(ix) + "holder")
            self.gradient_holders.append(placeholder)

        self.gradients = tf.gradients(self.loss, tvars)

        opt = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = opt.apply_gradients(zip(self.gradient_holders, tvars))

In [3]:
import gym
env = gym.make('CartPole-v0')

tf.reset_default_graph()
agent = SeqAgent(lr=0.001, state_size=4, hidden_size=8, action_size=2)

total_episode = 1000
total_reward = []
total_length = []

# Upper bound on the number of steps taken in one episode.
max_exp = 1000

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)

    # Gradient accumulator: one zero-filled array per trainable variable,
    # with the same shape and dtype as the variable's current value.
    grad_buff = [np.zeros_like(value) for value in sess.run(tf.trainable_variables())]

    for i in range(total_episode):
        s = env.reset()

        exp_history = []
        # Roll out one episode.
        for j in range(max_exp):
            # TODO: pick the action from the network (agent.out / agent.chosen_action)
            # instead of sampling uniformly from the action space.
            a = env.action_space.sample()
            s1, r, done, _ = env.step(a)
            exp_history.append([s, a, r, s1])
            s = s1

            if done:
                # Split the episode into columns with explicit dtypes rather than
                # slicing a ragged object array, so each placeholder receives a
                # properly typed array.
                states, actions, rewards, _next_states = zip(*exp_history)
                feed_dict = {
                    # Replace raw rewards with discounted returns.
                    agent.reward_holder: discount(np.asarray(rewards, dtype=np.float64)),
                    agent.action_holder: np.asarray(actions, dtype=np.int32),
                    agent.state_holder: np.vstack(states),
                }
                grads = sess.run(agent.gradients, feed_dict)
                # Accumulate this episode's gradients into the buffer.
                for ix, grad in enumerate(grads):
                    grad_buff[ix] += grad

                # TODO: periodically feed grad_buff into agent.gradient_holders,
                # run agent.update, and zero the buffer; the accumulated
                # gradients are not applied to the network yet.
                break
                                                                      
                
            

[2017-06-13 09:07:53,806] Making new env: CartPole-v0


NameError: name 'tf' is not defined