In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

# 状态多臂赌博机
引入状态变量，即哪一台赌博机

In [2]:
class ContextBandit:
    def __init__(self):
        # 测试
        self.bandits = np.array([[0.8,0,2.3,-5],[0.1,-5,3.2,2.2],[-5,3.5,5,5]])
        self.bandit_nums = self.bandits.shape[0]
        self.action_nums = self.bandits.shape[1]
    
    def getBandit(self):
        self.state = np.random.randint(0, self.bandit_nums)
        return self.state
    
    def pullArm(self, action):
        bandit = self.bandits[self.state, action]
        ret = np.random.randn(1)
        if ret > bandit:
            reward = 1
        else:
            reward = -1
        return reward

# 状态的Agent建模
首先对有限状态输入做one hot编码，输出节点（action）权重系数设置为1，代表选择的行动，损失函数即优化该权重

In [3]:
class StateAgent:
    def __init__(self, lr, state_size, action_size):
        # 根据训练函数获得推荐动作
        self.state = tf.placeholder(dtype=tf.int32, shape=[1])
        state_oh = tf.one_hot(self.state, state_size)
        net = tf.layers.dense(state_oh, action_size, activation=tf.sigmoid, kernel_initializer=tf.ones_initializer())
        self.out = tf.reshape(net, shape=[-1])
        self.chosen_action = tf.argmax(self.out, 0)
        
        # 进行训练优化
        self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[1])
        self.action_holder = tf.placeholder(dtype=tf.int32, shape=[1])
        self.response_weight = tf.slice(self.out, self.action_holder, size=[1])
        self.loss = -(tf.log(self.response_weight)*self.reward_holder)
        optimize = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = optimize.minimize(self.loss)

# 训练学习

In [4]:
tf.reset_default_graph()
cBandit = ContextBandit()
agent = StateAgent(lr=0.001, state_size=cBandit.bandit_nums, action_size=cBandit.action_nums)
total_episodes = 10000
total_reward = np.zeros([cBandit.bandit_nums, cBandit.action_nums])
e = 0.1

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        s = cBandit.getBandit()
        
        if np.random.randn(1)<e:
            action = np.random.randint(0, cBandit.action_nums)
        else:
            action = sess.run(agent.chosen_action, feed_dict={agent.state:[s]})
        
        reward = cBandit.pullArm(action)
        sess.run(agent.update, feed_dict={agent.action_holder:[action], agent.reward_holder:[reward], agent.state:[s]})
        total_reward[s, action] += reward

In [5]:
for s in range(cBandit.bandit_nums):
    print (np.argmax(total_reward[s]) == np.argmin(cBandit.bandits[s]))

True
True
True


# 打折函数
从回报序列中得到回退前N步的累积打折回报

In [6]:
gamma = 0.99
def discount_rewards(r):
    discount_r = np.zeros_like(r)
    add_term = 0
    for idx in reversed(range(r.size)):
        discount_r[idx] = add_term*gamma + r[idx]
        add_term = discount_r[idx]
    return discount_r

# 序列Agent建模
输入和输出变量多添加一个时间维度，行为策略输出相比之前需要做相应的改变。网络中加入隐层提高建模能力，

In [7]:
class SeqAgent:
    def __init__(self, lr, state_size, hidden_size, action_size):
        self.state_holder = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
        hidden = tf.layers.dense(self.state_holder, hidden_size, activation=tf.nn.relu)
        self.out = tf.layers.dense(hidden, action_size, activation=tf.nn.softmax)
        self.chosen_action = tf.argmax(self.out, 1)
        
        self.action_holder = tf.placeholder(dtype=tf.int32, shape=[None])
        self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[None])
        indices = tf.range(0, tf.shape(self.out)[0])*tf.shape(self.out)[1] + self.action_holder
        self.response_weight = tf.gather(tf.reshape(self.out, [-1]), indices)
        self.loss = -tf.reduce_mean(tf.log(self.response_weight)*self.reward_holder)
        
        optimize = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimize.minimize(self.loss)

In [8]:
import gym
env = gym.make('CartPole-v0')

tf.reset_default_graph()
agent = SeqAgent(lr=0.01, state_size=4, hidden_size=8, action_size=2)

total_episode = 1000
total_reward = []
total_length = []

max_exp = 999

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    
    for i in range(total_episode):
        s = env.reset()
        
        reward = 0
        exp_history = []
        # 运行过程
        for j in range(max_exp):
            # 从网络输出中选择可能的最佳行为，在利用网络和随机之间的权衡
            a_dist = sess.run(agent.out, feed_dict={agent.state_holder: [s]})
            a = np.random.choice(a_dist[0],p=a_dist[0])
            a = np.argmax(a_dist == a)
            s1, r, done, _ = env.step(a)
            exp_history.append([s,a,r,s1])
            s = s1
            reward += r
            
            if done==True:
                # 仿真结束，获取序列数据，更新网络
                exp_history = np.array(exp_history)
                # reward
                exp_history[:,2] = discount_rewards(exp_history[:,2])
                feed_dict = {agent.reward_holder: exp_history[:,2],
                            agent.action_holder: exp_history[:,1],
                            agent.state_holder: np.vstack(exp_history[:,0])}
                sess.run(agent.update, feed_dict)

                total_reward.append(reward)
                total_length.append(j)
                if (i%100) == 0:
                    print np.mean(total_reward[-100:]), np.mean(total_length[-100:])
                break 

[2017-06-28 22:11:31,556] Making new env: CartPole-v0
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


20.0 19.0
47.54 46.54
114.52 113.52
165.14 164.14
190.56 189.56
119.41 118.41
30.83 29.83
106.41 105.41
163.45 162.45
195.84 194.84
