GoDemo
    >  Net 深度神经网络
    >  MCTS 蒙特卡洛树搜索
    >  Simulator 环境模拟器
    >  Model 自动化的、流水线的、能自检修复的深度强化学习的模型

Net完成以下功能
    >  p, v = f_theta(state), p and v 为操作
    >  var_value = g_theta(var_op, feed_state) 得到 var_op 的值
    >  loss, optimizer = train(*params) 
    >  update()  更新Net中的theta参数
    >  save() 保存模型参数
    >  restore() 恢复模型参数

MCTS完成以下功能
    >  pi = policy(net,state)
    >  action = choice(pi)
    >  tree = rollout(state)

Simulator完成以下功能
    >  ndarray_inputs = transform(state)
    >  next_state = move(state,action)
    >  is_done_bool = is_done(state)
    >  result = is_win(state)
    >  render()
    >  save_record()
    >  reload_record()

Model 完成以下功能
    >  start()
    >  auto_play()
    >  emergency_help()
    >  end()

In [1]:
import numpy as np
import tensorflow as tf

In [11]:
class Net:
    def __init__(self,size=2,model_path="./Model_save/model_net.ckpt"):
        self.s = tf.placeholder(dtype=tf.float32,shape=(size,size))
        self.shape=(size,size)
        self.model_path = model_path
        self.p, self.v = self.inference(self.s)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        
    def get_var_value(self,var,feed_state):
        return self.sess.run(var,{self.s:feed_state})
    
#     def __call__(self,data):
#         self.reset(data)
#         return self
        
#     def reset(self,data):
#         self.s = tf.constant(data,dtype=tf.float32)
#         self.shape = data.shape
#         self.p, self.v = self.inference(self.s)
        
    def input_module(self,inputs,filters=256):
        outputs = tf.reshape(inputs,[1,self.shape[0],self.shape[1],-1]) # 后续改进
        outputs = tf.layers.conv2d(outputs,filters=filters,kernel_size=3,padding='same')
        outputs = tf.layers.batch_normalization(outputs)
        outputs = tf.nn.relu(outputs)
        return outputs
    
    def residual_module(self,inputs,filters=256):
        outputs = tf.layers.conv2d(inputs,filters=filters,kernel_size=3,padding='same')
        outputs = tf.layers.batch_normalization(outputs)
        outputs = tf.nn.relu(outputs)

        outputs = tf.layers.conv2d(outputs,filters=filters,kernel_size=3,padding='same')
        outputs = tf.layers.batch_normalization(outputs)

        outputs += inputs
        outputs = tf.nn.relu(outputs)  
        return outputs
    
    def output_module(self,inputs,fc_units_v=256,fc_units_p=19*19+1):
        outputs_v = tf.layers.conv2d(inputs,filters=1,kernel_size=1,padding='same')
        outputs_v = tf.layers.batch_normalization(outputs_v)
        outputs_v = tf.nn.relu(outputs_v)

        outputs_v = tf.layers.flatten(outputs_v)

        outputs_v = tf.layers.dense(outputs_v,fc_units_v,activation=tf.nn.relu)

        outputs_v = tf.layers.dense(outputs_v,1,activation=tf.nn.tanh)
        
        outputs_p = tf.layers.conv2d(inputs,filters=2,kernel_size=1,padding='same')
        outputs_p = tf.layers.batch_normalization(outputs_p)
        outputs_p = tf.nn.relu(outputs_p)
        
        outputs_p = tf.layers.flatten(outputs_p)
        
        outputs_p = tf.layers.dense(outputs_p,fc_units_p,activation=tf.nn.softmax)
        return outputs_p, outputs_v
    
    def inference(self,inputs,filters=4,fc_units_v=32,fc_units_p=4,num_res_module=4):
        inputs = self.input_module(inputs,filters)
        
        mid_values = inputs
        
        for i_module in range(num_res_module):
            mid_values = self.residual_module(mid_values,filters)
            
        outputs_p, outputs_v = self.output_module(mid_values,fc_units_v,fc_units_p)
        
        self.train_vars = tf.trainable_variables()
        self.saver = tf.train.Saver(self.train_vars)

        return outputs_p, outputs_v
    
#     def policy(self,state):
#         action_probs = dict()
#         value = None
        
#         p, v = self.inference(state)
#         with tf.Session() as sess:
#             sess.run(tf.global_variables_initializer())
#             p_, value = sess.run([p,v])
#             print(p_)
            
    def pre_train(self,v_list,p_list,z,mcts_list,c=0.0001):
        loss = tf.Variable(0.,trainable=False,dtype=tf.float32)
        print("1",loss)
        for v in v_list:
            loss = tf.add(loss,tf.square(v-z)) 

        for p,m in zip(p_list,mcts_list):
            loss = tf.add(loss,-tf.matmul(tf.log(p),m)) 

        var = tf.trainable_variables()
#         print(len(var))
        regularizer = tf.contrib.layers.l2_regularizer(scale=c)
        l2_var = tf.contrib.layers.apply_regularization(regularizer,var)
        
        loss = tf.add(loss, l2_var)
        
        print("2",loss)
        

        self.loss = loss
        self.optimizer = tf.train.AdamOptimizer().minimize(self.loss)


In [12]:
net = Net()

In [13]:
v_list = [net.v,net.v,net.v,net.v]
p_list = [net.p,net.p,net.p]
z = tf.constant(1.0,dtype=tf.float32) 
const_p = tf.constant([0.2,0.1,0.6,0.1],dtype=tf.float32)
const_p = tf.reshape(const_p,shape=[-1,1])
mcts_list = [const_p,const_p,const_p]

In [14]:
net.pre_train(v_list,p_list,z,mcts_list,c=0.0001)

1 <tf.Variable 'Variable:0' shape=() dtype=float32_ref>
2 Tensor("Add_7:0", shape=(1, 1), dtype=float32)


In [8]:
net.get_var_value(net.p,np.zeros((2,2),dtype=np.float32))

array([[0.25, 0.25, 0.25, 0.25]], dtype=float32)