Net 完成以下功能:
    >  p, v = f_theta(state),其中 p 和 v 均为 TensorFlow 操作(op),对应 inference()
    >  var_value = g_theta(var_op, feed_state),得到 var_op 的值,对应 get_var_value()
    >  
    >  update()  更新 Net 中的 theta 参数
        >  loss, optimizer = train(*params),结果保存在 self.loss 和 self.optimizer 中
    >  save() 保存模型参数
    >  restore() 恢复模型参数

In [2]:
import datetime

import numpy as np
import tensorflow as tf

In [3]:
class Net:
    """Policy/value network over a ``size`` x ``size`` board (TensorFlow 1.x graph mode).

    The constructor builds the graph for a single board placeholder ``self.s``
    and exposes two ops:

    * ``self.p`` -- softmax output with ``size * size`` entries (policy head).
    * ``self.v`` -- single tanh output (value head).

    It also opens a session, initializes the variables and creates a saver for
    the variables that are trainable at construction time.

    NOTE(review): every ``tf.layers.batch_normalization`` call in this class is
    made without a ``training`` argument, so the library default applies.
    """

    def __init__(self, size=3, model_path="./Model_save/model_net_demo.ckpt"):
        """Build the graph, the saver and the session.

        Args:
            size: Side length of the square board fed through ``self.s``.
            model_path: Checkpoint path used by ``save()`` and ``restore()``.
        """
        self.s = tf.placeholder(dtype=tf.float32, shape=(size, size))
        self.shape = self.s.shape
        self.n_actions = size * size  # TODO: open question from the author: "+1?"
        self.model_path = model_path
        self.p, self.v = self.inference(self.s)
        # Only the variables that exist at this point are handled by the saver.
        self.train_vars = tf.trainable_variables()
        self.saver = tf.train.Saver(self.train_vars)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def save(self):
        """Write the saver's variables to ``self.model_path`` and log it."""
        self.saver.save(self.sess, save_path=self.model_path)
        print(str(datetime.datetime.now()) +
              ": latest model and params saved to path:", self.model_path)

    def restore(self):
        """Load the saver's variables from ``self.model_path`` and log it."""
        self.saver.restore(self.sess, save_path=self.model_path)
        print(str(datetime.datetime.now()) +
              ": last model and params restored from path", self.model_path)

    def inference(self, inputs, filters=4, fc_units_v=32, num_res_module=4):
        """Build input module -> residual modules -> output heads for ``inputs``.

        NOTE(review): the ``tf.layers`` calls below are not wrapped in a
        variable scope with reuse, so each call to this method presumably adds
        a fresh, independent set of layers to the graph -- confirm before
        relying on weight sharing between calls.

        Args:
            inputs: Board tensor (or array) with ``self.shape`` elements layout.
            filters: Number of convolution filters in the input/residual modules.
            fc_units_v: Width of the hidden dense layer in the value head.
            num_res_module: Number of residual modules stacked after the input.

        Returns:
            Tuple ``(outputs_p, outputs_v)`` of policy and value ops.
        """
        fc_units_p = self.n_actions

        mid_values = self.input_module(inputs, filters)
        for _ in range(num_res_module):
            mid_values = self.residual_module(mid_values, filters)

        outputs_p, outputs_v = self.output_module(mid_values, fc_units_v, fc_units_p)
        return outputs_p, outputs_v

    def _initialize_new_variables(self):
        """Initialize only the variables that hold no value in the session yet."""
        global_vars = tf.global_variables()
        ready_flags = self.sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        pending = [var for var, ready in zip(global_vars, ready_flags) if not ready]
        if pending:
            self.sess.run(tf.variables_initializer(pending))

    def update(self, v_op_list, p_op_list, z_value, pi_value_list, c=0.01, num_iters=20):
        """Build a loss from the given ops/targets, train on it, then save.

        The loss is the sum of ``(v - z_value)^2`` over ``v_op_list``, plus
        ``-log(p) . pi^T`` over the ``(p, pi)`` pairs, plus an L2 penalty (scale
        ``c``) on all trainable variables. The loss tensor and the Adam training
        op are stored on ``self.loss`` and ``self.optimizer``.

        Args:
            v_op_list: Value ops to regress toward ``z_value``.
            p_op_list: Policy ops, paired positionally with ``pi_value_list``.
            z_value: Target subtracted from every value op.
            pi_value_list: Target policies; each is transposed and multiplied
                with ``log(p)`` via ``tf.matmul`` (assumes the same 2-D shape
                as the matching ``p`` -- TODO confirm).
            c: Scale of the L2 regularizer.
            num_iters: Number of times the training op is run.
        """
        # A constant is enough as the accumulator seed; a tf.Variable here would
        # add one more (non-trainable) variable to the graph on every call.
        loss = tf.constant(0., dtype=tf.float32)
        for v in v_op_list:
            loss = tf.add(loss, tf.square(v - z_value))

        for p, pi in zip(p_op_list, pi_value_list):
            # Clip before the log so a probability that underflows to 0 cannot
            # turn the loss into -inf/NaN.
            safe_p = tf.clip_by_value(p, 1e-10, 1.0)
            loss = tf.add(loss, -tf.matmul(tf.log(safe_p), tf.transpose(pi)))

        var = tf.trainable_variables()
        regularizer = tf.contrib.layers.l2_regularizer(scale=c)
        l2_var = tf.contrib.layers.apply_regularization(regularizer, var)

        loss = tf.add(loss, l2_var)

        self.loss = loss
        self.optimizer = tf.train.AdamOptimizer().minimize(self.loss)

        # Only initialize what is new (optimizer slots, newly built layers);
        # re-running the global initializer would wipe the current weights
        # whenever the restore below fails.
        self._initialize_new_variables()
        try:
            self.restore()
        except Exception as e:
            # Best effort: a missing/incompatible checkpoint must not stop
            # training. ``e.message`` does not exist on Python 3 exceptions.
            print(e)

        for _ in range(num_iters):
            self.sess.run(self.optimizer)

        self.save()

    def get_var_value(self, var, feed_dict):
        """Evaluate ``var`` in the session with ``feed_dict`` and return the result."""
        return self.sess.run(var, feed_dict)

    def input_module(self, inputs, filters=256):
        """Reshape the board to a 1-image batch, then conv(3x3) -> BN -> ReLU."""
        # Batch size is hard-coded to 1 here (original note: to be improved later).
        outputs = tf.reshape(inputs, [1, self.shape[0], self.shape[1], -1])
        outputs = tf.layers.conv2d(outputs, filters=filters, kernel_size=3, padding='same')
        outputs = tf.layers.batch_normalization(outputs)
        outputs = tf.nn.relu(outputs)
        return outputs

    def residual_module(self, inputs, filters=256):
        """Two conv(3x3)+BN stages with a skip connection from ``inputs``.

        ``filters`` must match the channel count of ``inputs`` for the addition
        to be valid.
        """
        outputs = tf.layers.conv2d(inputs, filters=filters, kernel_size=3, padding='same')
        outputs = tf.layers.batch_normalization(outputs)
        outputs = tf.nn.relu(outputs)

        outputs = tf.layers.conv2d(outputs, filters=filters, kernel_size=3, padding='same')
        outputs = tf.layers.batch_normalization(outputs)

        # Skip connection, followed by the final non-linearity.
        outputs += inputs
        outputs = tf.nn.relu(outputs)
        return outputs

    def output_module(self, inputs, fc_units_v=256, fc_units_p=19*19+1):
        """Build the value head and the policy head on top of ``inputs``.

        Args:
            inputs: Feature map produced by the residual stack.
            fc_units_v: Width of the hidden dense layer in the value head.
            fc_units_p: Number of softmax outputs of the policy head.

        Returns:
            Tuple ``(outputs_p, outputs_v)``.
        """
        # Value head: 1x1 conv (1 filter) -> BN -> ReLU -> dense -> tanh scalar.
        outputs_v = tf.layers.conv2d(inputs, filters=1, kernel_size=1, padding='same')
        outputs_v = tf.layers.batch_normalization(outputs_v)
        outputs_v = tf.nn.relu(outputs_v)
        outputs_v = tf.layers.flatten(outputs_v)
        outputs_v = tf.layers.dense(outputs_v, fc_units_v, activation=tf.nn.relu)
        outputs_v = tf.layers.dense(outputs_v, 1, activation=tf.nn.tanh)

        # Policy head: 1x1 conv (2 filters) -> BN -> ReLU -> dense softmax.
        outputs_p = tf.layers.conv2d(inputs, filters=2, kernel_size=1, padding='same')
        outputs_p = tf.layers.batch_normalization(outputs_p)
        outputs_p = tf.nn.relu(outputs_p)
        outputs_p = tf.layers.flatten(outputs_p)
        outputs_p = tf.layers.dense(outputs_p, fc_units_p, activation=tf.nn.softmax)
        return outputs_p, outputs_v
    
    

In [8]:
# Build the network with the constructor defaults (3x3 board, default checkpoint path).
net = Net()

In [9]:
# Evaluate the policy op for a 3x3 board filled with 2s, fed through the placeholder net.s.
net.get_var_value(net.p,{net.s:2*np.ones((3,3),dtype=np.float32)})

array([[0.13194595, 0.12891872, 0.08943845, 0.1054638 , 0.10735194,
        0.08488117, 0.12324006, 0.0879622 , 0.14079775]], dtype=float32)

In [None]:
# Three constant 3x3 float32 boards: all zeros, all ones, all twos.
board_zeros = np.zeros((3, 3), dtype=np.float32)
board_ones = np.ones((3, 3), dtype=np.float32)
board_twos = 2 * np.ones((3, 3), dtype=np.float32)
boards = (board_zeros, board_ones, board_twos)

# Policy/value ops built directly on each constant board.
# NOTE(review): each net.inference call appears to build a new set of layers
# rather than reuse the ones behind net.p / net.v -- confirm this is intended.
p1, v1 = net.inference(board_zeros)
p2, v2 = net.inference(board_ones)
p3, v3 = net.inference(board_twos)

# Current outputs of the constructor-built network (net.v / net.p) per board.
v1_value, v2_value, v3_value = (
    net.get_var_value(net.v, {net.s: board}) for board in boards
)
p1_value, p2_value, p3_value = (
    net.get_var_value(net.p, {net.s: board}) for board in boards
)

# Shared value target for all three boards.
z_value = -1.

v_op_list = [v1, v2, v3]
p_op_list = [p1, p2, p3]

# Policy targets are the network's own current policy outputs.
pi_value_list = [p1_value, p2_value, p3_value]

In [None]:
# Display the policy op that was built for the all-ones board.
p2

In [None]:
# Build the loss from the op lists and targets defined earlier, train, and save the checkpoint.
net.update(v_op_list,p_op_list,z_value,pi_value_list)

In [None]:
# Evaluate the loss tensor stored on net.loss by update(); no feed_dict is passed.
net.get_var_value(net.loss,None)