In [1]:
from tic_tac_pg import policy_gradients

# Build the policy-gradient model used by the inference check below.
pg_model = policy_gradients(
    lr=0.001,
    seed=42,
    batch_size=30,
    max_iter=5,
)

## 1. Check that we can infer actions: 

In [2]:
import numpy as np
import tensorflow as tf

# Smoke test: sample a few actions from the freshly initialised policy on an
# empty board, to confirm that inference runs end to end.
N_SAMPLES = 10  # number of actions to sample and print

# Empty 3x3 board and its zero-padded 5x5 counterpart. Both are constant for
# the whole check, so they (and the feed dict) are built once, outside the loop.
Z = np.zeros((3,3))
zero_pad_state = np.pad(Z,(1,1),'constant')

feed = {pg_model.X_t:zero_pad_state.reshape((1,5,5,1)),
        pg_model.state:Z.reshape((1,9))}

with tf.Session() as sess:

    sess.run(pg_model.init_g)

    for sample_idx in range(N_SAMPLES):

        ## sample the agent's move for the empty board:
        action = sess.run(pg_model.sample_action,feed_dict=feed)

        print(action)

[[[0. 0. 0. 0. 0. 1. 0. 0. 0.]]]
[[[1. 0. 0. 0. 0. 0. 0. 0. 0.]]]
[[[1. 0. 0. 0. 0. 0. 0. 0. 0.]]]
[[[1. 0. 0. 0. 0. 0. 0. 0. 0.]]]
[[[0. 0. 1. 0. 0. 0. 0. 0. 0.]]]
[[[0. 0. 0. 0. 0. 1. 0. 0. 0.]]]
[[[0. 0. 0. 0. 0. 0. 1. 0. 0.]]]
[[[0. 0. 0. 1. 0. 0. 0. 0. 0.]]]
[[[0. 0. 0. 0. 0. 0. 0. 1. 0.]]]
[[[0. 0. 0. 0. 0. 0. 0. 1. 0.]]]


### We can infer actions. 

## 2. Check that we can load the tic-tac-toe system: 

In [3]:
import numpy as np
import tensorflow as tf
from tic_tac_learning import simulator
from tic_tac_pg import policy_gradients
from clever_stochastician import clever_stochastician as CS
from tic_tac_system import tic_tac_system
from evaluation import game_evaluation as G

# Clear the default graph so the model below is built in a fresh graph rather
# than on top of whatever earlier cells created.
tf.reset_default_graph()

# NOTE: this rebinds `G` from the imported `game_evaluation` class to an
# instance of it.
G = G()

pg_model = policy_gradients(
    lr=0.001,
    seed=42,
    batch_size=30,
    max_iter=5,
)

# Wire the evaluator, the policy-gradient model and the opponent class together.
tic_tac = tic_tac_system(
    G,
    model=pg_model,
    opponent=CS,
    epochs=2000,
    depth=5,
    gamma=0.9,
)


### We can load the tic-tac-toe system. 

## 3. Check that we can initialise the tic-tac-toe system:

In [4]:
 with tf.Session() as sess:
        
    ### initialise the variables:
    sess.run(tic_tac.model.init_g)
    sess.run(tic_tac.model.init_l)

## 4. Check that we can produce a rollout:

In [5]:
def simulator(tic_tac):
    """Produce one set of rollouts from `tic_tac` and print the rewards.

    A new `tf.Session` is opened, the model's `init_g` and `init_l` ops are
    run (presumably the global and local variable initialisers -- confirm in
    `tic_tac_pg`), and `tic_tac.rollouts` is called with that session.

    Parameters
    ----------
    tic_tac : object exposing `model.init_g`, `model.init_l` and
        `rollouts(sess)`; `rollouts` must return a pair.

    Returns
    -------
    None. The second element of the pair returned by `rollouts` is printed;
    the first is discarded.
    """
    with tf.Session() as session:

        # Run the initialisers in the same order as before: init_g, then init_l.
        for init_op in (tic_tac.model.init_g, tic_tac.model.init_l):
            session.run(init_op)

        _rollouts, rewards = tic_tac.rollouts(session)

        print(rewards)
            
  
# Run the rollout check once; this prints the rewards array shown below.
simulator(tic_tac)

[[-5.         -2.23606798 -1.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.  