In [1]:
from naiveAI import AgentNaive, NMnaive
import tensorflow as tf
import numpy as np
from copy import deepcopy
from buffer import PrioritizedReplayBuffer


In [2]:
# TensorFlow session for this notebook; it is passed to NMnaive when the
# network object is constructed in a later cell.
sess = tf.InteractiveSession()

In [3]:
class EnvMahjong:
    """
    An example mahjong environment for an agent to interact with.

    This is a placeholder: every state is a random 0/1 array of shape
    [1, 34, 4, 1] (float32) and the score is always 0. Its purpose is to
    exercise the agent/environment interface, not to model the game.
    """
    def __init__(self, max_turns=100):
        """
        param: max_turns is the turn count from which step() reports done=1.
            Defaults to 100, the value that used to be hard-coded in step().
        """
        self.max_turns = max_turns
        # Also initialised here (not only in reset()) so that calling step()
        # on a freshly constructed environment does not raise AttributeError.
        self.turn = 0

    def reset(self):
        """
        Start a new game: rewind the turn counter and return a random
        initial state of shape [1, 34, 4, 1] (float32, entries 0. or 1.).
        """
        self.turn = 0
        init_state = np.random.randint(low=0, high=2, size=[1, 34, 4, 1]).astype(np.float32)
        return init_state

    def step(self, action):
        """
        Advance the game by one turn.

        param: action is an action generated by the agent (agent.select()),
            action=None means there is no available action, just go to next state.
            This placeholder environment does not read `action`.
        return: (next_state, score, done, info)
            next_state: random array of shape [1, 34, 4, 1] (float32)
            score: always 0. here
            done: 1 if the turn counter had reached max_turns when step() was
                called, else 0
            info: {'turn': <turn counter value before this step's increment>}
        """
        next_state = np.random.randint(low=0, high=2, size=[1, 34, 4, 1]).astype(np.float32)
        # If next_state is a winning hand (hu), score should be that hand's
        # score; otherwise 0. The placeholder never produces a win.
        score = 0.

        # done=1 means this game terminates. The check runs before the
        # increment below, so the terminating step reports turn == max_turns.
        done = 1 if self.turn >= self.max_turns else 0

        info = {'turn': self.turn}  # other information not included in state

        self.turn += 1

        return next_state, score, done, info

    def get_aval_actions(self):
        """
        Return the candidate next states for the current turn as a random
        array of shape [N, 34, 4, 1] (float32), with N drawn from 1..12.

        None is the documented "no available actions" value, but see the
        note below: it is never returned with the current sampling range.
        """
        # randint's upper bound is exclusive, so N is in [1, 12].
        N = np.random.randint(low=1, high=13)
        if N == 0:  # no available actions
            # NOTE(review): unreachable while low=1. Kept to document the
            # intended contract; confirm agent.select() accepts None before
            # lowering `low` to 0.
            next_aval_states = None
        else:
            next_aval_states = np.random.randint(low=0, high=2, size=[N, 34, 4, 1]).astype(np.float32)

        return next_aval_states
    
    

In [4]:
# Sizes handed to the replay buffer. 34 * 4 equals the number of entries in
# one [1, 34, 4, 1] state produced by EnvMahjong.
# NOTE(review): presumably the buffer stores flattened states -- confirm
# against PrioritizedReplayBuffer.
state_dim = 34 * 4
action_dim = 34

# Network wrapper built on the notebook's TensorFlow session.
nn = NMnaive(sess)

# Placeholder environment and the replay memory the agent writes into.
env = EnvMahjong()
memory = PrioritizedReplayBuffer(
    state_dim=state_dim,
    action_dim=action_dim,
)

# The agent is constructed from the network and the replay memory.
agent = AgentNaive(nn, memory)

In [5]:
n_games = 2
max_steps_per_game = 10000  # safety cap in case the environment never reports done

for n in range(n_games):
    # Each game starts from a freshly reset environment.
    this_state = env.reset()
    done = 0

    # `step` counts completed interactions, starting at 1 for the first one.
    for step in range(1, max_steps_per_game + 1):
        # Ask the environment for the candidate next states, then let the
        # agent choose among them.
        next_aval_states = env.get_aval_actions()
        action, policy = agent.select(next_aval_states)

        next_state, score, done, info = env.step(action)

        # Hand the transition to the agent, then call its learning step.
        agent.remember(
            this_state, action, next_state, score, done,
            next_aval_states, policy,
        )
        agent.learn()

        # Carry an independent copy of the new state into the next turn.
        this_state = deepcopy(next_state)

        print("Game {}, step {}".format(n, step))
        print(info)

        # The environment signalled the end of this game.
        if done:
            break
        
        

Game 0, step 1
{'turn': 0}
Game 0, step 2
{'turn': 1}
Game 0, step 3
{'turn': 2}
Game 0, step 4
{'turn': 3}
Game 0, step 5
{'turn': 4}
Game 0, step 6
{'turn': 5}
Game 0, step 7
{'turn': 6}
Game 0, step 8
{'turn': 7}
Game 0, step 9
{'turn': 8}
Game 0, step 10
{'turn': 9}
Game 0, step 11
{'turn': 10}
Game 0, step 12
{'turn': 11}
Game 0, step 13
{'turn': 12}
Game 0, step 14
{'turn': 13}
Game 0, step 15
{'turn': 14}
Game 0, step 16
{'turn': 15}
Game 0, step 17
{'turn': 16}
Game 0, step 18
{'turn': 17}
Game 0, step 19
{'turn': 18}
Game 0, step 20
{'turn': 19}
Game 0, step 21
{'turn': 20}
Game 0, step 22
{'turn': 21}
Game 0, step 23
{'turn': 22}
Game 0, step 24
{'turn': 23}
Game 0, step 25
{'turn': 24}
Game 0, step 26
{'turn': 25}
Game 0, step 27
{'turn': 26}
Game 0, step 28
{'turn': 27}
Game 0, step 29
{'turn': 28}
Game 0, step 30
{'turn': 29}
Game 0, step 31
{'turn': 30}
Game 0, step 32
{'turn': 31}
Game 0, step 33
{'turn': 32}
Game 0, step 34
{'turn': 33}
Game 0, step 35
{'turn': 34}
Game