In [1]:
from aiFrost2 import AgentFrost2, MahjongNetFrost2
import tensorflow as tf
import numpy as np
from copy import deepcopy
from buffer import MahjongBufferFrost2
import MahjongPy as mp
from wrapper import EnvMahjong2
import scipy.io as sio
from datetime import datetime

# Timestamp of this run; used below to tag checkpoint directories and the final .mat file.
now = datetime.now()
datetime_str = now.strftime("%Y%m%d-%H%M%S")

# One separate TensorFlow graph per agent.
graphs = [tf.Graph() for _ in range(4)]

env = EnvMahjong2()

# Feature dimensions reported by the environment wrapper.
num_tile_type = env.matrix_feature_size[0]
num_each_tile = env.matrix_feature_size[1]
num_vf = env.vector_feature_size

# The same three dimensions are passed to the network, the buffer and the agent.
feature_dims = dict(num_tile_type=num_tile_type, num_each_tile=num_each_tile, num_vf=num_vf)

agents = []
for agent_no in range(4):
    network = MahjongNetFrost2(graphs[agent_no], agent_no=agent_no, **feature_dims)
    replay = MahjongBufferFrost2(size=1024, **feature_dims)
    # `greedy` is drawn log-uniformly from [0.1, 10) independently for every agent.
    greedy = 10.0 ** np.random.uniform(-1, 1)
    agents.append(AgentFrost2(nn=network, memory=replay, greedy=greedy, **feature_dims))


Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


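##  以下的代码可以让Agent读取保存的网络， 如果comment掉就可以让Agent从头开始训练 (The code below lets the agents load saved networks; if it is commented out, the agents train from scratch)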

In [None]:
## example 
# model_path =  "../log/Agent0-20190501-175203-Game0/naiveAI.ckpt"
# for i in range(4):
#     agents[i].nn.restore(model_path)
    

# Note:

### This is for AI agents that only care about themselves, i.e., no defense. Therefore, there is no negative reward.

### Also, 能和则和，能立直则立直 (win whenever a win is available; declare riichi whenever riichi is available)

In [None]:

# Self-play training loop.
#
# For every game the four agents play one episode in `env`. Per-player lists of
# states, rewards, done flags and policies are collected; when the game ends the
# episode is handed to each agent's memory (always when some score changed,
# otherwise with probability 0.025) and every agent runs 5 learning updates.
# After all games, env.final_score_changes is written to a .mat file.
n_games = 1000000

print("Start!")

for n in range(n_games):
    
    if n % 10000 == 0:
        for i in range(4):
            agents[i].nn.save(model_dir= "Agent{}-".format(i) + datetime_str + "-Game{}".format(n))  # save network parameters every 10000 games
    print("\r Game {}".format(n), end='')

    # Per-player episode records (index = player number).
    episode_dones = [[], [], [], []]
    episode_states = [[], [], [], []]
    episode_rewards = [[], [], [], []]
    episode_policies = [[], [], [], []]
    
    done = 0
#     policies = np.zeros([4,], dtype=np.int32)
    actions = np.zeros([4,], dtype=np.int32)
    rs = np.zeros([4,], dtype=np.float32)
    
    this_states = env.reset()  ## for all players
    
    next_aval_states = deepcopy(this_states)
    next_states = [[], [], [], []]
    
    # NOTE: `step` is only incremented in the "response" branch below.
    step = 0
    
    while not done and step < 10000:

        # Ask the environment whose turn it is and which phase ("play" or "response") comes next.
        who, what = env.who_do_what()
        
        ## make selection
        if what == "play":
            
            ######################## Draw a tile #####################
            
            next_states[who], r, done, _ = env.step_draw(playerNo=who)
            
            episode_dones[who].append(done)
            episode_states[who].append(this_states[who])
            episode_rewards[who].append(r)
            episode_policies[who].append(np.array([1.])) # only 1 available action (draw)
            
            this_states[who] = deepcopy(next_states[who])
            
            ###################### Play a tile #######################
            ###### Win whenever possible, declare riichi whenever possible ############
            # NOTE(review): the heuristic below is commented out, so `good_actions` stays
            # empty, `aval_actions` is unused, and the `len(good_actions) > 0` branch never runs.
            aval_actions = env.t.get_self_actions()
            good_actions = []
#             for a in range(len(aval_actions)):
#                 if aval_actions[a].action == mp.Action.Riichi:
#                     good_actions.append(a)
                    
#                 if aval_actions[a].action == mp.Action.Tsumo:
#                     good_actions.append(a)
            #######################################
            
            next_aval_states = env.get_aval_next_states(who)  ## for a single player
            
            if len(good_actions) > 0:
                # Restrict the agent's choice to the preferred actions, then map the
                # chosen index back into the full action list.
                good_actions = np.reshape(good_actions, [-1, ])
                a_in_good_as, policy = agents[who].select([next_aval_states[0][good_actions], next_aval_states[1][good_actions]])
                action = good_actions[a_in_good_as]
            else:
                action, policy = agents[who].select(next_aval_states)
            
            next_states[who], r, done, _ = env.step_play(action, playerNo=who)
            
            # Overwrite the state returned by step_play with the one read back from the environment.
            next_states[who] = env.get_state_(who)
            
            episode_dones[who].append(done)
            episode_states[who].append(this_states[who])
            episode_rewards[who].append(r)
            episode_policies[who].append(policy) # policy returned by the agent's select() for this play
            
            this_states[who] = deepcopy(next_states[who])
            
#             step += 2
            
        elif what == "response":
            policies = [[], [], [], []]
            for i in range(4):
                next_aval_states = env.get_aval_next_states(i)
                
                ######################## Win whenever possible, declare riichi whenever possible ##############
                # NOTE(review): as in the "play" branch, the heuristic is commented out, so
                # `good_actions` is always empty here and `aval_actions` is unused.
                aval_actions = env.t.get_response_actions()
                good_actions = []
#                 for a in range(len(aval_actions)):
#                     if aval_actions[a].action == mp.Action.Ron:
#                         good_actions.append(a)
                
#                     if aval_actions[a].action == mp.Action.ChanKan:
#                         good_actions.append(a)

#                     if aval_actions[a].action == mp.Action.ChanAnKan:
#                         good_actions.append(a)
                ##########################################################
                if len(good_actions) > 0:
                    good_actions = np.reshape(good_actions, [-1, ])
                    a_in_good_as, policies[i] = agents[i].select([next_aval_states[0][good_actions], next_aval_states[1][good_actions]])
                    actions[i] = good_actions[a_in_good_as]
                else:
                    actions[i], policies[i] = agents[i].select(next_aval_states)
                
                next_states[i], rs[i], done, _ = env.step_response(actions[i], playerNo=i)
                
                ## Note: next_states is agent's prediction, but not the true one
                
            # table change after all players making actions

            for i in range(4):
                # Replace the predicted state with the one read back from the environment.
                next_states[i] = env.get_state_(i)
                episode_dones[i].append(done)
                episode_states[i].append(this_states[i])
                episode_rewards[i].append(rs[i])
                episode_policies[i].append(policies[i]) # policy returned by the agent's select() for this response
        
            ## next step
            for i in range(4):
                this_states[i] = deepcopy(next_states[i])
            
            step += 1
        
#         print("Game {}, step {}".format(n, step))
#         print(env.get_phase_text())
        
        if done:      
            final_score_change = env.get_final_score_change()
            for i in range(4):
                # Append the terminal state so the state list is one longer than the other lists.
                episode_states[i].append(env.get_state_(i))
                
                if len(episode_dones[i]) >= 1: # if not 1st turn end
                    episode_dones[i][-1] = 1
                
                #### Keep the following line disabled if agents should not care about others;
                #### enabling it overwrites the last reward with the final score change.
#                 episode_rewards[i][-1] = final_score_change[i]
                ##################################################
            
            if not np.max(final_score_change) == 0: ## score change: always store the episode
                for i in range(4):
                    agents[i].remember_episode(episode_states[i], episode_rewards[i],
                                               episode_dones[i], episode_policies[i], weight=1)
                print(' ')
                print(env.t.get_result().result_type, end='')
                print(": Totally {} steps".format(np.shape(episode_dones[0])[0]))
                
#                 with open("./Paipu/"+datetime.now().strftime("%Y%m%d-%H%M%S")+".txt", 'w') as fp:
#                     fp.write(mp.GameLogToString(env.t.game_log).decode('GBK'))
#                     break
            else:
                if np.random.rand() < 0.025: ## no score change: store the episode only with probability 0.025
                    for i in range(4):
                        agents[i].remember_episode(episode_states[i], episode_rewards[i],
                                                   episode_dones[i], episode_policies[i], weight=1)
                    print(' ')
                    print(env.t.get_result().result_type, end='')
                    print(": Totally {} steps".format(np.shape(episode_dones[0])[0]))
                    
            # After every finished game, run 5 learning updates for each agent.
            for n_train in range(5):
                for i in range(4):
                    agents[i].learn(env.symmetric_matrix_features, episode_start=16, logging=True)
            

# Persist the score changes accumulated by the environment (stored under the key "rons").
data = {"rons": env.final_score_changes}
sio.savemat("./final_score_changes" + datetime_str + ".mat", data)


Start!
Model saved in path: ../log/Agent0-20190520-202634-Game0/naiveAI.ckpt
Model saved in path: ../log/Agent1-20190520-202634-Game0/naiveAI.ckpt
Model saved in path: ../log/Agent2-20190520-202634-Game0/naiveAI.ckpt
Model saved in path: ../log/Agent3-20190520-202634-Game0/naiveAI.ckpt
 Game 57 
ResultType.NoTileRyuuKyoku: Totally 118 steps
 Game 61 
ResultType.NoTileRyuuKyoku: Totally 117 steps
 Game 80 
ResultType.RonAgari: Totally 67 steps
 Game 86 
ResultType.NoTileRyuuKyoku: Totally 117 steps
 Game 106 
ResultType.NoTileRyuuKyoku: Totally 114 steps
 Game 160 
ResultType.NoTileRyuuKyoku: Totally 117 steps
 Game 198 
ResultType.NoTileRyuuKyoku: Totally 112 steps
 Game 254 
ResultType.NoTileRyuuKyoku: Totally 118 steps
 Game 299 
ResultType.NoTileRyuuKyoku: Totally 114 steps
 Game 361 
ResultType.NoTileRyuuKyoku: Totally 121 steps
 Game 383 
ResultType.NoTileRyuuKyoku: Totally 116 steps
 Game 412 
ResultType.NoTileRyuuKyoku: Totally 122 steps
 Game 436 
ResultType.NoTileRyuuKyoku: To

In [None]:
## Check tiles
def _dump_per_player(get_items, describe):
    """Print 'player N' for each of the 4 players, followed by one line per item.

    get_items: callable taking a player object and returning an indexable collection.
    describe:  callable turning one item of that collection into the value to print.
    """
    for seat in range(4):
        items = get_items(env.t.players[seat])
        print('player {}'.format(seat))
        for pos in range(len(items)):
            print(describe(items[pos]))


# First every player's hand tiles, then every player's fulus, in that order.
_dump_per_player(lambda player: player.hand, lambda entry: entry.tile)
_dump_per_player(lambda player: player.fulus, lambda entry: entry.to_string())

In [None]:
# Print what the table reports via get_selected_action_tile()
# (presumably the tile tied to the currently selected action; confirm against MahjongPy).
selected_action_tile = env.t.get_selected_action_tile()
print(selected_action_tile)

In [None]:
def _hand_matrix(tile_counts):
    """Build a 34 x 4 nested list of 0/1 ints describing a hand.

    Row ``t`` belongs to tile type ``t``. A tile held ``c`` times is written as
    ``c`` leading ones followed by zeros, e.g. three copies -> ``[1, 1, 1, 0]``.

    tile_counts: dict mapping a row index (0..33) to the number of copies held.
                 Rows missing from the dict are all zeros.
    """
    return [[1 if copy < tile_counts.get(tile, 0) else 0 for copy in range(4)]
            for tile in range(34)]


# Row layout used below: rows 0-8 are the "m" suit and rows 18-26 the "s" suit;
# presumably rows 9-17 are the "p" suit and rows 27-33 the honors (confirm against EnvMahjong2).

# yi qi guan tong: one copy of each of rows 0-8, plus one copy of each of rows 9-12
hand_matrix_yiqi = _hand_matrix(dict.fromkeys(range(0, 13), 1))


# pi hu: rows 1-3, 5-7 and 13-15 once each, row 19 three times, row 22 once
hand_matrix_pihu = _hand_matrix({
    1: 1, 2: 1, 3: 1,
    5: 1, 6: 1, 7: 1,
    13: 1, 14: 1, 15: 1,
    19: 3,
    22: 1,
})


# lan pai: thirteen scattered single tiles
hand_matrix_lan = _hand_matrix(dict.fromkeys(
    [0, 2, 4, 6, 8,
     9, 11, 13, 15, 17,
     19, 21, 25],
    1))



# yi qi guan tong 2: one copy of each of rows 9-12, plus one copy of each of rows 18-26
hand_matrix_yiqi2 = _hand_matrix(dict.fromkeys(list(range(9, 13)) + list(range(18, 27)), 1))

## Predict score (value function)

In [None]:
# Pure straight (yi qi guan tong) in the m suit.
# NOTE(review): the 34 x 4 shape is hard-coded; confirm it matches num_tile_type / num_each_tile
# and the inputs expected by nn.output.
yiqi_features = np.reshape(hand_matrix_yiqi, [1, 34, 4, 1])
print(agents[0].nn.output(yiqi_features))

In [None]:
# Pi hu (a cheap, minimal-value hand).
# NOTE(review): the 34 x 4 shape is hard-coded; confirm it matches num_tile_type / num_each_tile
# and the inputs expected by nn.output.
pihu_features = np.reshape(hand_matrix_pihu, [1, 34, 4, 1])
print(agents[0].nn.output(pihu_features))

In [None]:
# A hand that cannot reach tenpai (ready).
# NOTE(review): the 34 x 4 shape is hard-coded; confirm it matches num_tile_type / num_each_tile
# and the inputs expected by nn.output.
lan_features = np.reshape(hand_matrix_lan, [1, 34, 4, 1])
print(agents[0].nn.output(lan_features))

In [None]:
# Pure straight (yi qi guan tong) in the s suit.
# NOTE(review): the 34 x 4 shape is hard-coded; confirm it matches num_tile_type / num_each_tile
# and the inputs expected by nn.output.
yiqi2_features = np.reshape(hand_matrix_yiqi2, [1, 34, 4, 1])
print(agents[0].nn.output(yiqi2_features))