In [1]:
from aiFrost2 import AgentFrost2, MahjongNetFrost2
import numpy as np
from copy import deepcopy
from buffer import MahjongBufferFrost2
import MahjongPy as mp
from wrapper import EnvMahjong2
import scipy.io as sio
from datetime import datetime
import time

now = datetime.now()
datetime_str = now.strftime("%Y%m%d-%H%M%S")

env = EnvMahjong2()

num_tile_type = env.matrix_feature_size[0]
num_each_tile = env.matrix_feature_size[1]
num_vf = env.vector_feature_size

memories = [MahjongBufferFrost2(size=4096, num_tile_type=num_tile_type, num_each_tile=num_each_tile,
                                num_vf=num_vf) for i in range(4)]

episode_start = 256
episode_savebuffer = 128
mu_size = 40
max_steps = memories[0].episode_length

In [9]:
n_games = 100000

process_time = 0
learn_time = 0 
select_time = 0
all_time = 0
play_time = 0
response_time = 0
copy_time = 0
done_time = 0

print("Start!")

for n in range(n_games):
    
    st_all =  time.time()
    
#     if n % 10000 == 0:
#         for i in range(4):
#             agents[i].nn.save(model_dir= "Agent{}-".format(i) + datetime_str + "-Game{}".format(n))  # save network parameters every 10000 episodes
    
    for i in range(4):
        if memories[i].tail % episode_savebuffer == 0:
            memories[i].save("./buffer/Agent{}-".format(i) + "MahjongBufferFrost2_RanHoraPolicy_" + datetime_str + ".pkl")
    
    print("\r Game {}".format(n), end='')

    episode_dones = np.zeros([4, max_steps], dtype=np.float16)
    episode_matrix_features = np.zeros([4, max_steps, num_tile_type, num_each_tile], dtype=np.float16)
    episode_vector_features = np.zeros([4, max_steps, num_vf], dtype=np.float16)
    episode_rewards = np.zeros([4, max_steps], dtype=np.float32)
    episode_actions = np.zeros([4, max_steps], dtype=np.int32)
    episode_policies = np.zeros([4, max_steps, mu_size], dtype=np.float32)

    done = 0
#     policies = np.zeros([4,], dtype=np.int32)
    actions = np.zeros([4,], dtype=np.int32)
    rs = np.zeros([4,], dtype=np.float32)
    
    this_states = env.reset()  ## for all players
    
    next_aval_states = deepcopy(this_states)
    next_states = [[], [], [], []]
    
    step = 0
    agent_step = [0, 0, 0, 0]
    
    while not done and step < 10000:

        who, what = env.who_do_what()
        
        st_play = time.time()
        ## make selection
        if what == "play":
            
            ######################## Draw a tile #####################
            
            next_states[who], r, done, _ = env.step_draw(playerNo=who)
            
            episode_dones[who, agent_step[who]] = done
            episode_matrix_features[who, agent_step[who]] = this_states[who][0]
            episode_vector_features[who, agent_step[who]] = this_states[who][1]
            episode_rewards[who, agent_step[who]] = r
            episode_actions[who, agent_step[who]] = 0
            policy = np.zeros([mu_size, ], dtype=np.float32)
            policy[0] += 1.
            episode_policies[who, agent_step[who]] = policy  # only 1 available action (draw)

            agent_step[who] += 1
            
            this_states[who] = deepcopy(next_states[who])
            
            ###################### Play a tile #######################
            ###### 能和则和，能立直则立直 ############
            aval_actions = env.t.get_self_actions()
            good_actions = []
            
            for a in range(len(aval_actions)):
                if aval_actions[a].action == mp.Action.Riichi:
                    good_actions.append(a)

                if aval_actions[a].action == mp.Action.Tsumo:
                    good_actions.append(a)
            #######################################
#             st_process = time.time()

#             next_aval_states = env.get_aval_next_states(who)  ## for a single player
            
#             et_process = time.time()
#             process_time += et_process - st_process     
            
            st = time.time()
            if len(good_actions) > 0:
                good_actions = np.reshape(good_actions, [-1, ])
                
                a_in_good_as = np.random.choice(len(good_actions))
                policy = np.ones(len(good_actions), dtype=np.float32) / len(good_actions)
                
                action = good_actions[a_in_good_as]
                tmp = np.zeros([mu_size,], dtype=np.float32)
                tmp[good_actions] = policy
                policy = deepcopy(tmp)
            else:
                action = np.random.choice(len(aval_actions))
                policy = np.ones(len(aval_actions), dtype=np.float32) / len(aval_actions)
                # covert policy to vector (with padding)
                tmp = np.zeros([mu_size,], dtype=np.float32)
                tmp[:np.shape(policy)[0]] = policy
                policy = deepcopy(tmp)
            
            et = time.time()
            select_time += et - st
            
            next_states[who], r, done, _ = env.step_play(action, playerNo=who)
            
            next_states[who] = env.get_state_(who)
            
            episode_dones[who, agent_step[who]] = done
            episode_matrix_features[who, agent_step[who]] = this_states[who][0]
            episode_vector_features[who, agent_step[who]] = this_states[who][1]
            episode_rewards[who, agent_step[who]] = r
            episode_actions[who, agent_step[who]] = action
            episode_policies[who, agent_step[who]] = policy
            agent_step[who] += 1
            
            this_states[who] = deepcopy(next_states[who])
            et_play = time.time()
            play_time += et_play - st_play
#             step += 2
        
        st_response = time.time()
        if what == "response":
            policies = [np.zeros([mu_size,], dtype=np.float32) for _ in range(4)]
            for i in range(4):
                
#                 st_process = time.time()
#                 next_aval_states = env.get_aval_next_states(i)
#                 et_process = time.time()
#                 process_time += et_process - st_process
                
                ######################## 能和则和，能立直则立直 ##############
                aval_actions = env.t.get_response_actions()
                good_actions = []
                
                for a in range(len(aval_actions)):
                    if aval_actions[a].action == mp.Action.Ron:
                        good_actions.append(a)

                    if aval_actions[a].action == mp.Action.ChanKan:
                        good_actions.append(a)

                    if aval_actions[a].action == mp.Action.ChanAnKan:
                        good_actions.append(a)
                ##########################################################
                st = time.time()
                if len(good_actions) > 0:
                    good_actions = np.reshape(good_actions, [-1, ])
                    a_in_good_as = np.random.choice(len(good_actions))
                    policies[i] = np.ones(len(good_actions), dtype=np.float32) / len(good_actions)
                    actions[i] = good_actions[a_in_good_as]
                    # covert policy to vector (with padding)
                    tmp = np.zeros([mu_size,], dtype=np.float32)
                    tmp[good_actions] = policies[i]
                    policies[i] = deepcopy(tmp)
                    
                else:
                    actions[i] = np.random.choice(len(aval_actions))
                    policies[i] = np.ones(len(aval_actions), dtype=np.float32) / len(aval_actions)
                    # covert policy to vector (with padding)
                    tmp = np.zeros([mu_size,], dtype=np.float32)
                    tmp[:np.shape(policies[i])[0]] = policies[i]
                    policies[i] = deepcopy(tmp)
                
                et = time.time()
                select_time += et - st
                next_states[i], rs[i], done, _ = env.step_response(actions[i], playerNo=i)
                
                ## Note: next_states is agent's prediction, but not the true one
                
            # table change after all players making actions

            for i in range(4):
                episode_dones[i, agent_step[i]] = done
                episode_matrix_features[i, agent_step[i]] = this_states[i][0]
                episode_vector_features[i, agent_step[i]] = this_states[i][1]
                episode_rewards[i, agent_step[i]] = rs[i]
                episode_actions[i, agent_step[i]] = actions[i]
                episode_policies[i, agent_step[i]] = policies[i]
                agent_step[i] += 1
                
            ## next step
            st_copy = time.time()
            for i in range(4):
                this_states[i] = deepcopy(next_states[i])
            et_copy = time.time()
            copy_time += et_copy - st_copy
            
            step += 1
        et_response = time.time()
        response_time += et_response - st_response
#         print("Game {}, step {}".format(n, step))
#         print(env.get_phase_text())
        
        if done:      
            st_done = time.time()
            final_score_change = env.get_final_score_change()
            for i in range(4):
                current_state = env.get_state_(i)
                episode_matrix_features[i, agent_step[i]] = current_state[0]
                episode_vector_features[i, agent_step[i]] = current_state[1]
                
                if len(episode_dones[i]) >= 1: # if not 1st turn end
                    episode_dones[i][-1] = 1
                
                #### Disable the following line if not care others
#                 episode_rewards[i][-1] = final_score_change[i]
                ##################################################
            
            if not np.max(final_score_change) == 0: ## score change
                for i in range(4):
                    memories[i].append_episode(episode_matrix_features[i, 0: agent_step[i]],
                                               episode_vector_features[i, 0: agent_step[i]],
                                               episode_rewards[i, 0: agent_step[i]],
                                               episode_dones[i, 0: agent_step[i]],
                                               episode_actions[i, 0: agent_step[i]],
                                               episode_policies[i, 0: agent_step[i]],
                                               weight=0)
#                     agents[i].remember_episode(episode_states[i], episode_rewards[i],
#                                                episode_dones[i], episode_policies[i], weight=1)
                print(' ')
                print(env.t.get_result().result_type, end='')
                print(": Totally {} steps".format(np.shape(episode_dones[0])[0]))
                
#                 with open("./Paipu/"+datetime.now().strftime("%Y%m%d-%H%M%S")+".txt", 'w+') as fp:
#                     fp.write(mp.GameLogToString(env.t.game_log).decode('GBK'))
#                     break
            else:
                if np.random.rand() < 0.005: ## no score change
                    for i in range(4):
                        memories[i].append_episode(episode_matrix_features[i, 0: agent_step[i]],
                                                   episode_vector_features[i, 0: agent_step[i]],
                                                   episode_rewards[i, 0: agent_step[i]],
                                                   episode_dones[i, 0: agent_step[i]],
                                                   episode_actions[i, 0: agent_step[i]],
                                                   episode_policies[i, 0: agent_step[i]],
                                                   weight=0)
                    print(' ')
                    print(env.t.get_result().result_type, end='')
                    print(": Totally {} steps".format(np.shape(episode_dones[0])[0]))
            
#             st = time.time()
#             for n_train in range(5):
#                 for i in range(4):
#                     agents[i].learn(env.symmetric_matrix_features, episode_start=episode_start, logging=True)
#             et = time.time()
#             learn2_time += et - st
            
#             et_done = time.time()
#             done_time += et_done - st_done
            
            et_all = time.time()
            all_time += et_all - st_all

# data = {"rons": env.final_score_changes}
# sio.savemat("./final_score_changes" + datetime_str + ".mat", data)


Start!
 Game 44 
ResultType.TsumoAgari: Totally 150 steps
 Game 135 
ResultType.TsumoAgari: Totally 150 steps
 Game 192 
ResultType.RonAgari: Totally 150 steps
 Game 270 
ResultType.NoTileRyuuKyoku: Totally 150 steps
 Game 289 
ResultType.RonAgari: Totally 150 steps
 Game 342 
ResultType.NoTileRyuuKyoku: Totally 150 steps
 Game 366 
ResultType.RonAgari: Totally 150 steps
 Game 380 
ResultType.NoTileRyuuKyoku: Totally 150 steps
 Game 401 
ResultType.RonAgari: Totally 150 steps
 Game 409 
ResultType.NoTileRyuuKyoku: Totally 150 steps
 Game 509 
ResultType.TsumoAgari: Totally 150 steps
 Game 610 
ResultType.NoTileRyuuKyoku: Totally 150 steps
 Game 648 
ResultType.TsumoAgari: Totally 150 steps
 Game 801 
ResultType.RonAgari: Totally 150 steps
 Game 826 
ResultType.NoTileRyuuKyoku: Totally 150 steps
 Game 1063 
ResultType.RonAgari: Totally 150 steps
 Game 1201 
ResultType.RonAgari: Totally 150 steps
 Game 1224 
ResultType.TsumoAgari: Totally 150 steps
 Game 1432 
ResultType.NoTileRyuuKyoku:

RuntimeError: FILE:..\..\Mahjong\ScoreCounter.cpp LINE:408 FUNC:CounterResult::calculate_score MSG:ss.str().c_str()

In [None]:
memories[1].filled_size

In [None]:
for i in range(4):
    memories[i].save("./buffer/Agent{}-".format(i) + "MahjongBufferFrost2_RanHoraPolicy_228"  + ".npz")
    

In [13]:
i

3

In [12]:
## Check tiles
for p in range(4):
    hand = env.t.players[p].hand
    print('player {}'.format(p))
    for k in range(len(hand)):
        print(hand[k].tile)
for p in range(4):
    fulus = env.t.players[p].fulus
    print('player {}'.format(p))
    for k in range(len(fulus)):
        print(fulus[k].to_string())  

player 0
BaseTile._2m
BaseTile._9m
BaseTile._9m
BaseTile._1s
BaseTile._6s
BaseTile._9s
BaseTile._5p
BaseTile._5p
BaseTile._9p
BaseTile.west
player 1
BaseTile._2m
BaseTile._8m
BaseTile._9m
BaseTile._1s
BaseTile._1s
BaseTile._3s
BaseTile._2p
BaseTile._2p
BaseTile._3p
BaseTile.hatsu
player 2
BaseTile._3s
BaseTile._3s
BaseTile._6p
BaseTile._8p
player 3
BaseTile._1m
BaseTile._2m
BaseTile._3m
BaseTile._5m
BaseTile._9m
BaseTile._5s
BaseTile.east
BaseTile._7p
player 0
[1p][2p]([3p])
player 1
[8s]([8s])[8s]
player 2
[1p][2p]([3p])
([7s])[8s][9s]
[4m]([4m])[4m]
player 3
([4p])[5p][6p]
[7p]([7p])[7p]


In [None]:
env.t.DORA[0].tile

In [None]:
this_states[0][0][:,28]

In [None]:
import matplotlib.pyplot as plt
for i in range(4):
    plt.pcolor(env.get_state_(i)[0])
    plt.colorbar()
    plt.show()
    


In [None]:
for i in range(1):
    plt.pcolor(env.get_next_state(0, i)[0])
    print(env.t.get_response_actions()[0].action)
    plt.colorbar()
    plt.show()

## Predict score (value function)

In [11]:
with open("./Paipu/"+"bug_paipu"+".txt", 'w+') as fp:
    fp.write(mp.GameLogToString(env.t.game_log).decode('GBK'))


RuntimeError: FILE:..\..\Mahjong\GameLog.cpp LINE:58 FUNC:BaseGameLog::to_string MSG:"Invalid LogAction. Action: " + std::to_string(int(action))

In [None]:
mp.GameLogToString(env.t.game_log).decode('GBK')

In [None]:
env.t.get_self_actions()