In [1]:
# 計算用ライブラリ mahjong:https://pypi.org/project/mahjong/　
# 参考:https://qiita.com/FJyusk56/items/8189bcca3849532d095f
!pip install mahjong



In [2]:
#計算
from mahjong.hand_calculating.hand import HandCalculator
#麻雀牌
from mahjong.tile import TilesConverter
#役, オプションルール
from mahjong.hand_calculating.hand_config import HandConfig, OptionalRules
#鳴き
from mahjong.meld import Meld
#風(場&自)
from mahjong.constants import EAST, SOUTH, WEST, NORTH
#シャンテン数
from mahjong.shanten import Shanten
# モデルの構築用
import numpy as np

import torch
from torch import nn, optim

import random

# シード値
seed = 1234
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# 入力用に型変換
def convert(tehai):
  tehai.sort()
  man = "".join([str(i) for i in tehai if i < 10])
  pin = "".join([str(i-10) for i in tehai if 10<i and i<20])
  sou = "".join([str(i-20) for i in tehai if 20<i and i<30])
  honors = "".join([str(i-30) for i in tehai if 30<i and i<38])
  return man,pin,sou,honors
# ライブラリを用いたシャンテン数計算(そのままだと数字のリスト、ライブラリに渡すのはstr型)
def calc_shanten(tehai):
  #Shanten(シャンテン数計算用クラス)のインスタンスを生成
  shanten = Shanten()
  man,pin,sou,honors = convert(tehai)
  # 手牌
  tiles = TilesConverter.string_to_34_array(man=man,pin=pin,sou=sou,honors=honors)
  # 計算
  result = shanten.calculate_shanten(tiles)
  return result

# ライブラリを用いた点数計算(鳴き無し,自摸,親,東場想定) 今回不使用
# l_tsumo = last tsumo = アガリ牌
def calc_points(tehai,l_tsumo):
  
  #HandCalculator(計算用クラス)のインスタンスを生成
  calculator = HandCalculator()
  
  # 型変換
  man,pin,sou,honors = convert(tehai)
  tiles = TilesConverter.string_to_136_array(man=man,pin=pin,sou=sou,honors=honors)
  
  # アガリ牌
  if l_tsumo < 10:
    win_tile = TilesConverter.string_to_136_array(man=str(l_tsumo))[0]
  elif l_tsumo < 20:
    win_tile = TilesConverter.string_to_136_array(pin=str(l_tsumo-10))[0]
  elif l_tsumo < 30:
    win_tile = TilesConverter.string_to_136_array(sou=str(l_tsumo-20))[0]
  else:
    win_tile = TilesConverter.string_to_136_array(honors=str(l_tsumo-30))[0]
  
  # 鳴き
  melds = None
  
  # ドラ
  dora_indicators = None
  
  # オプション
  config = HandConfig(is_tsumo=True, player_wind=EAST, round_wind=EAST)

  # 計算
  result = calculator.estimate_hand_value(tiles, win_tile,melds,dora_indicators, config)
  return result

In [3]:
# 萬子=1~9　筒子=11~19　索子=21~29　字牌=31~37(東南西北白発中)　ドラ無し,槓無し想定
class mt:
  def __init__(self):
    # 山を作る
    temp = [x for x in range(1,10)]*4
    self.mt = temp + list(map(lambda x:x+10,temp)) + list(map(lambda x:x+20,temp))
    self.mt += [x+30 for x in range(1,8)]*4
    random.shuffle(self.mt)
  
  def __len__(self):
    return len(self.mt)

  def tsumo(self):
    # 自摸
    return self.mt.pop()
  
  def haipai(self):
    t = []
    for i in range(14):
      t.append(self.tsumo())
    t.sort()
    return t
  
  def yama(self):
    return self.mt

  def reset(self):
    # 山を積む
    temp = [x for x in range(1,10)]*4
    self.mt = temp + list(map(lambda x:x+10,temp)) + list(map(lambda x:x+20,temp))
    self.mt += [x+30 for x in range(1,8)]*4
    random.shuffle(self.mt)

In [4]:
# テスト
m = mt()
tehai = []
for i in range(14):
  tehai.append(m.tsumo())
print(tehai)
tehai.sort()
print(tehai)
print(convert(tehai))
len(m.mt)
s = calc_shanten(tehai)
print(str(s)+"シャンテン")
if s == 0:
  print(calc_points(tehai))


[35, 3, 2, 6, 9, 4, 8, 21, 17, 3, 4, 22, 35, 19]
[2, 3, 3, 4, 4, 6, 8, 9, 17, 19, 21, 22, 35, 35]
('23344689', '79', '12', '55')
2シャンテン


In [5]:
# 
tehai = [3, 3, 5, 6, 7, 13, 14, 15, 19, 19, 23, 24, 25, 31]
calc_shanten(tehai)

0

In [6]:
# テスト用(天和のみアガリ)　ボツ
# while True:
#   m = mt()
#   tehai = m.haipai()
#   if calc_shanten(tehai) == 0:
#     last = tehai[-1]
#     tehai.sort()
#     print(tehai)
#     comment = calc_points(tehai,last)
#     # テンパイとアガリを区別する
#     if comment.cost is None:
#       continue
#     else:
#       print(comment)
#       break

In [7]:
#結果出力用
def print_hand_result(hand_result):
     #翻数, 符数
     print(hand_result.han, hand_result.fu)
     #点数(ツモアガリの場合[左：親失点, 右:子失点], ロンアガリの場合[左:放銃者失点, 右:0])
     print(hand_result.cost['main'], result.cost['additional'])
     #役
     print(hand_result.yaku)
     #符数の詳細
     for fu_item in hand_result.fu_details:
          print(fu_item)
     print('')

In [8]:
tehai = [1,1,1,2,2,2,3,3,3,4,4,4,35,35]
result = calc_points(tehai,35)
print_hand_result(result)
print(result.cost['main'])

26 50
32000 32000
[Suu ankou tanki]
{'fu': 20, 'reason': 'base'}
{'fu': 8, 'reason': 'closed_terminal_pon'}
{'fu': 4, 'reason': 'closed_pon'}
{'fu': 4, 'reason': 'closed_pon'}
{'fu': 4, 'reason': 'closed_pon'}
{'fu': 2, 'reason': 'valued_pair'}
{'fu': 2, 'reason': 'pair_wait'}
{'fu': 2, 'reason': 'tsumo'}

32000


In [9]:
class DQN_Solver(nn.Module):
    def __init__(self):
        super(DQN_Solver, self).__init__()
        # self.obs_size = state_size # list size of state
        # self.action_size = action_size # list size of action
        # self.gamma = 0.9 # discount rate
        self.epsilon = 1.0 # randomness of choosing random action or the best one
        self.e_decay = 0.9999 # epsilon decay rate
        self.e_min = 0.01 # minimum rate of epsilon
        # self.learning_rate = 0.0001 # learning rate of neural network
        
        # 状態価値を求める(近似する)ネットワーク
        # model for neural network(DQNの場合はConvがある)
        # Conv対応の配列に落とし込めた時のたらFlatten()の前に挿入
        self.model_s = nn.Sequential(
            nn.Flatten(),
            nn.Linear(3*4*9, 36),# 捨て牌,手牌,山の三種類でそれぞれに何枚ずつあるかを萬子,筒子,索子,字牌 * 9(字牌は0の部分が2つ存在,最大値=4)
            nn.ReLU(),
            nn.Linear(36, 1)
        )
        
        # advantage〃
        self.model_a = nn.Sequential(
          nn.Flatten(),
          nn.Linear(3*4*9, 36),
          nn.ReLU(),
          nn.Linear(36,14)# 切る牌の候補は14種類
        )
        # self.model.summary() # model summary

    def update_epsilon(self):
        if self.epsilon > self.e_min:
          self.epsilon *= self.e_decay

    def reset_epsilon(self, epsilon = 1.0):
        self.epsilon = epsilon
    
    def invalidate_epsilon(self):
        self.epsilon = 0.0
      

    # # remember state, action, its reward, next state and next possible action. done means boolean for goal
    # def remember_memory(self, state, action, reward, next_state, next_movables, done):
    #     self.memory.append((state, action, reward, next_state, next_movables, done))
    # prioritized experience replay
    # prioritiezed replayの実装のためにdequeから変更
    # PrioritizedReplayBufferクラスに移動済み
    # 経験をリプレイバッファに保存する． 経験は(obs, action, reward, next_obs, done)の5つ組を想定            
    
    # ε-greedy
    def choose_action(self, obs, n_action):
        if self.epsilon >= random.random():
            # randomly choosing action
            return random.randrange(n_action)
        else:
            with torch.no_grad():
            # 作成したネットワークから行動選択
              return torch.argmax(self.forward(obs)).item()

    # ここをKerasからtensorに対応させる
    # choose the best action to maximize reward expectation
    # def choose_best_action(self, state, movables):
    #     best_actions = []
    #     max_act_value = -100
    #     for a in movables:
    #         np_action = np.array([[state, a]])
    #         act_value = self.model.predict(np_action)
    #         if act_value > max_act_value:
    #             best_actions = [a,]
    #             max_act_value = act_value
    #         elif act_value == max_act_value:
    #             best_actions.append(a)
    #     return random.choice(best_actions)
    def forward(self, obs):
        state_values = self.model_s(obs)
        advantage = self.model_a(obs)
       
        # 状態価値 + アドバンテージ で行動価値を計算しますが、安定化のためアドバンテージの（行動間での）平均を引きます
        action_values = state_values + advantage - torch.mean(advantage, dim=1, keepdim=True)
        return action_values
        

    # 9/10はここから
    # this experience replay is going to train the model from memorized states, actions and rewards
    # def replay_experience(self, batch_size):
    #     batch_size = min(batch_size, len(self.memory))
    #     minibatch = random.sample(self.memory, batch_size)
    #     X = []
    #     Y = []
    #     for i in range(batch_size):
    #         obs, action, reward, next_state, next_movables, done = minibatch[i]
    #         input_action = [obs, action]
    #         if done:
    #             target_f = reward
    #         else:
    #             next_rewards = []
    #             for i in next_movables:
    #                 np_next_s_a = np.array([[next_state, i]])
    #                 next_rewards.append(self.model.predict(np_next_s_a))
    #             np_n_r_max = np.amax(np.array(next_rewards))
    #             target_f = reward + self.gamma * np_n_r_max
    #         X.append(input_action)
    #         Y.append(target_f)
    #     np_X = np.array(X)
    #     np_Y = np.array([Y]).T
    #     self.model.fit(np_X, np_Y, epochs=1, verbose=0)
    #     if self.epsilon > self.e_min:
    #         self.epsilon *= self.e_decay

In [10]:
class PrioritizedReplayBuffer(object):
  def __init__(self, buffer_size):
    self.buffer_size = buffer_size
    self.index = 0
    self.buffer = []
    self.priorities = np.zeros(buffer_size, dtype = np.float32)
    self.priorities[0] = 1.0

  def __len__(self):
    return len(self.buffer)
  
  def push(self, experience):
    if len(self.buffer) < self.buffer_size:
        self.buffer.append(experience)
    else:
        self.buffer[self.index] = experience

    # 優先度は最初は大きな値で初期化しておき, 後でサンプルされた時に更新する
    self.priorities[self.index] = self.priorities.max()
    self.index = (self.index + 1) % self.buffer_size

  def sample(self, batch_size, alpha = 0.6, beta = 0.4):
    # 現在経験が入っている部分に対応する優先度を取り出し, サンプルする確率を計算
    priorities = self.priorities[: self.buffer_size if len(self.buffer) == self.buffer_size else self.index]
    priorities = priorities ** alpha
    prob = priorities / priorities.sum()

    # 上で計算した確率に従ってリプレイバッファ中のインデックスをサンプルする
    indices = np.random.choice(len(self.buffer), batch_size, p=prob)

    # 学習の方向性を補正するための重みを計算
    weights = (len(self.buffer) * prob[indices]) ** (-beta)
    weights = weights / weights.max()

    # 上でサンプルしたインデックスに基づいて経験をサンプルし, (obs, action, reward, next_obs, done)に分ける
    obs, action, reward, next_obs, done = zip(*[self.buffer[i] for i in indices])

    # あとで計算しやすいようにtorch.Tensorに変換して(obs, action, reward, next_obs, done, indices, weights)の7つ組を返す
    return (torch.stack(obs),
            torch.as_tensor(action), 
            torch.as_tensor(reward, dtype=torch.float32),
            torch.stack(next_obs), 
            torch.as_tensor(done, dtype=torch.uint8),
            indices,
            torch.as_tensor(weights, dtype=torch.float32))
  def update_priorities(self, indices, priorities):
    self.priorities[indices] = priorities + 1e-4

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [12]:
"""
    リプレイバッファの宣言
"""
buffer_size = 100000  #　リプレイバッファに入る経験の最大数
initial_buffer_size = 10000  # 学習を開始する最低限の経験の数
replay_buffer = PrioritizedReplayBuffer(buffer_size)


"""
    ネットワークの宣言
"""
net = DQN_Solver().to(device)
# net = CNNQNetwork(env.observation_space.shape, n_action=env.action_space.n).to(device)
# target_net = CNNQNetwork(env.observation_space.shape, n_action=env.action_space.n).to(device)
# target_update_interval = 2000  # 学習安定化のために用いるターゲットネットワークの同期間隔


"""
    オプティマイザとロス関数の宣言
"""
optimizer = optim.Adam(net.parameters(), lr=1e-4)  # オプティマイザはAdam
loss_func = nn.SmoothL1Loss(reduction='none')  # ロスはSmoothL1loss（別名Huber loss）- MAEとMSEのいいとこどり


"""
    Prioritized Experience Replayのためのパラメータβ
"""
beta_begin = 0.4
beta_end = 1.0
beta_decay = 500000
# beta_beginから始めてbeta_endまでbeta_decayかけて線形に増やす
beta_func = lambda step: min(beta_end, beta_begin + (beta_end - beta_begin) * (step / beta_decay))


"""
    探索のためのパラメータε
"""
# epsilon_begin = 1.0
# epsilon_end = 0.01
# epsilon_decay = 50000
# # epsilon_beginから始めてepsilon_endまでepsilon_decayかけて線形に減らす
# epsilon_func = lambda step: max(epsilon_end, epsilon_begin - (epsilon_begin - epsilon_end) * (step / epsilon_decay))


"""
    その他のハイパーパラメータ
"""
gamma = 0.99  #　割引率
batch_size = 32
n_episodes = 3000  # 学習を行うエピソード数

In [13]:
def update(batch_size, beta):
  # 今回、観測Oはint型(予定)だが、参考のコードになぞらえる
  obs, action, reward, next_obs, done, indices, weights = replay_buffer.sample(batch_size, beta)
  obs, action, reward, next_obs, done, weights \
      = obs.float().to(device), action.to(device), reward.to(device), next_obs.float().to(device), done.to(device), weights.to(device)
  q_values = net(obs).gather(1, action.unsqueeze(1)).squeeze(1)

  with torch.no_grad():
    greedy_action_next = torch.argmax(net(next_obs), dim=1)
    q_values_next = net(next_obs).gather(1, greedy_action_next.unsqueeze(1)).squeeze(1)

  target_q_values = reward + gamma * q_values_next * (1 - done)
  
  # Prioritized Experience Replayのために, ロスに重み付けを行なって更新します.
  optimizer.zero_grad()
  loss = (weights * loss_func(q_values, target_q_values)).mean()
  loss.backward()
  optimizer.step()

  #　TD誤差に基づいて, サンプルされた経験の優先度を更新します.
  replay_buffer.update_priorities(indices, (target_q_values - q_values).abs().detach().cpu().numpy())

  return loss.item()

In [14]:
def convert_to_obs(tehai, sutehai, yama):
  counter_t = []
  counter_s = []
  counter_y = []
  for i in range(4):
    temp_t = []
    temp_s = []
    temp_y = []
    for j in range(1,10):
      temp_t.append(tehai.count((i * 10) + j))
      temp_s.append(sutehai.count((i * 10) + j))
      temp_y.append(yama.count((i * 10) + j))
    counter_t.append(temp_t)
    counter_s.append(temp_s)
    counter_y.append(temp_y)
  return [counter_t, counter_s, counter_y]

def convert_to_tehai(obs):
  counter_t = obs[0]
  tehai = []
  for i in range(4):
    for j in range(9):
      # if counter_t[i][j] > 0:
      tehai.extend([(i * 10) + (j + 1)] * counter_t[i][j])
  return tehai

In [15]:
# 報酬関数(signal)を定義
# シャン点数が増えたか減ったか変化なしかで判定

In [16]:
import copy
step = 0
env = mt()
# テンパイ回数
num_T = 0
for episode in range(n_episodes):
  total_reward = 0
  tehai = env.haipai()
  sutehai = []
  yama = env.yama()
  obs = convert_to_obs(tehai, sutehai, yama)
  done = False
  # doneの判定作る、天和が問題?->そのままdoneとして環境を引き継ぐ?
  while not done:
    # 左から何番目の牌を捨てるか
    action = net.choose_action(torch.reshape(torch.as_tensor(obs).float().to(device),(-1,3,4,9)), 14)
    # 手牌のlistに変換して
    tehai = convert_to_tehai(obs)
    # 元のシャンテン数を計算し
    shanten_b = calc_shanten(tehai)
    # action(牌を捨てる)
    sutehai.append(tehai.pop(action))
    # 自摸をして
    tehai.append(env.tsumo())
    # 行動した結果シャンテン数が増えたか減ったかを判定し報酬とする
    shanten_a = calc_shanten(tehai) # シャンテン数は1以上増減することはないのでちょうどよさそう?
    # シャンテン数は減ったほうがいいので前から後を引く
    reward = shanten_b - shanten_a
    total_reward += reward
    yama = env.yama()
    # obsに変換
    next_obs = convert_to_obs(tehai, sutehai, yama)
    # テンパイなら終わり
    if shanten_a == 0:
      done = True
      num_T += 1
      # 配牌(14) + 自摸(1) = 15ができるだけの牌が山にない
      if len(env) < 15:
        env.reset()
    # 自摸ができない
    if len(env) == 0:
      done = True
      env.reset()
    # リプレイバッファに経験を蓄積
    replay_buffer.push([torch.as_tensor(obs), action, reward, torch.as_tensor(next_obs), done])
    obs = next_obs
    # ネットワークを更新            
    if len(replay_buffer) > initial_buffer_size:
        update(batch_size, beta_func(step))
    step += 1
    net.update_epsilon()
  print('Episode: {},  Step: {},  Reward: {}, L_shanten: {}シャンテン'.format(episode + 1, step + 1, total_reward, shanten_a))
print('自摸回数{}回で{}回テンパイ'.format(step, num_T))
  
# 入力の時にアガリかどうかを判定して頑張る

Episode: 1,  Step: 123,  Reward: 3, L_shanten: 2シャンテン
Episode: 2,  Step: 245,  Reward: 1, L_shanten: 3シャンテン
Episode: 3,  Step: 367,  Reward: 1, L_shanten: 2シャンテン
Episode: 4,  Step: 489,  Reward: 1, L_shanten: 2シャンテン
Episode: 5,  Step: 611,  Reward: 0, L_shanten: 4シャンテン
Episode: 6,  Step: 733,  Reward: -2, L_shanten: 4シャンテン
Episode: 7,  Step: 855,  Reward: -1, L_shanten: 2シャンテン
Episode: 8,  Step: 977,  Reward: 1, L_shanten: 3シャンテン
Episode: 9,  Step: 1099,  Reward: 1, L_shanten: 3シャンテン
Episode: 10,  Step: 1221,  Reward: 2, L_shanten: 2シャンテン
Episode: 11,  Step: 1343,  Reward: 1, L_shanten: 2シャンテン
Episode: 12,  Step: 1465,  Reward: 0, L_shanten: 4シャンテン
Episode: 13,  Step: 1587,  Reward: -1, L_shanten: 4シャンテン
Episode: 14,  Step: 1709,  Reward: 0, L_shanten: 3シャンテン
Episode: 15,  Step: 1831,  Reward: -1, L_shanten: 4シャンテン
Episode: 16,  Step: 1953,  Reward: 0, L_shanten: 3シャンテン
Episode: 17,  Step: 2075,  Reward: 2, L_shanten: 2シャンテン
Episode: 18,  Step: 2197,  Reward: 0, L_shanten: 4シャンテン
Episo

In [17]:
step = 0
num_T = 0
env.reset()
for episode in range(n_episodes):
  total_reward = 0
  tehai = env.haipai()
  sutehai = []
  yama = env.yama()
  obs = convert_to_obs(tehai, sutehai, yama)
  done = False
  # doneの判定作る、天和が問題?->そのままdoneとして環境を引き継ぐ?
  while not done:
    # 左から何番目の牌を捨てるか
    action = random.randrange(14)
    # listに変換して
    tehai = convert_to_tehai(obs)
    # 元のシャンテン数を計算し
    shanten_b = calc_shanten(tehai)
    # action(牌を捨てる)
    sutehai.append(tehai.pop(action))
    # 自摸をして
    tehai.append(env.tsumo())
    # 行動した結果シャンテン数が増えたか減ったかを判定し報酬とする
    shanten_a = calc_shanten(tehai) # シャンテン数は1以上増減することはないのでちょうどよさそう?
    # シャンテン数は減ったほうがいいので前から後を引く
    reward = shanten_b - shanten_a
    total_reward += reward
    yama = env.yama()
    # obsに変換
    next_obs = convert_to_obs(tehai, sutehai, yama)
    obs = next_obs
    # テンパイなら終わり
    if shanten_a == 0:
      done = True
      num_T += 1
      # 配牌(14) + 自摸(1) = 15ができるだけの牌が山にない
      if len(env) < 15:
        env.reset()
    # 自摸ができない
    if len(env) == 0:
      done = True
      env.reset()
    step += 1
  print('Episode: {},  Step: {},  Reward: {}, L_shanten: {}シャンテン'.format(episode + 1, step + 1, total_reward, shanten_a))
print('自摸回数{}回で{}回テンパイ'.format(step, num_T))
  

Episode: 1,  Step: 123,  Reward: 2, L_shanten: 1シャンテン
Episode: 2,  Step: 245,  Reward: -1, L_shanten: 4シャンテン
Episode: 3,  Step: 367,  Reward: 0, L_shanten: 2シャンテン
Episode: 4,  Step: 489,  Reward: 0, L_shanten: 3シャンテン
Episode: 5,  Step: 611,  Reward: 0, L_shanten: 4シャンテン
Episode: 6,  Step: 733,  Reward: 2, L_shanten: 1シャンテン
Episode: 7,  Step: 855,  Reward: 0, L_shanten: 2シャンテン
Episode: 8,  Step: 977,  Reward: 0, L_shanten: 4シャンテン
Episode: 9,  Step: 1099,  Reward: 0, L_shanten: 3シャンテン
Episode: 10,  Step: 1221,  Reward: 0, L_shanten: 3シャンテン
Episode: 11,  Step: 1343,  Reward: 0, L_shanten: 4シャンテン
Episode: 12,  Step: 1465,  Reward: -2, L_shanten: 4シャンテン
Episode: 13,  Step: 1587,  Reward: 1, L_shanten: 3シャンテン
Episode: 14,  Step: 1709,  Reward: -1, L_shanten: 4シャンテン
Episode: 15,  Step: 1831,  Reward: 1, L_shanten: 3シャンテン
Episode: 16,  Step: 1953,  Reward: 0, L_shanten: 2シャンテン
Episode: 17,  Step: 2075,  Reward: 1, L_shanten: 3シャンテン
Episode: 18,  Step: 2197,  Reward: 0, L_shanten: 3シャンテン
Episod

In [28]:
# 2m3m4m 6m7m8m 2p3p4p 2s3s4s 6s7s
tehai = [2,3,4,6,7,8,12,13,14,22,23,24,26,27]
sutehai = []
yama = [x for x in range(1,38) if not x % 10 == 0]*3
yama2 = [x for x in range(1,38) if not (x % 10 == 0 or x in tehai)]
yama.extend(yama2)
convert_to_obs(tehai,sutehai,yama)
ep = net.epsilon
net.invalidate_epsilon()
action = net.choose_action(torch.reshape(torch.as_tensor(obs).float().to(device), (-1,3,4,9)), 14)
tehai.pop(action)
print(tehai)
net.reset_epsilon(ep)

[2, 3, 4, 6, 7, 8, 12, 13, 14, 22, 23, 24, 27]


In [19]:
# 4m5m6m8m 3p4p5p7p 2s2s2s 東東東
tehai = [4,5,6,8,13,14,15,17,22,22,22,31,31,31]
sutehai = []
yama = [x for x in range(1,38) if not x % 10 == 0]*3
yama2 = [x for x in range(1,38) if not (x % 10 == 0 or x in tehai)]
yama.extend(yama2)
convert_to_obs(tehai,sutehai,yama)
ep = net.epsilon
net.invalidate_epsilon()
action = net.choose_action(torch.reshape(torch.as_tensor(obs).float().to(device), (-1,3,4,9)), 14)
tehai.pop(action)
print(tehai)
net.reset_epsilon(ep)

[4, 5, 6, 8, 13, 14, 15, 17, 22, 22, 22, 31, 31]


In [20]:
# 2m3m4m 5m6m 2p3p4p 7p9p 2s3s4s5s
tehai = [2,3,4,5,6,12,13,14,17,19,22,23,24,25]
sutehai = []
yama = [x for x in range(1,38) if not x % 10 == 0]*3
yama2 = [x for x in range(1,38) if not (x % 10 == 0 or x in tehai)]
yama.extend(yama2)
convert_to_obs(tehai,sutehai,yama)
ep = net.epsilon
net.invalidate_epsilon()
action = net.choose_action(torch.reshape(torch.as_tensor(obs).float().to(device), (-1,3,4,9)), 14)
tehai.pop(action)
print(tehai)
net.reset_epsilon(ep)

[2, 3, 4, 5, 6, 12, 13, 14, 17, 19, 22, 23, 25]


### 参考サイト一覧
麻雀AI作成 step 1 Pythonで一人麻雀ができるコードを書いてみた。https://qiita.com/minowa/items/107a5a0f3e0d1fe04ef6  
DQNで自作迷路を解くhttps://qiita.com/cvusk/items/e4f5862574c25649377a  
RL-card https://github.com/datamllab/rlcard  
Pythonライブラリの「麻雀(mahjong)」って？？
https://qiita.com/FJyusk56/items/8189bcca3849532d095f
