p4s14406789z1357の場合、ピンズ、ソウズ、字牌になってる。
数牌の場合:
    通常の牌: 1~9 赤５牌: 0
字牌の場合:
    東南西北: 1~4 白発中: 567
開始情報の読み込み(試験用)

In [17]:
import sys
import re
import xml.etree.ElementTree as ET
import pandas as pd
from urllib.parse import unquote

# 段位の数値から名称へのマッピング (0: 新人 ～ 20: 天鳳位)
RANK_NAMES = [
    '新人','9級','8級','7級','6級','5級','4級','3級','2級','1級',
    '初段','二段','三段','四段','五段','六段','七段','八段','九段','十段',
    '天鳳位'
]

# seed の局順を人が読みやすい文字列にマッピング
ROUND_MAP = {
    0: '東一局', 1: '東二局', 2: '東三局',
    4: '南一局', 5: '南二局', 6: '南三局',
    8: '西一局', 9: '西二局', 10: '西三局'
}

# GO タグの type 属性ビットフラグ解析
def parse_table_type(type_val: int) -> (str, bool):
    """
    type_val のビットフラグから
      ・ルール文字列を生成して返す
      ・hongpai_flag (赤牌あり) を bool で返す
    """
    flags = {
        'demo':     not (type_val & 0x0001),
        'hongpai':  not (type_val & 0x0002),
        'ariari':   not (type_val & 0x0004),
        'dongfeng': not (type_val & 0x0008),
        'sanma':    bool(type_val & 0x0010),
        'soku':     bool(type_val & 0x0040)
    }

    # 三麻でない（＝四麻）の場合は処理を中止
    if not flags['sanma']:
        print("四麻は読み込めません。処理を中止します。")
        sys.exit(1)

    level = ((type_val & 0x0020) >> 4) | ((type_val & 0x0080) >> 7)
    parts = ['三', ['般','上','特','鳳'][level], '東' if flags['dongfeng'] else '南']
    if flags['ariari']:
        parts.append('喰')
    if flags['hongpai']:
        parts.append('赤')
    if flags['soku']:
        parts.append('速')

    return ''.join(parts), flags['hongpai']


def pai(pai_nums: list[int], hongpai: bool) -> str:
    """
    牌番号リスト pai_nums を電脳麻将形式に変換して返す。
    - pai_nums: 0～135 の整数リスト
    - hongpai: 赤牌ありかどうか
    ルール:
      suit = floor(n/36) -> 0:m, 1:p, 2:s, 3:z
      rank = int((n % 36)/4) + 1
      if hongpai and suit != 'z' and rank == 5 and n % 4 == 0: rank = 0
    """
    if not pai_nums:
        return None

    parts = []
    sorted_nums = sorted(pai_nums)
    suit_labels = ['m', 'p', 's', 'z']
    prev_suit = None

    for n in sorted_nums:
        suit = suit_labels[n // 36]
        rank = (n % 36) // 4 + 1
        # 赤牌の処理
        if hongpai and suit != 'z' and rank == 5 and (n % 4 == 0):
            rank = 0
        if suit != prev_suit:
            parts.append(suit)
            prev_suit = suit
        parts.append(str(rank))

    return ''.join(parts)


def parse_un_from_file(file_path):
    """
    指定したXMLファイルをパースし、DataFrame 用のレコードリストを返す。
    - ファイル名から 日付・時間 を抽出
    - GO タグから type を解析し (ルール文字列, hongpai_flag) を取得
    - TAIKYOKU タグから oya インデックスを取得し、oya != 0（東家以外）は中止
    - INIT タグから
        ・seed → [局順, 本場, 供託, サイコロ1, サイコロ2, ドラ表示牌] に分割し、
          - 局順 → '局' 列（例: '東一局' など）
          - 本場   → '本場' 列（整数）
          - 供託   → '供託' 列（整数）
          - サイコロ1,2 → +1 して [1～6] にしたリストを 'サイコロ' 列
          - ドラ表示牌 → 牌番号を int に変換し、後で pai() により 'ドラ表示牌' 列に電脳麻将形式で格納
        ・hai0,hai1,hai2 はそれぞれ '東家_手牌', '南家_手牌', '西家_手牌' に電脳麻将形式で格納
    - 各 <UN> タグからプレイヤー情報（n0～n2）をURLデコードし、
      座席ラベル（東家/南家/西家）の情報を取得。段位・レートも列として作成。
    """
    # ファイル名抽出と 日付/時間
    file_name = file_path.split('/')[-1]
    m = re.match(
        r"^tenhou_haifu_(\d{4})_(\d{2})_(\d{2})_(\d{2})_(\d{2})_\d{2}\.xml$",
        file_name
    )
    date_str = f"{m.group(1)}-{m.group(2)}-{m.group(3)}" if m else None
    time_str = f"{m.group(4)}:{m.group(5)}" if m else None

    # XMLパース
    tree = ET.parse(file_path)
    root = tree.getroot()

    # GOタグ解析 → ルール文字列, hongpai_flag を取得
    go_elem = root.find('.//GO')
    if go_elem is not None and 'type' in go_elem.attrib:
        try:
            table_type_str, hongpai_flag = parse_table_type(int(go_elem.attrib['type']))
        except ValueError:
            table_type_str, hongpai_flag = None, False
    else:
        table_type_str, hongpai_flag = None, False

    # TAIKYOKUタグ解析 → oyaインデックス取得
    taikyoku_elem = root.find('.//TAIKYOKU')
    if taikyoku_elem is not None and 'oya' in taikyoku_elem.attrib:
        try:
            oya_idx = int(taikyoku_elem.attrib['oya'])
        except ValueError:
            oya_idx = None
    else:
        oya_idx = None

    # oya が 0（東家）でない場合は中止
    if oya_idx is None or oya_idx != 0:
        print("TAIKYOKU の oya が東家（0）ではないため読み込めません。処理を中止します。")
        sys.exit(1)

    # INITタグ解析
    init_elem = root.find('.//INIT')
    if init_elem is not None:
        # seed をカンマで分割 (6つの要素を想定)
        seed_parts = init_elem.attrib.get('seed', '').split(',')
        try:
            seed_ints = [int(x) for x in seed_parts]
        except:
            seed_ints = []

        # “局順”(seed_ints[0]) → ROUND_MAP で文字列化 → '局' 列
        round_val = seed_ints[0] if len(seed_ints) > 0 else None
        round_name = ROUND_MAP.get(round_val)

        # “本場”(seed_ints[1]) → '本場' 列
        honba = seed_ints[1] if len(seed_ints) > 1 else None

        # “供託”(seed_ints[2]) → '供託' 列
        kyoutaku = seed_ints[2] if len(seed_ints) > 2 else None

        # “サイコロ”(seed_ints[3], seed_ints[4]) → +1 して [1～6] へ → 'サイコロ' 列
        if len(seed_ints) > 4:
            dice1 = seed_ints[3] + 1
            dice2 = seed_ints[4] + 1
            dice_list = [dice1, dice2]
        else:
            dice_list = None

        # “ドラ表示牌”(seed_ints[5]) → int 型にして後で変換。'ドラ表示牌番号'
        dorapawn_num = seed_ints[5] if len(seed_ints) > 5 else None

        # “ten” をカンマで分割 (4 要素分) → 各点数を 100 倍
        ten_parts = init_elem.attrib.get('ten', '').split(',')
        try:
            ten_ints = [int(x) for x in ten_parts]
        except:
            ten_ints = []

        # “oya”（INIT 内）も int 変換しておく (0～2 想定)
        try:
            oya_init = int(init_elem.attrib.get('oya', -1))
        except ValueError:
            oya_init = None

        # “hai0～hai3” を文字列で取得
        hai_parts = [init_elem.attrib.get(f'hai{i}', '') for i in range(4)]
    else:
        round_name = None
        honba = None
        kyoutaku = None
        dice_list = None
        dorapawn_num = None
        ten_ints = []
        oya_init = None
        hai_parts = ['', '', '', '']

    # 「東家/南家/西家」ラベル（三麻なので3人）
    seat_labels = ['東家', '南家', '西家']

    # INIT の oya を日本語ラベルに変換 (0→東家,1→南家,2→西家)
    if oya_init is not None and 0 <= oya_init < 3:
        parent_label = seat_labels[oya_init]
    else:
        parent_label = None

    # ドラ表示牌を電脳麻将形式に変換 (hongpai_flag を渡す)
    if dorapawn_num is not None:
        dora_pai_str = pai([dorapawn_num], hongpai_flag)
    else:
        dora_pai_str = None

    records = []
    for un in root.findall('.//UN'):
        attrib = un.attrib
        # n0～n2 の名前を URL デコード（存在しない場合は None）
        names = [unquote(attrib.get(f'n{i}', '')) or None for i in range(3)]
        dan_list = attrib.get('dan', '').split(',')
        rate_list = attrib.get('rate', '').split(',')

        # 基本情報レコード
        rec = {
            'ファイル名': file_name,
            '日付': date_str,
            '時間': time_str,
            '卓情報': table_type_str,

            # INIT の seed 情報
            '局': round_name,      # 例: '東一局' ～ '西三局'
            '本場': honba,
            '供託': kyoutaku,
            'サイコロ': dice_list, # 例: [3, 5]
            'ドラ表示牌': dora_pai_str,

            # INIT の oya を「親」列に
            '親': parent_label
        }

        # INIT の ten (各 4 要素) を 100 倍して「東家_点数」などに格納
        for i, seat in enumerate(seat_labels):
            try:
                score = ten_ints[i] * 100
            except Exception:
                score = None
            rec[f'{seat}_点数'] = score

        # INIT の hai0～hai2 を各座席の「○家_手牌」列に電脳麻将形式で格納
        # (hai3 は無視)
        for i, seat in enumerate(seat_labels):
            raw_hand = hai_parts[i]  # 例: "71,85,48,..."
            if raw_hand:
                nums = [int(x) for x in raw_hand.split(',') if x.strip() != '']
                rec[f'{seat}_手牌'] = pai(nums, hongpai_flag)
            else:
                rec[f'{seat}_手牌'] = None

        # 各 UN (n0～n2) の情報を「東家/南家/西家」として格納
        for i, seat in enumerate(seat_labels):
            rec[seat] = names[i]
            try:
                d = int(dan_list[i])
                rec[f'{seat}_段位'] = RANK_NAMES[d] if 0 <= d < len(RANK_NAMES) else None
            except:
                rec[f'{seat}_段位'] = None

            try:
                rec[f'{seat}_レート'] = int(rate_list[i])
            except:
                rec[f'{seat}_レート'] = None

        records.append(rec)

    return records


def build_dataframe_from_file(file_path):
    data = parse_un_from_file(file_path)
    df = pd.DataFrame(data)
    # '日付' を datetime 型に変換
    if '日付' in df.columns:
        df['日付'] = pd.to_datetime(df['日付'], format='%Y-%m-%d', errors='coerce')
    return df


if __name__ == '__main__':
    xml_file = '../../data/source_data/tenhou_source_data/tenhou_haifu/2009/tenhou_haifu_2009_02_20_17_10_72.xml'
    df = build_dataframe_from_file(xml_file)

    # 全列表示の設定
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 200)

    display(df.head())



Unnamed: 0,ファイル名,日付,時間,卓情報,局,本場,供託,サイコロ,ドラ表示牌,親,東家_点数,南家_点数,西家_点数,東家_手牌,南家_手牌,西家_手牌,東家,東家_段位,東家_レート,南家,南家_段位,南家_レート,西家,西家_段位,西家_レート
0,tenhou_haifu_2009_02_20_17_10_72.xml,2009-02-20,17:10,三鳳南喰赤,東一局,0,0,"[5, 4]",z1,東家,35000,35000,35000,p4s14406789z1357,m99p223067s129z55,p3458s34778z2246,僕、おおくぽん,八段,2182,俊ころ,七段,2049,楠下幾太郎,七段,2097


開始情報の読み込み(完成版)

In [20]:
import sys
import re
import xml.etree.ElementTree as ET
import pandas as pd
from urllib.parse import unquote

# 段位の数値から名称へのマッピング (0: 新人 ～ 20: 天鳳位)
RANK_NAMES = [
    '新人','9級','8級','7級','6級','5級','4級','3級','2級','1級',
    '初段','二段','三段','四段','五段','六段','七段','八段','九段','十段',
    '天鳳位'
]

# 局順のマッピング (拡張版)
ROUND_MAP = {
    0: '東一局', 1: '東二局', 2: '東三局', 3: '東四局',
    4: '南一局', 5: '南二局', 6: '南三局', 7: '南四局',
    8: '西一局', 9: '西二局', 10: '西三局', 11: '西四局',
    12: '北一局', 13: '北二局', 14: '北三局', 15: '北四局'
}

# GO タグの type 属性ビットフラグ解析
def parse_table_type(type_val: int) -> (str, bool):
    flags = {
        'demo':     not (type_val & 0x0001),
        'hongpai':  not (type_val & 0x0002),
        'ariari':   not (type_val & 0x0004),
        'dongfeng': not (type_val & 0x0008),
        'sanma':    bool(type_val & 0x0010),
        'soku':     bool(type_val & 0x0040)
    }

    if not flags['sanma']:
        print("四麻は読み込めません。処理を中止します。")
        sys.exit(1)

    level = ((type_val & 0x0020) >> 4) | ((type_val & 0x0080) >> 7)
    parts = ['三', ['般','上','特','鳳'][level], '東' if flags['dongfeng'] else '南']
    if flags['ariari']:
        parts.append('喰')
    if flags['hongpai']:
        parts.append('赤')
    if flags['soku']:
        parts.append('速')

    return ''.join(parts), flags['hongpai']

# 牌番号を電脳麻将形式に変換
def pai(pai_nums: list[int], hongpai: bool) -> str:
    if not pai_nums:
        return None

    parts = []
    sorted_nums = sorted(pai_nums)
    suit_labels = ['m', 'p', 's', 'z']
    prev_suit = None

    for n in sorted_nums:
        suit = suit_labels[n // 36]
        rank = (n % 36) // 4 + 1
        if hongpai and suit != 'z' and rank == 5 and (n % 4 == 0):
            rank = 0
        if suit != prev_suit:
            parts.append(suit)
            prev_suit = suit
        parts.append(str(rank))

    return ''.join(parts)

# 局名を取得
def round_name_from_seed(seed_val: int) -> str:
    return ROUND_MAP.get(seed_val, f'局{seed_val}')

# XMLを解析して全局のデータを取得
def parse_un_from_file(file_path):
    file_name = file_path.split('/')[-1]
    m = re.match(
        r"^tenhou_haifu_(\d{4})_(\d{2})_(\d{2})_(\d{2})_(\d{2})_\d{2}\.xml$",
        file_name
    )
    date_str = f"{m.group(1)}-{m.group(2)}-{m.group(3)}" if m else None
    time_str = f"{m.group(4)}:{m.group(5)}" if m else None

    tree = ET.parse(file_path)
    root = tree.getroot()

    # GOタグ解析
    go_elem = root.find('.//GO')
    if go_elem is not None and 'type' in go_elem.attrib:
        try:
            table_type_str, hongpai_flag = parse_table_type(int(go_elem.attrib['type']))
        except ValueError:
            table_type_str, hongpai_flag = None, False
    else:
        table_type_str, hongpai_flag = None, False

    # TAIKYOKUタグ解析
    taikyoku_elem = root.find('.//TAIKYOKU')
    if taikyoku_elem is not None and 'oya' in taikyoku_elem.attrib:
        try:
            oya_idx = int(taikyoku_elem.attrib['oya'])
        except ValueError:
            oya_idx = None
    else:
        oya_idx = None

    records = []
    seat_labels = ['東家', '南家', '西家']

    # 全ての INIT タグをループ
    for init_elem in root.findall('.//INIT'):
        # seed
        seed_parts = init_elem.attrib.get('seed', '').split(',')
        try:
            seed_ints = [int(x) for x in seed_parts]
        except:
            seed_ints = []

        round_val = seed_ints[0] if len(seed_ints) > 0 else None
        round_name = round_name_from_seed(round_val)

        honba = seed_ints[1] if len(seed_ints) > 1 else None
        kyoutaku = seed_ints[2] if len(seed_ints) > 2 else None
        dice_list = [seed_ints[3]+1, seed_ints[4]+1] if len(seed_ints) > 4 else None
        dorapawn_num = seed_ints[5] if len(seed_ints) > 5 else None
        dora_pai_str = pai([dorapawn_num], hongpai_flag) if dorapawn_num is not None else None

        # INIT 内の oya
        try:
            oya_init = int(init_elem.attrib.get('oya', -1))
        except:
            oya_init = None
        parent_label = seat_labels[oya_init] if oya_init is not None and 0 <= oya_init < 3 else None

        # ten
        ten_parts = init_elem.attrib.get('ten', '').split(',')
        try:
            ten_ints = [int(x) for x in ten_parts]
        except:
            ten_ints = []

        # hai0～hai2
        hai_parts = [init_elem.attrib.get(f'hai{i}', '') for i in range(3)]

        # UNタグごとにプレイヤー情報を取得
        for un in root.findall('.//UN'):
            attrib = un.attrib
            names = [unquote(attrib.get(f'n{i}', '')) or None for i in range(3)]
            dan_list = attrib.get('dan', '').split(',')
            rate_list = attrib.get('rate', '').split(',')

            rec = {
                'ファイル名': file_name,
                '日付': date_str,
                '時間': time_str,
                '卓情報': table_type_str,
                '局': round_name,
                '本場': honba,
                '供託': kyoutaku,
                'サイコロ': dice_list,
                'ドラ表示牌': dora_pai_str,
                '親': parent_label
            }

            for i, seat in enumerate(seat_labels):
                try:
                    rec[f'{seat}_点数'] = ten_ints[i] * 100
                except:
                    rec[f'{seat}_点数'] = None

                raw_hand = hai_parts[i]
                if raw_hand:
                    nums = [int(x) for x in raw_hand.split(',') if x.strip() != '']
                    rec[f'{seat}_手牌'] = pai(nums, hongpai_flag)
                else:
                    rec[f'{seat}_手牌'] = None

                rec[seat] = names[i]
                try:
                    d = int(dan_list[i])
                    rec[f'{seat}_段位'] = RANK_NAMES[d] if 0 <= d < len(RANK_NAMES) else None
                except:
                    rec[f'{seat}_段位'] = None
                try:
                    rec[f'{seat}_レート'] = int(rate_list[i])
                except:
                    rec[f'{seat}_レート'] = None

            records.append(rec)

    return records

def build_dataframe_from_file(file_path):
    data = parse_un_from_file(file_path)
    df = pd.DataFrame(data)
    if '日付' in df.columns:
        df['日付'] = pd.to_datetime(df['日付'], format='%Y-%m-%d', errors='coerce')
    return df

if __name__ == '__main__':
    xml_file = '../../data/source_data/tenhou_source_data/tenhou_haifu/2009/tenhou_haifu_2009_02_20_17_10_72.xml'
    df = build_dataframe_from_file(xml_file)

    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 200)

    display(df.head(20))  # 20行表示して局の連続を確認

Unnamed: 0,ファイル名,日付,時間,卓情報,局,本場,供託,サイコロ,ドラ表示牌,親,東家_点数,東家_手牌,東家,東家_段位,東家_レート,南家_点数,南家_手牌,南家,南家_段位,南家_レート,西家_点数,西家_手牌,西家,西家_段位,西家_レート
0,tenhou_haifu_2009_02_20_17_10_72.xml,2009-02-20,17:10,三鳳南喰赤,東一局,0,0,"[5, 4]",z1,東家,35000,p4s14406789z1357,僕、おおくぽん,八段,2182,35000,m99p223067s129z55,俊ころ,七段,2049,35000,p3458s34778z2246,楠下幾太郎,七段,2097
1,tenhou_haifu_2009_02_20_17_10_72.xml,2009-02-20,17:10,三鳳南喰赤,東二局,0,0,"[6, 2]",s6,南家,29000,m19p1123305s37z13,僕、おおくぽん,八段,2182,32000,m119p135s11239z46,俊ころ,七段,2049,44000,p3677789s4059z57,楠下幾太郎,七段,2097
2,tenhou_haifu_2009_02_20_17_10_72.xml,2009-02-20,17:10,三鳳南喰赤,東二局,1,0,"[3, 6]",s2,南家,26400,m9p35s22456z46667,僕、おおくぽん,八段,2182,37200,m9p4778s1479z2235,俊ころ,七段,2049,41400,m1p158s335z114446,楠下幾太郎,七段,2097
3,tenhou_haifu_2009_02_20_17_10_72.xml,2009-02-20,17:10,三鳳南喰赤,東二局,2,0,"[4, 1]",z5,南家,26400,p1335678s23678z7,僕、おおくぽん,八段,2182,45100,m9p1789s124678z25,俊ころ,七段,2049,33500,m1p244s14089z3346,楠下幾太郎,七段,2097
4,tenhou_haifu_2009_02_20_17_10_72.xml,2009-02-20,17:10,三鳳南喰赤,東三局,0,0,"[2, 3]",s7,西家,23200,p14579s4577z1225,僕、おおくぽん,八段,2182,38900,m99p45667s58z1467,俊ころ,七段,2049,42900,m1p37s23466z33477,楠下幾太郎,七段,2097
5,tenhou_haifu_2009_02_20_17_10_72.xml,2009-02-20,17:10,三鳳南喰赤,南一局,0,0,"[4, 5]",z5,東家,23200,m1p233346s6z11267,僕、おおくぽん,八段,2182,46600,m119p568s255778z7,俊ころ,七段,2049,35200,m9p26999s2349z247,楠下幾太郎,七段,2097
6,tenhou_haifu_2009_02_20_17_10_72.xml,2009-02-20,17:10,三鳳南喰赤,南二局,0,0,"[3, 4]",p3,南家,20600,p347s34899z23347,僕、おおくぽん,八段,2182,45300,m1p123558s22345z4,俊ころ,七段,2049,39100,m9p1220899s1677z1,楠下幾太郎,七段,2097
7,tenhou_haifu_2009_02_20_17_10_72.xml,2009-02-20,17:10,三鳳南喰赤,南三局,0,0,"[3, 6]",z6,西家,29600,m9p33447s4678z367,僕、おおくぽん,八段,2182,36300,m1p22678s2457z146,俊ころ,七段,2049,39100,m1p16799s37999z25,楠下幾太郎,七段,2097


ログ取得試験用

形式は一定と仮定。
槓とドラめくりの数が同じになるよう確認。
(加カンとドラめくり)のセットで無意味なものが挿入される可能性があり、既存のドラ表示と同じ場合は、どちらも削除する必要がある、統合プロセスで実装する予定。

In [22]:
import xml.etree.ElementTree as ET
import pandas as pd

# ——— ビットマスク定義 ———
MASK_WHO       = 0x0003
MASK_IS_RUN    = 0x0004
MASK_IS_KO     = 0x0008
MASK_IS_KA     = 0x0010
MASK_ADJ1      = 0x0018
MASK_ADJ2      = 0x0060
MASK_ADJ3      = 0x0180
MASK_RUN_PATT  = 0xFC00
MASK_KO_PATT   = 0xFE00
MASK_KAN_PATT  = 0xFF00

WHO_NAME = {
    1: "下家",
    2: "対面",
    3: "上家"
}

SUIT_LABEL = ['m', 'p', 's', 'z']

# ——— ツモ・捨てを電脳麻将形式に変換する関数 ———
def pai(pai_nums: list[int], hongpai: bool) -> str:
    if not pai_nums:
        return None
    parts = []
    sorted_nums = sorted(pai_nums)
    prev_suit = None
    for n in sorted_nums:
        suit = SUIT_LABEL[n // 36]
        rank = (n % 36) // 4 + 1
        if hongpai and suit != 'z' and rank == 5 and (n % 4 == 0):
            rank = 0
        if suit != prev_suit:
            parts.append(suit)
            prev_suit = suit
        parts.append(str(rank))
    return ''.join(parts)

# ——— 順子解析（チー） ———
def parse_shuntu(m: int, hongpai: bool) -> str:
    who_idx = m & MASK_WHO
    p = (m & MASK_RUN_PATT) >> 10
    r_mod = p % 3
    p_base = p // 3
    suit_idx = p_base // 7
    n_base = (p_base % 7) + 1
    nums = [n_base, n_base + 1, n_base + 2]
    adj1 = (m & MASK_ADJ1) >> 3
    adj2 = (m & MASK_ADJ2) >> 5
    adj3 = (m & MASK_ADJ3) >> 7
    suit_char = SUIT_LABEL[suit_idx]
    result = [None, None, None]
    adjs = [adj1, adj2, adj3]
    for i in range(3):
        n_val = nums[i]
        if hongpai and suit_char != 'z' and n_val == 5 and adjs[i] == 0:
            rank_str = '0'
        else:
            rank_str = str(n_val)
        if i == r_mod and who_idx in WHO_NAME:
            rank_str += WHO_NAME[who_idx]
        result[i] = rank_str
    return suit_char + ''.join(result)

# ———刻子・加槓解析（ポン or 加カン） ———
def parse_koutsu_or_kakan(m: int, hongpai: bool) -> str:
    who_idx = m & MASK_WHO
    is_kakan = (m & MASK_IS_KA) != 0
    p = (m & MASK_KO_PATT) >> 9
    r = p % 3
    p_base = p // 3
    suit_idx = p_base // 9
    n_val = (p_base % 9) + 1
    suit_char = SUIT_LABEL[suit_idx]

    # ── 北抜き判定 ──
    if suit_char == 'z' and n_val == 4:
        return "北抜き"

    pon_mahjong = f"{suit_char}{n_val}"
    pon_str = f"{pon_mahjong}を{WHO_NAME.get(who_idx, '')}からポン"
    if not is_kakan:
        return pon_str

    # 加槓
    kan_str = f"{suit_char}{n_val}を加槓"
    return f"{pon_str}→{kan_str}"

# ——— 暗槓／大明槓解析 ———
def parse_ankan_or_daminkan(m: int, hongpai: bool) -> str:
    p = (m & MASK_KAN_PATT) >> 8
    p_base = p // 4
    suit_idx = p_base // 9
    n_val = (p_base % 9) + 1
    suit_char = SUIT_LABEL[suit_idx]
    who_idx = m & MASK_WHO

    # ── 北抜き判定 ──
    if suit_char == 'z' and n_val == 4:
        return "北抜き"

    mahjong = f"{suit_char}{n_val}"
    # 暗槓
    if (m & MASK_IS_KA) == 0:
        return f"{mahjong}を暗槓"
    # 大明槓
    return f"{mahjong}を{WHO_NAME.get(who_idx, '')}から大明槓"

# ——— 面子コード解析メイン ———
def parse_mianzi(m: int, hongpai: bool) -> str:
    # 順子 (チー)
    if (m & MASK_IS_RUN) != 0:
        return parse_shuntu(m, hongpai)
    # 刻子 or 加槓
    if (m & MASK_IS_KO) != 0 or (m & MASK_IS_KA) != 0:
        return parse_koutsu_or_kakan(m, hongpai)
    # 暗槓／大明槓
    return parse_ankan_or_daminkan(m, hongpai)

# ——— GO タグから赤牌フラグを得る ———
def parse_hongpai_flag(root) -> bool:
    go_elem = root.find('.//GO')
    if go_elem is not None and 'type' in go_elem.attrib:
        type_val = int(go_elem.attrib['type'])
        return not bool(type_val & 0x0002)
    return False

# ——— ログ列だけを抽出して DataFrame にする ———
def build_log_dataframe(file_path: str) -> pd.DataFrame:
    context = ET.iterparse(file_path, events=('start', 'end'))
    _, root = next(context)
    hongpai_flag = parse_hongpai_flag(root)

    logs = []
    pending_kan = 0

    for event, elem in context:
        tag = elem.tag.upper()
        # ツモ／捨て
        if event == 'start' and tag and tag[0] in ['T', 'D', 'U', 'E', 'V', 'F']:
            try:
                n_val = int(tag[1:])
            except:
                pass
            else:
                who = '東家' if tag[0] in ['T', 'D'] else '南家' if tag[0] in ['U', 'E'] else '西家'
                action = 'ツモ' if tag[0] in ['T', 'U', 'V'] else '捨て'
                logs.append(f"{who}:{action} {pai([n_val], hongpai_flag)}")
        # 鳴き
        if event == 'start' and tag == 'N':
            who = ['東家', '南家', '西家', '北家'][int(elem.attrib.get('who', -1))]
            m_code = int(elem.attrib.get('m', 0))
            parsed = parse_mianzi(m_code, hongpai_flag)
            if '槓' in parsed:
                pending_kan += 1
            logs.append(f"{who}:鳴き {parsed}")
        # ドラ表示
        if event == 'start' and tag == 'DORA':
            if pending_kan <= 0:
                raise ValueError(f"Unexpected DORA tag without matching Kan at line {elem.sourceline}")
            hai_str = elem.attrib.get('hai')
            if not hai_str:
                raise ValueError("DORA tag missing 'hai' attribute")
            try:
                hai = int(hai_str)
            except ValueError:
                raise ValueError(f"Invalid hai='{hai_str}' in DORA tag")
            logs.append(f"ドラめくり {pai([hai], hongpai_flag)}")
            pending_kan -= 1
        elem.clear()

    if pending_kan != 0:
        raise ValueError(f"Missing DORA tag for {pending_kan} Kan operation(s)")
    return pd.DataFrame({'ログ': logs})

# ——— 実行例 ———
if __name__ == '__main__':
    file_path = '../../data/source_data/tenhou_source_data/tenhou_haifu/2009/tenhou_haifu_2009_02_20_17_10_72.xml'
    df_logs = build_log_dataframe(file_path)
    display(df_logs)
    #display(df_logs[df_logs['ログ'].str.contains('鳴き', na=False)])
    #display(df_logs[df_logs['ログ'].str.contains('ドラめくり', na=False)])

Unnamed: 0,ログ
0,東家:ツモ p7
1,東家:捨て z3
2,南家:ツモ m9
3,南家:捨て s9
4,西家:ツモ s4
...,...
555,東家:捨て m9
556,南家:ツモ m1
557,南家:捨て m1
558,西家:ツモ p2


AGARIタグの取得ロジック(INITのoyaに対応して正しく反映するように将来的に修正)

In [19]:
import xml.etree.ElementTree as ET
import pandas as pd

# 役名リスト
HUPAI_NAMES = [
    '門前清自摸和', '立直', '一発', '槍槓', '嶺上開花',
    '海底摸月', '河底撈魚', '平和', '断幺九', '一盃口',
    '自風 東', '自風 南', '自風 西', '自風 北', '場風 東',
    '場風 南', '場風 西', '場風 北', '役牌 白', '役牌 發',
    '役牌 中', '両立直', '七対子', '混全帯幺九', '一気通貫',
    '三色同順', '三色同刻', '三槓子', '対々和', '三暗刻',
    '小三元', '混老頭', '二盃口', '純全帯幺九', '混一色',
    '清一色', '', '天和', '地和', '大三元',
    '四暗刻', '四暗刻単騎', '字一色', '緑一色', '清老頭',
    '九蓮宝燈', '純正九蓮宝燈', '国士無双', '国士無双１３面', '大四喜',
    '小四喜', '四槓子', 'ドラ', '裏ドラ', '赤ドラ',
]

# AGARI 要素から和了情報を抽出
def parse_agari(elem: ET.Element, oya: int) -> dict:
    attr = elem.attrib
    who      = int(attr['who'])
    ten      = list(map(int, attr['ten'].split(',')))  # [符, 点数]
    sc_raw   = list(map(int, attr['sc'].split(',')))
    sc       = [sc_raw[i] for i in (1,3,5,7)]
    sc       = sc[oya:] + sc[:oya]

    # 役一覧と翻数
    yaku_list    = [int(x) for x in attr.get('yaku','').split(',') if x]
    yakuman_list = [int(x) for x in attr.get('yakuman','').split(',') if x]
    hupai = []
    fanshu_total = 0
    if yakuman_list:
        # 役満は '*' 表示
        for ym in yakuman_list:
            hupai.append({'name': HUPAI_NAMES[ym], 'fanshu': '*'})
    else:
        for i in range(0, len(yaku_list), 2):
            idx, fu = yaku_list[i], yaku_list[i+1]
            hupai.append({'name': HUPAI_NAMES[idx], 'fanshu': fu})
            fanshu_total += fu

    # 手牌：元の数字リストを電脳麻将形式に変換
    '''
    hai_tiles = attr['hai']  # 例: "71,85,48,..."
    machi     = attr['machi']
    shoupai_str = hai_tiles
    horahai_str = machi
    '''
    
    hai = list(map(int, attr['hai'].split(',')))
    machi = list(map(int, attr['machi'].split(',')))
    hongpai_flag = bool(attr.get('hongpai', False))

    shoupai_str = pai(hai, hongpai_flag)
    horahai_str = pai(machi, hongpai_flag)
    
    # scを100倍に
    sc = [x * 100 for x in sc]
    
    result = {
        'l':        (who + 4 - oya) % 4,
        '手牌':      shoupai_str,
        '和了牌':    horahai_str,
        '和了打点':  ten[1],
        '符':        ten[0],
        '翻数':      fanshu_total if not yakuman_list else '*',
        '局収支':    sc,
        '役一覧':    hupai,
    }
    if yakuman_list:
        result['役満複合数'] = len(yakuman_list)

    return {'hule': result}

# ログ全体を DataFrame に変換
def parse_log_to_dataframe(xml_path: str, oya: int = 0) -> pd.DataFrame:
    tree = ET.parse(xml_path)
    root = tree.getroot()
    records = [parse_agari(ag, oya)['hule'] for ag in root.findall('.//AGARI')]
    return pd.DataFrame(records)

# 使用例
if __name__ == '__main__':
    file_path = '../../data/source_data/tenhou_source_data/tenhou_haifu/2009/tenhou_haifu_2009_02_20_17_10_72.xml'
    df = parse_log_to_dataframe(file_path, oya=1)
    display(df.head())
    display(df['役一覧'].head())

Unnamed: 0,l,手牌,和了牌,和了打点,符,翻数,局収支,役一覧
0,1,p345s23445677z222,s6,9000,30,7,"[-3000, 10000, 0, -6000]","[{'name': '立直', 'fanshu': 1}, {'name': '門前清自摸和..."
1,0,m11p123567s123456,p7,5200,20,4,"[6200, -2600, 0, -2600]","[{'name': '立直', 'fanshu': 1}, {'name': '門前清自摸和..."
2,0,p45667899s789z222,p9,7700,40,3,"[8900, -7900, 0, 0]","[{'name': '立直', 'fanshu': 1}, {'name': '裏ドラ', ..."
3,1,p345s345789z33,s3,9000,40,7,"[-6200, 9400, 0, -3200]","[{'name': '役牌 發', 'fanshu': 1}, {'name': 'ドラ',..."
4,0,m999p455667s99,m9,7700,30,4,"[7700, -7700, 0, 0]","[{'name': '河底撈魚', 'fanshu': 1}, {'name': '役牌 發..."


0    [{'name': '立直', 'fanshu': 1}, {'name': '門前清自摸和...
1    [{'name': '立直', 'fanshu': 1}, {'name': '門前清自摸和...
2    [{'name': '立直', 'fanshu': 1}, {'name': '裏ドラ', ...
3    [{'name': '役牌 發', 'fanshu': 1}, {'name': 'ドラ',...
4    [{'name': '河底撈魚', 'fanshu': 1}, {'name': '役牌 發...
Name: 役一覧, dtype: object

統合したコード

試験用

ログの試験

In [90]:
import os

def build_log_dataframe_with_info(file_path:str)->pd.DataFrame:
    tree=ET.parse(file_path)
    root=tree.getroot()
    hongpai_flag=parse_hongpai_flag(root)
    logs=[]
    pending_kan=0

    seat_labels=['東家','南家','西家']
    current_round=None
    parent_label=None
    player_names=[None,None,None]
    player_scores=[None,None,None]
    file_name=os.path.basename(file_path)  # ファイル名を取得

    for init_elem in root.findall('.//INIT'):
        # 局情報
        seed_parts=init_elem.attrib.get('seed','').split(',')
        round_val=int(seed_parts[0]) if len(seed_parts)>0 else 0
        current_round=round_name_from_seed(round_val)
        try:
            oya_idx=int(init_elem.attrib.get('oya',-1))
            parent_label=seat_labels[oya_idx] if 0<=oya_idx<3 else None
        except:
            parent_label=None
        ten_parts=init_elem.attrib.get('ten','').split(',')
        player_scores=[int(x)*100 for x in ten_parts[:3]] if ten_parts else [None,None,None]

        # UNタグでプレイヤー名を取得
        un_elem=root.find('.//UN')
        if un_elem is not None:
            player_names=[unquote(un_elem.attrib.get(f'n{i}','')) or None for i in range(3)]

        # INIT→ログの順でその局の行動を取得
        for elem in init_elem.itersiblings() if hasattr(init_elem,'itersiblings') else root.iter():
            if elem.tag.upper() in ['INIT','TAIKYOKU']:
                continue
            tag=elem.tag.upper()
            row_base={
                'ファイル名': file_name,
                '局':current_round,
                '親':parent_label,
                '東家':player_names[0],
                '南家':player_names[1],
                '西家':player_names[2],
                '東家_点数':player_scores[0],
                '南家_点数':player_scores[1],
                '西家_点数':player_scores[2],
            }
            if tag[0] in ['T','D','U','E','V','F']:
                try:
                    n_val=int(tag[1:])
                except:
                    continue
                who='東家' if tag[0] in ['T','D'] else '南家' if tag[0] in ['U','E'] else '西家'
                action='ツモ' if tag[0] in ['T','U','V'] else '捨て'
                row_base['ログ'] = f"{who}:{action} {pai([n_val],hongpai_flag)}"
                logs.append(row_base)
            elif tag=='N':
                try:
                    who_idx=int(elem.attrib.get('who',-1))
                    who=['東家','南家','西家','北家'][who_idx]
                except:
                    who='Unknown'
                m_code=int(elem.attrib.get('m',0))
                parsed=parse_mianzi(m_code,hongpai_flag)
                if parsed:
                    if '槓' in parsed:
                        pending_kan+=1
                    row_base['ログ'] = f"{who}:鳴き {parsed}"
                    logs.append(row_base)
            elif tag=='DORA' and pending_kan>0:
                try:
                    hai=int(elem.attrib.get('hai'))
                    row_base['ログ'] = f"ドラめくり {pai([hai],hongpai_flag)}"
                    logs.append(row_base)
                    pending_kan-=1
                except:
                    continue

    return pd.DataFrame(logs)

# ——— 実行例 ———
if __name__=='__main__':
    file_path='../../data/source_data/tenhou_source_data/tenhou_haifu/2009/tenhou_haifu_2009_02_20_17_10_72.xml'
    df_logs=build_log_dataframe_with_info(file_path)
    pd.set_option('display.max_columns',None)
    pd.set_option('display.width',200)
    display(df_logs.head(562))


Unnamed: 0,ファイル名,局,親,東家,南家,西家,東家_点数,南家_点数,西家_点数,ログ
0,tenhou_haifu_2009_02_20_17_10_72.xml,東一局,東家,僕、おおくぽん,俊ころ,楠下幾太郎,35000,35000,35000,東家:ツモ 62
1,tenhou_haifu_2009_02_20_17_10_72.xml,東一局,東家,僕、おおくぽん,俊ころ,楠下幾太郎,35000,35000,35000,東家:捨て 116
2,tenhou_haifu_2009_02_20_17_10_72.xml,東一局,東家,僕、おおくぽん,俊ころ,楠下幾太郎,35000,35000,35000,南家:ツモ 35
3,tenhou_haifu_2009_02_20_17_10_72.xml,東一局,東家,僕、おおくぽん,俊ころ,楠下幾太郎,35000,35000,35000,南家:捨て 105
4,tenhou_haifu_2009_02_20_17_10_72.xml,東一局,東家,僕、おおくぽん,俊ころ,楠下幾太郎,35000,35000,35000,西家:ツモ 86
...,...,...,...,...,...,...,...,...,...,...
557,tenhou_haifu_2009_02_20_17_10_72.xml,東一局,東家,僕、おおくぽん,俊ころ,楠下幾太郎,35000,35000,35000,西家:捨て 43
558,tenhou_haifu_2009_02_20_17_10_72.xml,東二局,南家,僕、おおくぽん,俊ころ,楠下幾太郎,29000,32000,44000,東家:ツモ 62
559,tenhou_haifu_2009_02_20_17_10_72.xml,東二局,南家,僕、おおくぽん,俊ころ,楠下幾太郎,29000,32000,44000,東家:捨て 116
560,tenhou_haifu_2009_02_20_17_10_72.xml,東二局,南家,僕、おおくぽん,俊ころ,楠下幾太郎,29000,32000,44000,南家:ツモ 35
