# 1. モジュールインポート

In [1]:
import pandas as pd
import glob
import os
import datetime
from tqdm.auto import tqdm
from modules.constants import Master
from modules.constants import LocalPaths
from modules.constants import HorseResultsCols
from modules.constants import ResultsCols
from modules import preparing
from modules import preprocessing
from modules import training
from modules import simulation
from modules import policies
%load_ext autoreload

In [None]:
# Check the interpreter's encoding configuration.
import os
import sys
import locale

print("=== エンコーディング設定確認 ===")
print(f"sys.getdefaultencoding(): {sys.getdefaultencoding()}")
print(f"sys.getfilesystemencoding(): {sys.getfilesystemencoding()}")
print(f"locale.getpreferredencoding(): {locale.getpreferredencoding()}")
# sys.flags.utf8_mode is the interpreter's UTF-8 mode flag, not the environment
# variable itself, so the two values are reported on separate lines.
print(f"sys.flags.utf8_mode: {sys.flags.utf8_mode}")
print(f"PYTHONUTF8 環境変数: {os.environ.get('PYTHONUTF8')}")

標準的な土日競馬開催時の運用スケジュールは、以下の表の通り。

|曜日|時刻|内容|実行する main.ipynb の項番|備考|
|:-:|:--|:--|:--|:--|
|月|||||
|火|||||
|水|16:30過ぎ|先週土日の馬の過去成績ページ確定<BR>（netkeiba.comﾌﾟﾚﾐｱｻｰﾋﾞｽのﾀｲﾑ指数・ﾚｰｽ分析・注目馬 ﾚｰｽ後の短評情報確定）|2. データ取得 ～ 5. シミュレーション|3日間開催の場合も、水曜日|
|木|||||
|金|10:05過ぎ<BR>19:25過ぎ|土曜の出馬表確定<BR>土曜の天候・馬場状態更新|6.1. 前日準備 ～ 6.2. 前日全レース予想（天候・馬場状態は手動設定）<BR>6.1. 前日準備 ～ 6.2. 前日全レース予想||
|土|09:00～17:00<BR>10:05過ぎ<BR>19:25過ぎ| レース時刻<BR>日曜の出馬表確定<BR>日曜の天候・馬場状態更新|6.3. レース直前データ処理（当日レース予想）<BR>6.1. 前日準備 ～ 6.2. 前日全レース予想（天候・馬場状態は手動設定）<BR>6.1. 前日準備 ～ 6.2. 前日全レース予想||
|日|09:00～17:00|レース時刻|6.3. レース直前データ処理（当日レース予想）||

# 2. データ取得

## 2.1. レースID取得
例として、2025年12月のレースデータを取得する場合を考える。

In [None]:
%autoreload

In [None]:
# Fetch the race-meeting dates. Note that the month given as to_ is not included.
kaisai_date_2025 = preparing.scrape_kaisai_date(from_="2025-12-01", to_="2026-01-01")
len(kaisai_date_2025)

In [None]:
# Get the race IDs for the meeting dates.
race_id_list = preparing.scrape_race_id_list(kaisai_date_2025)
len(race_id_list)

## 2.2. /race/ディレクトリのデータ取得

In [None]:
# Scrape the https://db.netkeiba.com/race/ pages and save them as html (.bin) files.
html_files_race = preparing.scrape_html_race(race_id_list, skip=False)
html_files_race[:5]

In [None]:
# Build the list of html (.bin) files saved under data/html/race/.
import glob
import os

# Race-HTML directory as configured in LocalPaths.
race_html_dir = LocalPaths.HTML_RACE_DIR
print(f"レースHTMLディレクトリ: {race_html_dir}")

# glob makes no promise about the order of its results; sort them so the list
# (and its first / last entries) is the same on every run and platform.
html_files_race = sorted(glob.glob(os.path.join(race_html_dir, "*.bin")))
print(f"見つかったHTMLファイル数: {len(html_files_race)}")

# Show the first five files.
html_files_race[:5]

In [None]:
# Sanity-check the html_files_race variable.
import os

print("html_files_race変数の状態:")
print(f"  タイプ: {type(html_files_race)}")
print(f"  サイズ: {len(html_files_race)}")
if html_files_race:
    # The file names are computed outside the f-string: a backslash inside an
    # f-string expression is a SyntaxError before Python 3.12. os.path.basename
    # also follows the platform's path rules instead of hard-coding '\\'.
    first_name = os.path.basename(html_files_race[0])
    last_name = os.path.basename(html_files_race[-1])
    print(f"  範囲: {first_name} ～ {last_name}")
else:
    # An empty list has no first / last element to report.
    print("  範囲: (ファイルなし)")

# With this confirmed, get_rawdata_results can be run in the next cell.

In [None]:
results_new = preparing.get_rawdata_results(html_files_race) # build the race results table
race_info_new = preparing.get_rawdata_info(html_files_race) # build the race info table
return_tables_new = preparing.get_rawdata_return(html_files_race) # build the payout (return) table

In [None]:
# Update the stored tables. If a table does not exist yet, it is newly created.
preparing.update_rawdata(filepath=LocalPaths.RAW_RESULTS_PATH, new_df=results_new)
preparing.update_rawdata(filepath=LocalPaths.RAW_RACE_INFO_PATH, new_df=race_info_new)
preparing.update_rawdata(filepath=LocalPaths.RAW_RETURN_TABLES_PATH, new_df=return_tables_new)

## 2.x. 生成済み raw テーブル確認
`data/raw` に保存された各pickleの基本情報を表示します。存在しない場合はスキップします。

In [None]:
# Overview of the pickle tables under data/raw, plus null-rate statistics.
import os
import pathlib
import pandas as pd
import datetime as dt
from modules.constants import LocalPaths

# Prefer the directory configured in LocalPaths so the cell does not depend on
# the current working directory; fall back to the relative path otherwise.
RAW_DIR = pathlib.Path(getattr(LocalPaths, 'RAW_DIR', 'data/raw'))

# Summary fields reported as 'ERR' when a pickle cannot be loaded or summarised.
_ERR_FIELDS = (
    'rows', 'cols', 'memory_MB', 'overall_null_pct',
    'max_col_null_pct', 'mean_col_null_pct', 'median_col_null_pct',
)


def _find_date_range(df):
    """Return 'column:min→max' for the first date-like column that parses, else ''."""
    for dc in df.columns:
        # Column labels are not guaranteed to be strings, so stringify before matching.
        if 'date' not in str(dc).lower():
            continue
        try:
            s = pd.to_datetime(df[dc], errors='coerce')
            if s.notna().any():
                return f"{dc}:{s.min().date()}→{s.max().date()}"
        except Exception:
            # Best effort only: a column that cannot be parsed is skipped.
            continue
    return ''


def _summarize_table(df, file_name):
    """Summarise one loaded raw table.

    Returns a tuple of (summary fields for the overview row, rows giving the
    null percentage of the ten columns with the highest null rate).
    """
    info = {
        'rows': len(df),
        'cols': df.shape[1],
        'memory_MB': round(df.memory_usage(deep=True).sum() / 1_000_000, 3),
        # Sample of the leading column labels (at most five).
        'sample_cols': ', '.join(map(str, df.columns[:5])),
        'date_range': _find_date_range(df),
    }
    # Count nulls once and reuse the counts for the overall and per-column rates.
    null_counts = df.isna().sum()
    total_cells = df.size
    info['overall_null_pct'] = (
        round((null_counts.sum() / total_cells) * 100, 2) if total_cells else 0.0
    )
    col_null_pct = (null_counts / len(df) * 100).sort_values(ascending=False)
    null_rows = [
        {'file': file_name, 'column': col, 'null_pct': round(pct, 2)}
        for col, pct in col_null_pct.head(10).items()
    ]
    # Per-column statistics (max / mean / median).
    has_cols = not col_null_pct.empty
    info['max_col_null_pct'] = round(col_null_pct.iloc[0], 2) if has_cols else 0.0
    info['mean_col_null_pct'] = round(col_null_pct.mean(), 2) if has_cols else 0.0
    info['median_col_null_pct'] = round(col_null_pct.median(), 2) if has_cols else 0.0
    return info, null_rows


if not RAW_DIR.exists():
    print(f'ディレクトリが存在しません: {RAW_DIR.resolve()}')
else:
    pickle_files = sorted(RAW_DIR.glob('*.pickle'))
    if not pickle_files:
        print('pickleファイルが見つかりません。先に取得処理を実行してください。')
    else:
        summaries = []
        null_detail_rows = []  # per-column null-rate details
        rows_by_path = {}  # resolved path -> row count, reused for the main-path check
        for p in pickle_files:
            info = {
                'file': p.name,
                'size_MB': round(p.stat().st_size / 1_000_000, 3)
            }
            try:
                df = pd.read_pickle(p)
                table_info, null_rows = _summarize_table(df, p.name)
            except Exception as e:
                # Keep going: a broken file becomes an 'ERR' row instead of aborting the overview.
                info.update(dict.fromkeys(_ERR_FIELDS, 'ERR'))
                info['sample_cols'] = f'load error: {e.__class__.__name__}'
                info['date_range'] = ''
            else:
                info.update(table_info)
                null_detail_rows.extend(null_rows)
                rows_by_path[p.resolve()] = table_info['rows']
            summaries.append(info)
        summary_df = pd.DataFrame(summaries)
        # Fix the display order of the columns.
        summary_cols_order = [
            'file','rows','cols','size_MB','memory_MB','overall_null_pct',
            'max_col_null_pct','mean_col_null_pct','median_col_null_pct',
            'sample_cols','date_range'
        ]
        summary_df = summary_df[summary_cols_order]
        display(summary_df)

        if null_detail_rows:
            null_detail_df = pd.DataFrame(null_detail_rows)
            display(null_detail_df)

        # Existence and row count of the files the main LocalPaths entries point to
        # (entries that do not exist are reported as well).
        main_paths = {
            name: getattr(LocalPaths, name, None)
            for name in (
                'RAW_RESULTS_PATH',
                'RAW_RACE_INFO_PATH',
                'RAW_RETURN_TABLES_PATH',
                'RAW_HORSE_INFO_PATH',
                'RAW_HORSE_RESULTS_PATH',
                'RAW_PEDS_PATH',
            )
        }
        path_rows = []
        for key, path in main_paths.items():
            if path is None:
                path_rows.append({'name': key, 'path': None, 'exists': False, 'rows': None})
                continue
            exists = os.path.isfile(path)
            rows = None
            if exists:
                # Reuse the count from the summary loop when the same file was already
                # loaded there, so large pickles are not read a second time.
                rows = rows_by_path.get(pathlib.Path(path).resolve())
                if rows is None:
                    try:
                        rows = len(pd.read_pickle(path))
                    except Exception:
                        rows = 'ERR'
            path_rows.append({'name': key, 'path': path, 'exists': exists, 'rows': rows})
        display(pd.DataFrame(path_rows))

In [None]:
# Load the existing results data and derive a horse_id list for testing.
results_new = pd.read_pickle(LocalPaths.RAW_RESULTS_PATH)
print(f"results_new loaded: {results_new.shape}")

# Build the test list from the first 10 unique horse_ids.
horse_id_list = results_new['horse_id'].unique()
horse_id_test_list = horse_id_list[:10]
print(f"テスト用horse_id: {horse_id_test_list}")

## 2.3. /horse/ディレクトリのデータ取得

In [None]:
%autoreload

In [None]:
horse_id_list = results_new['horse_id'].unique()
# Scrape the html pages.
# Set skip=True to skip horses that have already been scraped.
# Set skip=False to refresh already-scraped horses with the results of their newer races.
html_files_horse = preparing.scrape_html_horse_with_master(
    horse_id_list, skip=True
    )

In [None]:
#追加で新たにスクレイピングされた数
len(html_files_horse)

In [None]:
### Run this cell to collect the paths of already-saved html files without running the scrape functions ###

target_date = '2025-09-20'  # date on which the scraping was done
# Load the update master.
update_master = pd.read_csv(
    LocalPaths.MASTER_RAW_HORSE_RESULTS_PATH,
    dtype=object
    )
# Keep only the horse_ids scraped on target_date. The mask has its own name so
# that the builtin filter() is not shadowed for the rest of the session.
scraped_on_target = (
    pd.to_datetime(update_master['updated_at']).dt.strftime('%Y-%m-%d') == target_date
)
horse_id_list = update_master.loc[scraped_on_target, 'horse_id']

# Collect the .bin file path of each horse.
html_files_horse = []
missing_horse_ids = []
for horse_id in tqdm(horse_id_list):
    matches = sorted(glob.glob(os.path.join(LocalPaths.HTML_HORSE_DIR, horse_id + '*.bin')))
    if not matches:
        # A horse listed in the master without a saved file is reported below
        # instead of aborting the whole loop with an IndexError.
        missing_horse_ids.append(horse_id)
        continue
    html_files_horse.append(matches[0])
if missing_horse_ids:
    print(f"htmlが見つからないhorse_id: {len(missing_horse_ids)}件 {missing_horse_ids[:10]}")
html_files_horse[:5]

In [None]:
# Build the list of html (.bin) files saved under data/html/horse/.

import glob
import os

horse_html_dir = LocalPaths.HTML_HORSE_DIR
print(f"馬HTMLディレクトリ: {horse_html_dir}")

html_files_horse = glob.glob(os.path.join(horse_html_dir, "*.bin"))
print(f"見つかったHTMLファイル数: {len(html_files_horse)}")

# Show the first five files.
html_files_horse[:5]


In [None]:
# Load horse_info.pickle from the raw-data directory.
try:
    horse_info = pd.read_pickle(os.path.join(LocalPaths.RAW_DIR, 'horse_info.pickle'))
    print(f"horse_info.pickle読み込み完了: {len(horse_info)}頭の馬データ")
except Exception as e:
    # Any failure is reported and leaves horse_info set to None.
    print(f"horse_info.pickleの読み込みエラー: {e}")
    horse_info = None

In [None]:
# Update the horse basic-info table.
# horse_info is set to None when loading it failed, so skip the update in that
# case rather than passing None on as the new data.
if horse_info is None:
    print("horse_info が読み込まれていないため、更新をスキップします。")
else:
    preparing.update_rawdata(LocalPaths.RAW_HORSE_INFO_PATH, horse_info)

In [None]:
# 馬の過去成績テーブルの作成
horse_results_new = preparing.get_rawdata_horse_results(html_files_horse)

In [None]:
# テーブルの更新
preparing.update_rawdata(LocalPaths.RAW_HORSE_RESULTS_PATH, horse_results_new)

In [None]:
display(horse_results_new)

### 2.3.2 特定期間の再スクレイピング

In [2]:
import pandas as pd

from modules import preparing
from modules.constants import LocalPaths

# 1) Target period: START is inclusive, END is exclusive.
START = "2026-01-01"
END   = "2026-03-01"

start_ts = pd.Timestamp(START)
end_ts = pd.Timestamp(END)
if not start_ts < end_ts:
    raise ValueError(f"START must be earlier than END: {START!r} / {END!r}")

# 2) Meeting dates (yyyymmdd) -> race IDs.
# Per the original note, scrape_kaisai_date takes yyyy-mm values and does not
# include the to_ month. Both months are derived from START / END so the request
# cannot drift from the period above: to_ is the month after the last included day.
from_month = start_ts.strftime("%Y-%m")
last_included_day = end_ts - pd.Timedelta(days=1)
to_month = (last_included_day.to_period("M") + 1).strftime("%Y-%m")
kaisai_date_list = preparing.scrape_kaisai_date(from_month, to_month)
# Trim to the exact period (plain string comparison works for yyyymmdd).
start_yyyymmdd = start_ts.strftime("%Y%m%d")
end_yyyymmdd   = end_ts.strftime("%Y%m%d")
kaisai_date_list = [d for d in kaisai_date_list if start_yyyymmdd <= d < end_yyyymmdd]

race_id_list = preparing.scrape_race_id_list(kaisai_date_list)
race_id_list = sorted(set(race_id_list))
print("race_id count:", len(race_id_list))

# 3) Re-fetch the race HTML (skip=False overwrites the saved files).
race_html_paths = preparing.scrape_html_race(race_id_list, skip=False)
print("race html updated:", len(race_html_paths))

# 4) Race results -> IDs of the horses that ran.
results_new = preparing.get_rawdata_results(race_html_paths)
horse_id_list = sorted(set(results_new["horse_id"].dropna().astype(str)))
print("horse_id count:", len(horse_id_list))

# (Optional but recommended) update the race-level raw tables together so they stay consistent.
race_info_new = preparing.get_rawdata_info(race_html_paths)
return_new = preparing.get_rawdata_return(race_html_paths)

preparing.update_rawdata(LocalPaths.RAW_RESULTS_PATH, results_new, mode="replace")
preparing.update_rawdata(LocalPaths.RAW_RACE_INFO_PATH, race_info_new, mode="replace")
preparing.update_rawdata(LocalPaths.RAW_RETURN_TABLES_PATH, return_new, mode="replace")

# 5) Re-fetch the horse pages and update the updated-at master as well.
horse_html_paths = preparing.scrape_html_horse_with_master(horse_id_list, skip=False)
print("horse html updated:", len(horse_html_paths))

# 6) Build horse_info / horse_results and update the raw pickles.
horse_info_new = preparing.get_rawdata_horse_info(horse_html_paths)
horse_results_new = preparing.get_rawdata_horse_results(horse_html_paths)

preparing.update_rawdata(LocalPaths.RAW_HORSE_INFO_PATH, horse_info_new, mode="replace")
preparing.update_rawdata(LocalPaths.RAW_HORSE_RESULTS_PATH, horse_results_new, mode="replace")

print("done")

getting race date from 2026-01 to 2026-03
Date range created: 2 months to process


  0%|          | 0/2 [00:00<?, ?it/s]

getting race_id_list


  0%|          | 0/13 [00:00<?, ?it/s]

scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260104
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260105
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260110
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260111
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260112
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260117
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260118
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260124
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260125
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260131
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260201
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260207
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20260208
race_id coun

  0%|          | 0/384 [00:00<?, ?it/s]

race_id 202605010301 skipped. This page is not valid (blocked or not a result page).
race_id 202605010302 skipped. This page is not valid (blocked or not a result page).
race_id 202605010303 skipped. This page is not valid (blocked or not a result page).
race_id 202605010304 skipped. This page is not valid (blocked or not a result page).
race_id 202605010305 skipped. This page is not valid (blocked or not a result page).
race_id 202605010306 skipped. This page is not valid (blocked or not a result page).
race_id 202605010307 skipped. This page is not valid (blocked or not a result page).
race_id 202605010308 skipped. This page is not valid (blocked or not a result page).
race_id 202605010309 skipped. This page is not valid (blocked or not a result page).
race_id 202605010310 skipped. This page is not valid (blocked or not a result page).
race_id 202605010311 skipped. This page is not valid (blocked or not a result page).
race_id 202605010312 skipped. This page is not valid (blocked or 

  0%|          | 0/312 [00:00<?, ?it/s]

horse_id count: 3832
preparing raw race_info table


  0%|          | 0/312 [00:00<?, ?it/s]

preparing raw return table


  0%|          | 0/312 [00:00<?, ?it/s]

置換モード: 保持 5924059 + 置換 4483 = 合計 5928542 レコード
データ更新完了: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\raw\results.pickle
置換モード: 保持 38002 + 置換 312 = 合計 38314 レコード
データ更新完了: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\raw\race_info.pickle
置換モード: 保持 1865201 + 置換 2486 = 合計 1867687 レコード
データ更新完了: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\raw\return_tables.pickle
scraping


  0%|          | 0/3832 [00:00<?, ?it/s]

updating master
horse html updated: 3832
preparing raw horse_info table


  0%|          | 0/3832 [00:00<?, ?it/s]

preparing raw horse_results table


  0%|          | 0/3832 [00:00<?, ?it/s]

置換モード: 保持 17980 + 置換 3832 = 合計 21812 レコード
データ更新完了: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\raw\horse_info.pickle
置換モード: 保持 44392372 + 置換 38383 = 合計 44430755 レコード
データ更新完了: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\raw\horse_results.pickle
done


## 2.4. /ped/ディレクトリのデータ取得

In [3]:
html_files_peds = preparing.scrape_html_ped(horse_id_list, skip=True) #htmlをスクレイピング

  0%|          | 0/3832 [00:00<?, ?it/s]

horse_id 2015104699 skipped
horse_id 2016100126 skipped
horse_id 2016100158 skipped
horse_id 2016102175 skipped
horse_id 2016102767 skipped
horse_id 2016104152 skipped
horse_id 2016104668 skipped
horse_id 2016105797 skipped
horse_id 2017100464 skipped
horse_id 2017100583 skipped
horse_id 2017100655 skipped
horse_id 2017101423 skipped
horse_id 2017101560 skipped
horse_id 2017103184 skipped
horse_id 2017104238 skipped
horse_id 2017104266 skipped
horse_id 2017105433 skipped
horse_id 2017105634 skipped
horse_id 2017105822 skipped
horse_id 2017106153 skipped
horse_id 2017106372 skipped
horse_id 2017106642 skipped
horse_id 2017106722 skipped
horse_id 2017110086 skipped
horse_id 2017110138 skipped
horse_id 2018100976 skipped
horse_id 2018101010 skipped
horse_id 2018101166 skipped
horse_id 2018101243 skipped
horse_id 2018101343 skipped
horse_id 2018101615 skipped
horse_id 2018101989 skipped
horse_id 2018102020 skipped
horse_id 2018102040 skipped
horse_id 2018102348 skipped
horse_id 2018102394 

In [4]:
peds_new = preparing.get_rawdata_peds(html_files_peds) # build the pedigree table
preparing.update_rawdata(LocalPaths.RAW_PEDS_PATH, peds_new) # update the stored table

preparing raw peds table


0it [00:00, ?it/s]

preparing update raw data empty


Unnamed: 0,peds_0,peds_1,peds_2,peds_3,peds_4,peds_5,peds_6,peds_7,peds_8,peds_9,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
2011106610,1999100226,000a000082,000a0012cb,000a000f2b,000a001042,000a0078a6,000a007c38,000a000f87,000a007d0b,000a00877c,...,000a008e05,000a000ded,000a008e04,000a00ae88,000a000e32,000a000e03,000a007ccf,000a007259,000a000de4,000a007b7d
2012100683,2003104570,000a00033a,000a0012bf,000a000f2b,000a001042,000a0078a6,000a007459,000a0013f0,000a0072a0,000a008c1e,...,000a0082aa,000a001383,000a007cea,1982105011,000a000081,000a000e03,000a007054,1955101622,000a00026d,000a0031be
2012103532,1994108729,000a00033a,000a0012bf,000a000f2b,000a001042,000a0078a6,000a007459,000a0013f0,000a0072a0,000a008c1e,...,000a009232,000a001bd8,000a009231,1990109129,000a001b87,000a0000d3,000a00909c,000a006409,000a000e0e,000a007ca0
2012104463,2003102205,000a000d77,000a00185d,000a000e04,000a000f8c,000a00702e,000a008892,000a001183,000a0081e9,000a009851,...,000a008eb9,000a000dda,000a0216c4,000a00fa9e,000a0016db,000a000e04,000a00836d,000a00fa9f,000a000db7,000a00faa0
2012104504,1999106689,000a00033a,000a0012bf,000a000f2b,000a001042,000a0078a6,000a007459,000a0013f0,000a0072a0,000a008c1e,...,000a009de9,000a0012be,000a0086f3,000a00a2b0,000a00193e,000a0010e2,000a007e1d,000a00a2af,000a000ded,000a00a2ae
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013102443,1997101264,000a00033a,000a0012bf,000a000f2b,000a001042,000a0078a6,000a007459,000a0013f0,000a0072a0,000a008c1e,...,000a007459,000a0013f0,000a0072a0,000a008c1e,000a0019b6,000a000e20,000a006c87,000a008c1d,000a0019ba,000a008c1c
2013105424,1994108729,000a00033a,000a0012bf,000a000f2b,000a001042,000a0078a6,000a007459,000a0013f0,000a0072a0,000a008c1e,...,000a00903f,000a000f32,000a008c57,000a0104e0,000a001362,000a000f12,000a00032d,000a0104df,000a000e1d,000a000173
2013104725,000a0109d5,000a001a98,000a0016d4,000a000e04,000a000f8c,000a00702e,000a00834c,000a0016d3,000a00834b,000a008e05,...,000a009821,000a000ded,000a009820,000a00a7d5,000a00188a,000a000da6,000a0088ea,000a009d0d,000a001e5e,000a009d0c
2013105198,2007102923,2000101426,000a00033a,000a0012bf,000a000f2b,000a007459,000a008c1e,000a0019b6,000a008c1d,000a000285,...,000a0083aa,000a000e94,000a007638,000a01058b,000a000efe,000a000e46,000a007535,000a01058a,000a00147c,000a00702c


In [None]:
display(peds_new)

# 3. データ加工

In [5]:
#モジュールを更新した際、notebookに反映させるために使用。
#すでにインポートしてあるモジュールの更新が反映される。
%autoreload

In [2]:
# Preprocessing: one processor per raw table, each given that table's raw pickle path.
results_processor = preprocessing.ResultsProcessor(filepath=LocalPaths.RAW_RESULTS_PATH)
race_info_processor = preprocessing.RaceInfoProcessor(filepath=LocalPaths.RAW_RACE_INFO_PATH)
return_processor = preprocessing.ReturnProcessor(filepath=LocalPaths.RAW_RETURN_TABLES_PATH)
horse_info_processor = preprocessing.HorseInfoProcessor(
    filepath=LocalPaths.RAW_HORSE_INFO_PATH)
horse_results_processor = preprocessing.HorseResultsProcessor(
    filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
peds_processor = preprocessing.PedsProcessor(filepath=LocalPaths.RAW_PEDS_PATH)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[Cols.PRIZE].fillna(0, inplace=True)


In [7]:
# Build the jockey performance features (place rate over the most recent 10 and 50 races).
from modules.constants import LocalPaths
from modules import preprocessing
import os
import pandas as pd

# Create the data/tmp directory.
os.makedirs(LocalPaths.TMP_DIR, exist_ok=True)

# Build the jockey place-rate features from RAW_HORSE_RESULTS and save them.
jockey_stats_processor = preprocessing.JockeyStatsProcessor(filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
jockey_stats = jockey_stats_processor.preprocessed_data
jockey_stats.to_pickle(LocalPaths.JOCKEY_STATS_PATH)
jockey_stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,jockey_id,jockey_plc_rate_10_all,jockey_rides_10_all,jockey_plc_rate_50_all,jockey_rides_50_all,jockey_has_history_flag
date,horse_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007-12-23,2005105894,和田竜二,1.0,10,1.0,50,1
2008-01-06,2005105204,渡辺薫彦,1.0,10,1.0,50,1
2008-01-19,2005105894,角田晃一,1.0,10,1.0,50,1
2008-01-20,2005105204,池添謙一,0.0,10,0.0,50,1
2008-03-01,2005105894,和田竜二,0.0,10,0.0,50,1


In [4]:
# MyKeiba-AI_v2\data\tmp\jockey_stats.pickle のロード

jockey_stats = pd.read_pickle(LocalPaths.JOCKEY_STATS_PATH)
jockey_stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,jockey_id,jockey_plc_rate_10_all,jockey_rides_10_all,jockey_plc_rate_50_all,jockey_rides_50_all,jockey_has_history_flag
date,horse_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007-12-23,2005105894,和田竜二,1.0,10,1.0,50,1
2008-01-06,2005105204,渡辺薫彦,1.0,10,1.0,50,1
2008-01-19,2005105894,角田晃一,1.0,10,1.0,50,1
2008-01-20,2005105204,池添謙一,0.0,10,0.0,50,1
2008-03-01,2005105894,和田竜二,0.0,10,0.0,50,1


馬の過去成績を集計しつつ、前処理済みの全てのテーブルをマージする処理

In [5]:
# Columns treated as "horse performance" when target encoding.
TARGET_COLS = [
        HorseResultsCols.RANK,
        HorseResultsCols.PRIZE,
        HorseResultsCols.RANK_DIFF, 
        'first_corner',
        'final_corner',
        'first_to_rank',
        'first_to_final',
        'final_to_rank',
        'time_seconds'
        ]
# Columns that, together with the horse_id column, are subject to target encoding.
GROUP_COLS = [
        'course_len',
        'race_type',
        HorseResultsCols.PLACE
        ]

data_merger = preprocessing.DataMerger(
        results_processor,
        race_info_processor,
        horse_results_processor,
        horse_info_processor,
        peds_processor,
        target_cols=TARGET_COLS,
        group_cols=GROUP_COLS
)
# Run the merge.
data_merger.merge()

separating horse results by date


  0%|          | 0/1197 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1197 [00:00<?, ?it/s]

In [6]:
# Categorical-variable handling. The chained steps run in the order listed;
# parentheses replace the backslash line continuations.
feature_enginnering = (
    preprocessing.FeatureEngineering(data_merger)
    .add_interval()
    .add_agedays()
    .dumminize_ground_state()
    .dumminize_race_type()
    .dumminize_sex()
    .dumminize_weather()
    .encode_horse_id()
    .encode_jockey_id()
    .encode_trainer_id()
    .encode_owner_id()
    .encode_breeder_id()
    .dumminize_kaisai()
    .dumminize_around()
    .dumminize_race_class()
)

In [7]:
# Save the featured data.
# tmp is the directory for temporary files. It is resolved through
# LocalPaths.TMP_DIR (as the jockey-stats cell does) so the save does not depend
# on the current working directory, and it is created if it does not exist yet.
import os

os.makedirs(LocalPaths.TMP_DIR, exist_ok=True)
feature_enginnering.featured_data.to_pickle(
    os.path.join(LocalPaths.TMP_DIR, 'featured_data_20260205.pickle')
)

# 4. 学習

In [None]:
keiba_ai = training.KeibaAIFactory.create(feature_enginnering.featured_data) # create the model
keiba_ai.train_with_tuning() # train with parameter tuning

In [None]:
#特徴量の重要度
keiba_ai.feature_importance()

In [None]:
#ハイパーパラメータの確認
keiba_ai.get_params()

In [None]:
#チューニングしないで学習
#keiba_ai.train_without_tuning()

In [None]:
# Save the model. The model and the datasets are stored in models/(run date)/(version_name).pickle.
training.KeibaAIFactory.save(keiba_ai, version_name='basemodel_2020_2026')

In [None]:
# Load the model.
keiba_ai = training.KeibaAIFactory.load('models/20260108/basemodel_2020_2026.pickle')
# Re-apply the loaded model's own parameters.
# NOTE(review): why this round trip is needed is not visible here — confirm against KeibaAI.set_params.
keiba_ai.set_params(keiba_ai.get_params())

# 5. シミュレーション

In [None]:
# Lightweight setup cell for running only chapter 5 (simulation).
# After a kernel restart it prepares what chapter 5 needs without re-running
# chapters 3 and 4: the trained model (the one retrained with UMABAN excluded
# from the features) and the return table.
import pandas as pd
from tqdm.auto import tqdm

from modules.constants import LocalPaths
from modules import policies, preprocessing, simulation, training

# Preprocess only the return table (Simulator refers to it).
return_processor = preprocessing.ReturnProcessor(filepath=LocalPaths.RAW_RETURN_TABLES_PATH)

# Load the saved model the same way the model-load cell in chapter 4 does, but
# only when no model is in memory, so a freshly trained keiba_ai is never replaced.
if 'keiba_ai' not in globals():
    keiba_ai = training.KeibaAIFactory.load('models/20260108/basemodel_2020_2026.pickle')
    keiba_ai.set_params(keiba_ai.get_params())

In [None]:
%autoreload

In [None]:
#シミュレーターに馬券をセット
simulator = simulation.Simulator(return_processor)

In [None]:
# Reload the required modules (so that code edits are reflected in the notebook).
%load_ext autoreload
%autoreload 2

import importlib
import sys

# Explicitly reload the modules that change most often (skipped if not imported yet).
for mod_name in [
    'modules.policies._score_policy',
    'modules.policies',
    'modules.training._model_wrapper',
    'modules.training._keiba_ai',
    'modules.training',
]:
    if mod_name in sys.modules:
        importlib.reload(sys.modules[mod_name])

# Re-bind the names to the reloaded modules.
from modules import policies, training

In [None]:
# スコアテーブルを取得
score_table = keiba_ai.calc_score(keiba_ai.datasets.X_test, policies.StdScorePolicy)

## カテゴリカル特徴量の診断

In [None]:
# Inspect the features (X_train columns) used at training time.
train_features = list(keiba_ai.datasets.X_train.columns)
print(f"学習時の特徴量数: {len(train_features)}")
print(f"学習時の最初の10特徴量: {train_features[:10]}")
print(f"学習時の最後の10特徴量: {train_features[-10:]}")

In [None]:
# Inspect the features (X_test columns) used at inference time.
test_features = list(keiba_ai.datasets.X_test.columns)
print(f"推論時の特徴量数: {len(test_features)}")
print(f"推論時の最初の10特徴量: {test_features[:10]}")
print(f"推論時の最後の10特徴量: {test_features[-10:]}")

In [None]:
# Compare the training and inference feature sets.
train_set = set(train_features)
test_set = set(test_features)

# Set differences in both directions.
missing_in_test = train_set - test_set
extra_in_test = test_set - train_set

print(f"\n学習時にあって推論時にない特徴量数: {len(missing_in_test)}")
if missing_in_test:
    print(f"最初の10個: {list(missing_in_test)[:10]}")

print(f"\n推論時にあって学習時にない特徴量数: {len(extra_in_test)}")
if extra_in_test:
    print(f"最初の10個: {list(extra_in_test)[:10]}")

In [None]:
# Check which categorical features are registered on the model.
# NOTE(review): this reads a private attribute of keiba_ai through its
# name-mangled form, so it breaks if that attribute is renamed.
model = keiba_ai._KeibaAI__model_wrapper.lgb_model

# Read the categorical settings from the LightGBM model.
if hasattr(model, 'booster_'):
    cat_feature = model.booster_.params.get('categorical_feature', None)
    print(f"モデルに登録されているcategorical_feature: {cat_feature}")
    print(f"型: {type(cat_feature)}")
else:
    # No booster_ attribute: reported as "the model has not been trained yet".
    print("モデルはまだ学習されていません")

In [None]:
# List the columns whose dtype is category, for the training and the inference data.
print("学習データのcategoryカラム:")
train_cat_cols = keiba_ai.datasets.X_train.select_dtypes(include='category').columns.tolist()
print(f"数: {len(train_cat_cols)}")
if train_cat_cols:
    print(f"最初の10個: {train_cat_cols[:10]}")

print("\n推論データのcategoryカラム:")
test_cat_cols = keiba_ai.datasets.X_test.select_dtypes(include='category').columns.tolist()
print(f"数: {len(test_cat_cols)}")
if test_cat_cols:
    print(f"最初の10個: {test_cat_cols[:10]}")

In [None]:
# Compare the category columns of the training and the inference data.
train_cat_set = set(train_cat_cols)
test_cat_set = set(test_cat_cols)

# Set differences in both directions.
cat_missing_in_test = train_cat_set - test_cat_set
cat_extra_in_test = test_cat_set - train_cat_set

print(f"学習時にあって推論時にないcategoryカラム: {len(cat_missing_in_test)}")
if cat_missing_in_test:
    print(f"  -> {list(cat_missing_in_test)[:10]}")

print(f"\n推論時にあって学習時にないcategoryカラム: {len(cat_extra_in_test)}")
if cat_extra_in_test:
    print(f"  -> {list(cat_extra_in_test)[:10]}")

In [None]:
# Among the category columns common to both, find those whose category values differ.
common_cat_cols = train_cat_set & test_cat_set
print(f"共通のcategoryカラム数: {len(common_cat_cols)}\n")

mismatched_categories = []
for col in sorted(common_cat_cols)[:5]:  # only the first five (in sorted order) are checked
    train_cats = set(keiba_ai.datasets.X_train[col].cat.categories)
    test_cats = set(keiba_ai.datasets.X_test[col].cat.categories)
    
    if train_cats != test_cats:
        mismatched_categories.append(col)
        print(f"カラム '{col}' のカテゴリー値が不一致:")
        print(f"  学習時のカテゴリー数: {len(train_cats)}")
        print(f"  推論時のカテゴリー数: {len(test_cats)}")
        print(f"  学習時のみ: {train_cats - test_cats}")
        print(f"  推論時のみ: {test_cats - train_cats}")
        print()

if not mismatched_categories:
    print("チェックした範囲では、カテゴリー値の不一致は見つかりませんでした")

### 根本原因の特定: データ型変換の確認

In [None]:
# Reproduce the dtype conversion that the _calc function in _score_policy.py performs.
import pandas as pd

# Take a copy of X_test.
X_test_original = keiba_ai.datasets.X_test.copy()

print("=== 元のデータ型 ===")
print(f"category型のカラム数: {len(X_test_original.select_dtypes(include='category').columns)}")
print(f"最初の5つのcategoryカラムのdtype:")
for col in X_test_original.select_dtypes(include='category').columns[:5]:
    print(f"  {col}: {X_test_original[col].dtype}")

# Reproduce the conversion done inside _calc: category columns are replaced by their integer codes.
X_converted = X_test_original.copy()
for col in X_converted.columns:
    s = X_converted[col]
    if getattr(s.dtype, 'name', '') == 'category':
        X_converted[col] = s.cat.codes

# Cast everything to float.
X_converted = X_converted.astype(float)

print("\n=== 変換後のデータ型 ===")
print(f"category型のカラム数: {len(X_converted.select_dtypes(include='category').columns)}")
print(f"全てのカラムがfloatに変換された: {all(X_converted.dtypes == 'float64')}")

In [None]:
# Problem under investigation: LightGBM was trained on category-dtype columns,
# and an error occurs when float-dtype data is passed at inference time.

# Check the dtypes at training time.
print("=== 学習時のX_trainのデータ型 ===")
train_cat_count = len(keiba_ai.datasets.X_train.select_dtypes(include='category').columns)
print(f"category型のカラム数: {train_cat_count}")

# Check the categorical_feature setting the model recorded at training time.
if hasattr(model, 'booster_'):
    try:
        # Read the categorical settings from the LightGBM booster.
        cat_indices = model.booster_.params.get('categorical_feature', [])
        print(f"\nモデルに登録されているcategorical_feature: {cat_indices}")
        print(f"型: {type(cat_indices)}")
        
        # With 'auto', columns that had the category dtype at training time are detected automatically.
        if cat_indices == 'auto' or cat_indices is None:
            print("\n⚠️ categorical_feature='auto' または None")
            print("   学習時にcategory型のカラムを自動検出している可能性が高い")
            print(f"   学習時のcategory型カラム数: {train_cat_count}")
    except Exception as e:
        print(f"エラー: {e}")

In [None]:
# Check the fix: can the model predict while the category dtype is kept?
print("=== 直接予測テスト ===")

# The sample is taken outside the try blocks so that test 2 does not depend on
# test 1 having reached the line that used to define it; otherwise a failure in
# test 1 could surface in test 2 as a misleading NameError.
X_test_subset = keiba_ai.datasets.X_test.head(10)

# Test 1: predict_proba with the category dtype left as it is.
try:
    predictions = model.predict_proba(X_test_subset)
    print("✓ category型のまま予測成功")
except Exception as e:
    print(f"✗ category型での予測失敗: {type(e).__name__}")
    print(f"  メッセージ: {str(e)[:200]}")

# Test 2: predict_proba after converting category columns to codes and casting to float.
try:
    X_test_float = X_test_subset.copy()
    for col in X_test_float.columns:
        if X_test_float[col].dtype.name == 'category':
            X_test_float[col] = X_test_float[col].cat.codes
    X_test_float = X_test_float.astype(float)

    predictions = model.predict_proba(X_test_float)
    print("✓ float型に変換後の予測成功")
except Exception as e:
    print(f"✗ float型での予測失敗: {type(e).__name__}")
    print(f"  メッセージ: {str(e)[:200]}")

## 5.1. 単一threshold

### 5.1.1 単勝馬券

In [None]:
fi = keiba_ai.feature_importance(num_features=300)
fi_j = fi[fi['features'].str.startswith('jockey_')]
fi_j.head(20)


In [None]:
# セルを新規で作って実行
feat = feature_enginnering.featured_data
[c for c in feat.columns if c.startswith('jockey_')], feat.filter(like='jockey_').head()

In [None]:
# Debug cell for the win-bet (tansho) simulation: shows T_RANGE, the jockey
# feature columns in the score table, the score distribution and the number of
# actions produced at threshold 0.0.
from modules.policies import BetPolicyTansho

T_RANGE = [0.0, 3.5]
print('T_RANGE =', T_RANGE)
print('score_table columns (head):', score_table.columns[:10].tolist())

# Jockey-related columns of the score table.
print('\nscore_table[jockey関連列] の例:')
j_cols = [c for c in score_table.columns if c.startswith('jockey_')]
print(j_cols[:10])
if j_cols:
    display(score_table[j_cols].describe())

# Distribution of the score column.
print('\nscore 分布:')
display(score_table['score'].describe())

# Number of actions when the threshold is 0.0.
print('\nthreshold=0.0 での actions 件数:')
actions_debug = keiba_ai.decide_action(score_table, BetPolicyTansho, threshold=0.0)
print('len(actions_debug) =', len(actions_debug))

In [None]:
import traceback

T_RANGE = [0.0, 3.5]
N_SAMPLES = 100
returns = {}

# The score table does not depend on the threshold, so build it a single time.
score_table = keiba_ai.calc_score(keiba_ai.datasets.X_test, policies.StdScorePolicy)

# Sweep the "win-likelihood score" threshold and record the returns at each step.
for i in tqdm(range(N_SAMPLES)):
    # Evenly spaced thresholds over T_RANGE with both end points included;
    # a single sample degenerates to the lower bound.
    threshold = (
        T_RANGE[0] + (T_RANGE[1] - T_RANGE[0]) * i / (N_SAMPLES - 1)
        if N_SAMPLES > 1
        else T_RANGE[0]
    )
    try:
        # Decide which tickets to buy for this threshold (win-bet policy)
        actions = keiba_ai.decide_action(score_table, policies.BetPolicyTansho, threshold=threshold)
        returns[threshold] = simulator.calc_returns(actions)
    except Exception:
        # Show the full stack and stop the sweep at the first failure.
        traceback.print_exc()
        break

# One row per threshold, ordered by threshold.
returns_df = (
    pd.DataFrame.from_dict(returns, orient='index')
    .sort_index()
    .rename_axis('threshold')
)

In [None]:
#シミュレーション結果も、models/に保存しておくとわかりやすい。
returns_df.to_pickle('models/20260108/tansho.pickle')

In [None]:
#回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='tansho')

In [None]:
# 過去の回収率のみをロードしてプロット
old_returns_df = pd.read_pickle('models/20260103/tansho.pickle')

T_RANGE = [0.0, 3.5]
N_SAMPLES = 100
returns = {}

# 回収率をプロット
simulation.plot_single_threshold(old_returns_df, N_SAMPLES, label='tansho')

In [None]:
import matplotlib.pyplot as plt

def plot_single_threshold_compare(old_returns_df, returns_df, N_SAMPLES, label1='old_tansho', label2='new_tansho'):
    """Overlay the return-rate curves of two threshold sweeps on one figure.

    For each frame the index is used as the x axis, ``return_rate`` is drawn
    as a solid line, and a translucent band of ``return_rate`` +/- ``std`` is
    filled around it.

    Args:
        old_returns_df: Sweep result drawn first; needs ``return_rate`` and
            ``std`` columns.
        returns_df: Sweep result drawn second; same columns.
        N_SAMPLES: Accepted but not used by this function.
        label1: Legend label for ``old_returns_df``.
        label2: Legend label for ``returns_df``.
    """
    plt.figure(dpi=100)

    # Draw band then line for the old result, then the same for the new one.
    for frame, legend_label in ((old_returns_df, label1), (returns_df, label2)):
        rate = frame['return_rate']
        spread = frame['std']
        plt.fill_between(frame.index, y1=rate - spread, y2=rate + spread, alpha=0.3)
        plt.plot(frame.index, rate, label=legend_label)

    # Legend from the labels above, grid, and axis titles.
    plt.legend()
    plt.grid(True)
    plt.xlabel('threshold')
    plt.ylabel('return_rate')
    plt.show()

In [None]:
old_returns_df = pd.read_pickle('models/20260103/tansho.pickle')

#old_returns_dfとreturns_dfの結果を重ねてプロットして比較
plot_single_threshold_compare(
    old_returns_df, returns_df, N_SAMPLES,
    label1='old_tansho', label2='new_tansho'
)



In [None]:
# score_table['score'].describe() の出力とreturns_df.index.min()/max() と len(returns_df)を貼るコード
print("score_table['score'] の統計情報:")
display(score_table['score'].describe())
print(f"returns_df index min: {returns_df.index.min()}")
print(f"returns_df index max: {returns_df.index.max()}")
print(f"returns_df length: {len(returns_df)}")

### 5.1.2 複勝馬券

In [None]:
import traceback

T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Sweep the "win-likelihood score" threshold and record the returns at each step.
# score_table is not built in this cell; it must already exist in the session.
for i in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps.
    # i stops at N_SAMPLES - 1, so the upper bound itself is not sampled.
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold (place-bet policy)
        actions = keiba_ai.decide_action(
            score_table,
            policies.BetPolicyFukusho,
            threshold=threshold,
        )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception:
        # print(e) alone drops the exception type and the failing frame;
        # print the full traceback before stopping the sweep.
        traceback.print_exc()
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# シミュレーション結果も、models/YYYYMMDD/に保存しておくとわかりやすい。
returns_df.to_pickle('models/20260108/fukusho.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='fukusho')

In [None]:
# 過去の回収率のみをロードしてプロット
old_returns_df = pd.read_pickle('models/20260103/fukusho.pickle')

T_RANGE = [0.0, 3.5]
N_SAMPLES = 100
returns = {}

# 回収率をプロット
simulation.plot_single_threshold(old_returns_df, N_SAMPLES, label='fukusho')

In [None]:
old_returns_df = pd.read_pickle('models/20260103/fukusho.pickle')

#old_returns_dfとreturns_dfの結果を重ねてプロットして比較
plot_single_threshold_compare(
    old_returns_df, returns_df, N_SAMPLES,
    label1='old_fukusho', label2='new_fukusho'
)



### 5.1.3 馬連BOX

In [None]:
import traceback

T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Sweep the "win-likelihood score" threshold and record the returns at each step.
# score_table is not built in this cell; it must already exist in the session.
for i in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps.
    # i stops at N_SAMPLES - 1, so the upper bound itself is not sampled.
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold (umaren box policy)
        actions = keiba_ai.decide_action(
            score_table,
            policies.BetPolicyUmarenBox,
            threshold=threshold,
        )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception:
        # print(e) alone drops the exception type and the failing frame;
        # print the full traceback before stopping the sweep.
        traceback.print_exc()
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# Store the simulation result under models/YYYYMMDD/ so it is easy to find.
# models/20260103/ is the directory this notebook reads its older baseline
# results from, so new results go to models/20260108/ like the tansho and
# fukusho cells to avoid overwriting a baseline.
returns_df.to_pickle('models/20260108/umarenbox.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='umarenbox')

### 5.1.4 馬単BOX

In [None]:
import traceback

T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Sweep the "win-likelihood score" threshold and record the returns at each step.
# score_table is not built in this cell; it must already exist in the session.
for i in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps.
    # i stops at N_SAMPLES - 1, so the upper bound itself is not sampled.
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold (umatan box policy)
        actions = keiba_ai.decide_action(
            score_table,
            policies.BetPolicyUmatanBox,
            threshold=threshold,
        )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception:
        # print(e) alone drops the exception type and the failing frame;
        # print the full traceback before stopping the sweep.
        traceback.print_exc()
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# Store the simulation result under models/YYYYMMDD/ so it is easy to find.
# models/20260103/ is the directory this notebook reads its older baseline
# results from, so new results go to models/20260108/ like the tansho and
# fukusho cells to avoid overwriting a baseline.
returns_df.to_pickle('models/20260108/umatanbox.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='umatanbox')

### 5.1.5 ワイドBOX

In [None]:
import traceback

T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Sweep the "win-likelihood score" threshold and record the returns at each step.
# score_table is not built in this cell; it must already exist in the session.
for i in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps.
    # i stops at N_SAMPLES - 1, so the upper bound itself is not sampled.
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold (wide box policy)
        actions = keiba_ai.decide_action(
            score_table,
            policies.BetPolicyWideBox,
            threshold=threshold,
        )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception:
        # print(e) alone drops the exception type and the failing frame;
        # print the full traceback before stopping the sweep.
        traceback.print_exc()
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# Store the simulation result under models/YYYYMMDD/ so it is easy to find.
# models/20260103/widebox.pickle is what this notebook loads as the *old*
# widebox baseline, so writing the fresh result there would overwrite the
# baseline it is meant to be compared with. Save to models/20260108/ instead,
# like the tansho and fukusho cells.
returns_df.to_pickle('models/20260108/widebox.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='widebox')

In [None]:
# 過去の回収率のみをロードしてプロット
old_returns_df = pd.read_pickle('models/20260103/widebox.pickle')

T_RANGE = [0.0, 3.5]
N_SAMPLES = 100
returns = {}

# 回収率をプロット
simulation.plot_single_threshold(old_returns_df, N_SAMPLES, label='widebox')

### 5.1.6 三連複BOX

In [None]:
import traceback

T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Sweep the "win-likelihood score" threshold and record the returns at each step.
# score_table is not built in this cell; it must already exist in the session.
for i in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps.
    # i stops at N_SAMPLES - 1, so the upper bound itself is not sampled.
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold (sanrenpuku box policy)
        actions = keiba_ai.decide_action(
            score_table,
            policies.BetPolicySanrenpukuBox,
            threshold=threshold,
        )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception:
        # print(e) alone drops the exception type and the failing frame;
        # print the full traceback before stopping the sweep.
        traceback.print_exc()
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# シミュレーション結果も、models/YYYYMMDD/に保存しておくとわかりやすい。
returns_df.to_pickle('models/20260108/sanrenpukubox.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='sanrenpukubox')

### 5.1.7 三連単BOX

In [None]:
import traceback

T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Sweep the "win-likelihood score" threshold and record the returns at each step.
# score_table is not built in this cell; it must already exist in the session.
for i in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps.
    # i stops at N_SAMPLES - 1, so the upper bound itself is not sampled.
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold (sanrentan box policy)
        actions = keiba_ai.decide_action(
            score_table,
            policies.BetPolicySanrentanBox,
            threshold=threshold,
        )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception:
        # print(e) alone drops the exception type and the failing frame;
        # print the full traceback before stopping the sweep.
        traceback.print_exc()
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# Store the simulation result under models/YYYYMMDD/ so it is easy to find.
# models/20260103/ is the directory this notebook reads its older baseline
# results from, so new results go to models/20260108/ like the tansho and
# fukusho cells to avoid overwriting a baseline.
returns_df.to_pickle('models/20260108/sanrentanbox.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='sanrentanbox')

In [None]:
# 過去の回収率のみをロードしてプロット
old_returns_df = pd.read_pickle('models/20260103/sanrenpukubox.pickle')

T_RANGE = [0.0, 3.5]
N_SAMPLES = 100
returns = {}

# 回収率をプロット
simulation.plot_single_threshold(old_returns_df, N_SAMPLES, label='sanrenpukubox')

## 5.2. 複数馬券
未実装

## 5.3. 複数threshold
未実装だが、以下のようなコードになる予定。

In [None]:
T1_RANGE = [2.5, 3.5]
MIN_T2 = 1
N_SAMPLES = 10

returns = {}
#「馬の勝ちやすさスコア」の閾値を変化させた時の成績を計算
idx = 0
for i in tqdm(range(N_SAMPLES)):
    #T_RANGEの範囲を、N_SAMPLES等分して、thresholdをfor分で回す
    threshold1 = T1_RANGE[1] * i / N_SAMPLES + T1_RANGE[0] * (1-(i/N_SAMPLES))
    for j in range(N_SAMPLES):
        #MIN_T2からthreshold1までをN_SAMPLES等分
        threshold2 = threshold1 * j / N_SAMPLES + MIN_T2 * (1-(j/N_SAMPLES))
        try:
            #print(threshold1, threshold2)
            #賭ける馬券を決定
            actions = keiba_ai.decide_action(
                    score_table, # スコアテーブル
                    policies.BetPolicyTanshoFukusho, # 賭け方の方針(未実装)
                    threshold1=threshold1, #「馬の勝ちやすさスコア」の閾値
                    threshold2=threshold2
                    )
            returns[idx] = simulator.calc_returns(actions)
            idx += 1
        except Exception as e:
            print(e)
            break
returns_df = pd.DataFrame.from_dict(returns, orient='index')

In [None]:
simulation.plot_single_threshold(returns_df.reset_index(), 100, label='tansho_fukusho')

# 6. 当日の予想
例として2022年1月8日のレースを実際に予想する場合を考える。  
https://race.netkeiba.com/top/race_list.html?kaisai_date=20220108

## 6.1. 前日準備

In [None]:
%autoreload

In [None]:
race_id_list = preparing.scrape_race_id_list(['20260112']) #レースidを取得
len(race_id_list)

In [None]:
# モジュール変更をNotebookに反映（スクレイピング周りの修正後はここを実行）
import importlib
import modules.preparing._scrape_shutuba_table as _sst
import modules.preparing._scrape_html as _sh
import modules.preparing as preparing
importlib.reload(_sst)
importlib.reload(_sh)
importlib.reload(preparing)

In [None]:
#出走するhorse_idの取得
horse_id_list = preparing.scrape_horse_id_list(race_id_list)
len(horse_id_list)

In [None]:
#horseページのhtmlをスクレイピング
#直近レースが更新されている可能性があるので、skip=Falseにして上書きする
html_files_horse = preparing.scrape_html_horse_with_master(horse_id_list, skip=False)

In [None]:
#horse_infoテーブルの更新
horse_info_20250920 = preparing.get_rawdata_horse_info(html_files_horse)
preparing.update_rawdata(LocalPaths.RAW_HORSE_INFO_PATH, horse_info_20250920)

In [None]:
#horse_resultsテーブルの更新
horse_results_20250920 = preparing.get_rawdata_horse_results(html_files_horse)
preparing.update_rawdata(LocalPaths.RAW_HORSE_RESULTS_PATH, horse_results_20250920)

In [None]:
#更新後のhorse_infoテーブルの確認
horse_info_processor = preprocessing.HorseInfoProcessor(filepath=LocalPaths.RAW_HORSE_INFO_PATH)
display(horse_info_processor.raw_data.tail())
len(horse_info_processor.raw_data)

In [None]:
#更新後のhorse_resultsテーブルの確認
horse_results_processor = preprocessing.HorseResultsProcessor(filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
display(horse_results_processor.raw_data.tail())
len(horse_results_processor.raw_data)

In [None]:
#pedsテーブルの更新
html_files_peds = preparing.scrape_html_ped(horse_id_list, skip=False)
peds_20250920 = preparing.get_rawdata_peds(html_files_peds)
preparing.update_rawdata(LocalPaths.RAW_PEDS_PATH, peds_20250920)

In [None]:
#processorの更新
horse_info_processor = preprocessing.HorseInfoProcessor(
    filepath=LocalPaths.RAW_HORSE_INFO_PATH)
horse_results_processor = preprocessing.HorseResultsProcessor(
    filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
peds_processor = preprocessing.PedsProcessor(filepath=LocalPaths.RAW_PEDS_PATH)

In [None]:
#モデルの準備
keiba_ai = training.KeibaAIFactory.load('models/20260108/basemodel_2020_2025.pickle')

In [None]:
# --- 診断: モデルのfeature_name_状態を確認 ---
import numpy as np
model = keiba_ai._KeibaAI__model_wrapper.lgb_model
expected_from_attr = getattr(model, 'n_features_in_', None)
expected_from_booster = None
try:
    expected_from_booster = model.booster_.num_feature()
except Exception:
    pass
print('expected_n_features:', expected_from_attr, expected_from_booster)
fn = getattr(model, 'feature_name_', None)
print('feature_name_ type:', type(fn))
try:
    print('len(feature_name_):', None if fn is None else len(fn))
except Exception as e:
    print('len(feature_name_) failed:', e, 'value:', fn)
try:
    print('feature_name_ head:', None if fn is None else list(fn)[:10])
except Exception as e:
    print('list(feature_name_) failed:', e)

print('datasets.X_train n_features:', keiba_ai.datasets.X_train.shape[1])
print('X_train head:', list(keiba_ai.datasets.X_train.columns)[:10])

# 手動でfeature_name_を書き換えできるかテスト
try:
    model.feature_name_ = list(keiba_ai.datasets.X_train.columns)
    print('manually set feature_name_ ok')
    print('feature_name_ head (after set):', list(model.feature_name_)[:10])
except Exception as e:
    print('manually set feature_name_ failed:', repr(e))

## 6.2. 前日全レース予想

In [None]:
# 変更点: %autoreload がWindowsの既定エンコーディング(cp932)で読み込み失敗することがあるため、明示的にreload
import importlib
import modules.policies._score_policy as _score_policy_mod
import modules.policies as _policies_mod
import modules.training._model_wrapper as _model_wrapper_mod
import modules.training._keiba_ai as _keiba_ai_mod
import modules.training._keiba_ai_factory as _keiba_ai_factory_mod
import modules.training as _training_mod

importlib.reload(_score_policy_mod)
importlib.reload(_policies_mod)
importlib.reload(_model_wrapper_mod)
importlib.reload(_keiba_ai_mod)
importlib.reload(_keiba_ai_factory_mod)
importlib.reload(_training_mod)

from modules import policies as policies
from modules import training as training
score_policy = policies.StdScorePolicy
print('reloaded policies/training (explicit)')

In [None]:
# 前日全レース予想用のレースidとレース発走時刻を取得
target_race_id_list, target_race_time_list = preparing.scrape_race_id_race_time_list('20260111')
print(len(target_race_id_list))
print(len(target_race_time_list))
yesterday = True

In [None]:
# ターゲットエンコーディング時に「馬の成績」として扱う項目
TARGET_COLS = [
        HorseResultsCols.RANK,
        HorseResultsCols.PRIZE,
        HorseResultsCols.RANK_DIFF, 
        'first_corner',
        'final_corner',
        'first_to_rank',
        'first_to_final',
        'final_to_rank',
        'time_seconds'
        ]
# horse_id列と共に、ターゲットエンコーディングの対象にする列
GROUP_COLS = [
        'course_len',
        'race_type',
        HorseResultsCols.PLACE
        ]

In [None]:
# --- Diagnosis: check feature alignment and score variance on a single race ---
import numpy as np
import pandas as pd

# Race used for the diagnosis. The preferred position is 10, but the index is
# clamped to the last available race so the cell does not die with an
# IndexError when fewer than 11 races were scraped. Overwrite race_id_debug /
# race_time_debug below to inspect a specific race instead.
if not target_race_id_list:
    raise ValueError('target_race_id_list が空です。先にレースID取得セルを実行してください。')
DEBUG_RACE_INDEX = min(10, len(target_race_id_list) - 1)
race_id_debug = target_race_id_list[DEBUG_RACE_INDEX]
race_time_debug = target_race_time_list[DEBUG_RACE_INDEX]
filepath = 'data/tmp/shutuba_debug.pickle'
today = '2026/01/09'  # manually maintained date passed to the scraper; keep in line with real operation

preparing.scrape_shutuba_table(race_id_debug, today, filepath)

if yesterday:
    # Previous-day mode: horse weights are not published yet, so overwrite
    # the weight column with a neutral '0(0)' and fall back to default
    # weather / ground state when those are missing entirely.
    pd2 = pd.read_pickle(filepath)
    pd2[ResultsCols.WEIGHT_AND_DIFF] = '0(0)'
    if 'weather' not in pd2.columns or pd2['weather'].isnull().all():
        pd2['weather'] = '晴'
    if 'ground_state' not in pd2.columns or pd2['ground_state'].isnull().all():
        pd2['ground_state'] = '良'
    pd2.to_pickle(filepath)

# Process the race card and merge it with the horse / pedigree tables.
shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath)
shutuba_data_merger = preprocessing.ShutubaDataMerger(
    shutuba_table_processor,
    horse_results_processor,
    horse_info_processor,
    peds_processor,
    target_cols=TARGET_COLS,
    group_cols=GROUP_COLS,
)
shutuba_data_merger.merge()

# Feature engineering chain.
feature_enginnering_shutuba = (
    preprocessing.FeatureEngineering(shutuba_data_merger)
    .add_interval()
    .add_agedays()
    .dumminize_ground_state()
    .dumminize_race_type()
    .dumminize_sex()
    .dumminize_weather()
    .encode_horse_id()
    .encode_jockey_id()
    .encode_trainer_id()
    .encode_owner_id()
    .encode_breeder_id()
    .dumminize_kaisai()
    .dumminize_around()
    .dumminize_race_class()
)

X_debug = feature_enginnering_shutuba.featured_data.drop(['date'], axis=1)

# Guard against a column-count mismatch at inference time: reorder to the
# training columns, filling columns absent from this race with 0.
train_cols = list(keiba_ai.datasets.X_train.columns)
X_debug_aligned = X_debug.reindex(columns=train_cols, fill_value=0)
# Carry the horse number over from the unaligned frame when it is available.
if ResultsCols.UMABAN in X_debug.columns:
    X_debug_aligned[ResultsCols.UMABAN] = X_debug[ResultsCols.UMABAN]

model = keiba_ai._KeibaAI__model_wrapper.lgb_model
score_debug = score_policy.calc(model, X_debug_aligned)
print('debug race_id:', race_id_debug, 'time:', race_time_debug)
print('score nunique:', score_debug['score'].nunique())
display(score_debug.sort_values('score', ascending=False).head(16))

In [None]:
import time

# Path where the scraped race card is stored temporarily
filepath = 'data/tmp/shutuba.pickle'
# NOTE(review): manually maintained literal passed to the scraper as the date;
# confirm it matches the day being predicted before running.
today = '2022/12/27'

for race_id, race_time in zip(target_race_id_list, target_race_time_list):
    # Pause between requests to keep the load on the scraped server low,
    # as the newer all-race prediction loop in this notebook does.
    time.sleep(1)

    # Fetch the race card
    preparing.scrape_shutuba_table(race_id, today, filepath)

    # Previous-day prediction
    if yesterday:
        # Horse weights are not published yet: overwrite them with '0(0)'
        pd2 = pd.read_pickle(filepath)
        pd2[ResultsCols.WEIGHT_AND_DIFF] = '0(0)'
        # Enable the two lines below when weather and ground state are not published yet
        #pd2['weather'] = '晴'
        #pd2['ground_state'] = '良'
        pd2.to_pickle(filepath)

    # Process the race card
    shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath)

    # Merge the tables
    shutuba_data_merger = preprocessing.ShutubaDataMerger(
        shutuba_table_processor,
        horse_results_processor,
        horse_info_processor,
        peds_processor,
        target_cols=TARGET_COLS,
        group_cols=GROUP_COLS
    )
    shutuba_data_merger.merge()

    # Feature engineering
    feature_enginnering_shutuba = (
        preprocessing.FeatureEngineering(shutuba_data_merger)
        .add_interval()
        .add_agedays()
        .dumminize_ground_state()
        .dumminize_race_type()
        .dumminize_sex()
        .dumminize_weather()
        .encode_horse_id()
        .encode_jockey_id()
        .encode_trainer_id()
        .encode_owner_id()
        .encode_breeder_id()
        .dumminize_kaisai()
        .dumminize_around()
        .dumminize_race_class()
    )

    # Features used for prediction
    X = feature_enginnering_shutuba.featured_data.drop(['date'], axis=1)

    # First row of the unprocessed race card, used for the header line
    df_tmp = shutuba_table_processor.raw_data[:1]

    # Resolve the place name whose code equals race_id[4:6]; the header is
    # printed only when a matching place exists.
    place_name = next(
        (name for name, code in Master.PLACE_DICT.items() if code == race_id[4:6]),
        None,
    )
    if place_name is not None:
        print(place_name + race_id[10:12] + 'R ' + race_time + '発走 ' + str(df_tmp.iat[0, 12])
            + str(df_tmp.iat[0, 10]) + 'm ' + str(df_tmp.iat[0, 13]) + ' ' + str(df_tmp.iat[0, 15]))

    print(keiba_ai.calc_score(X, policies.StdScorePolicy).sort_values('score', ascending=False))

In [None]:
# ============================================================================
# 全レース前日予測（特徴量整列は keiba_ai.calc_score に委譲）
# ============================================================================

import time
import numpy as np
import pandas as pd

print("=== 全レース前日予測開始 ===")
print(f"対象レース数: {len(target_race_id_list)}")
print(f"前日予想モード: {'ON' if yesterday else 'OFF'}")
print("=" * 50)

# 一時的に出馬表を保存するパスを指定
filepath = 'data/tmp/shutuba.pickle'
today = '2025/12/26'

# 全レースの予測結果を格納
all_predictions = {}
error_count = 0

for idx, (race_id, race_time) in enumerate(zip(target_race_id_list, target_race_time_list), 1):
    try:
        print(f"\n[{idx}/{len(target_race_id_list)}] レース処理中: {race_id}")
        
        # サーバー負荷軽減（必須）
        time.sleep(1)
        
        # 出馬表の取得
        preparing.scrape_shutuba_table(race_id, today, filepath)

        # 前日予想の場合
        if yesterday:
            # 前日予想の場合、馬体重を0（0）に補正
            pd2 = pd.read_pickle(filepath)
            pd2[ResultsCols.WEIGHT_AND_DIFF] = '0(0)'
            # 前日予想の場合、天候と馬場状態が公開されていない場合はデフォルト値を設定
            if 'weather' not in pd2.columns or pd2['weather'].isnull().all():
                pd2['weather'] = '晴'
            if 'ground_state' not in pd2.columns or pd2['ground_state'].isnull().all():
                pd2['ground_state'] = '良'
            pd2.to_pickle(filepath)

        # 出馬表の加工
        shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath)

        # テーブルのマージ
        shutuba_data_merger = preprocessing.ShutubaDataMerger(
            shutuba_table_processor,
            horse_results_processor,
            horse_info_processor,
            peds_processor,
            target_cols=TARGET_COLS,
            group_cols=GROUP_COLS
)
        shutuba_data_merger.merge()

        # 特徴量エンジニアリング
        feature_enginnering_shutuba = (
            preprocessing.FeatureEngineering(shutuba_data_merger)
            .add_interval()
            .add_agedays()
            .dumminize_ground_state()
            .dumminize_race_type()
            .dumminize_sex()
            .dumminize_weather()
            .encode_horse_id()
            .encode_jockey_id()
            .encode_trainer_id()
            .encode_owner_id()
            .encode_breeder_id()
            .dumminize_kaisai()
            .dumminize_around()
            .dumminize_race_class()
        )

        # 予測（整列・型変換は calc_score 側のポリシーで吸収）
        X = feature_enginnering_shutuba.featured_data.drop(['date'], axis=1, errors='ignore')

        # 当日の出走情報テーブル（前処理前）
        df_tmp = shutuba_table_processor.raw_data[:1]

        # レース情報の表示
        race_info = ""
        for place_name, num in Master.PLACE_DICT.items():
            if num == race_id[4:6]:
                race_info = (
                    f"{place_name}{race_id[10:12]}R {race_time}発走 "
                    f"{df_tmp.iat[0, 12]}{df_tmp.iat[0, 10]}m "
                    f"{df_tmp.iat[0, 13]} {df_tmp.iat[0, 15]}"
                )
                print(race_info)
                break

        # 予測実行
        score_result = keiba_ai.calc_score(X, score_policy).sort_values('score', ascending=False)
        print("score nunique:", score_result['score'].nunique())

        # 上位馬のみを表示（簡潔化）
        top_horses = score_result.head(5)
        print("TOP5予想:")
        for rank, (_, row) in enumerate(top_horses.iterrows(), 1):
            print(f"  {rank}位: {row['馬番']}番 (スコア: {row['score']:.3f})")

        # 結果を保存
        all_predictions[race_id] = {
            'race_info': race_info,
            'predictions': score_result,
            'race_time': race_time
        }

        print(f"✅ {race_id} 予測完了")

    except Exception as e:
        error_count += 1
        print(f"❌ {race_id} 予測エラー: {str(e)}")
        # エラーが発生したレースをスキップして続行
        continue

print(f"\n=== 全レース予測完了 ===")
print(f"成功: {len(all_predictions)}/{len(target_race_id_list)} レース")
print(f"エラー: {error_count} レース")

# 最終結果のサマリー表示
if all_predictions:
    print(f"\n=== 本日の予想結果一覧 ===")
    for race_id, result in all_predictions.items():
        print(f"\n{result['race_info']}")
        top3 = result['predictions'].head(3)

        for rank, (_, row) in enumerate(top3.iterrows(), 1):
            print(f"  {rank}位予想: {row['馬番']}番 (スコア: {row['score']:.3f})")

    print(f"\n🎯 {len(all_predictions)}レースの予測が完了しました！")
else:
    print("❌ 予測に成功したレースがありません。")

## 6.3. レース直前データ処理（当日レース予想）

In [None]:
%autoreload

In [None]:
# Fetch the IDs and post times of the races whose horse weights have been
# announced (race-day use).
target_race_id_list, target_race_time_list = preparing.create_active_race_id_list()

# Reorder both lists by post time while keeping every ID paired with its time.
race_data = list(zip(target_race_id_list, target_race_time_list))
race_data_sorted = sorted(race_data, key=lambda pair: pair[1])
target_race_id_list = [pair[0] for pair in race_data_sorted]
target_race_time_list = [pair[1] for pair in race_data_sorted]

print("ソート後のレースID:", target_race_id_list)
print("ソート後のレース時刻:", target_race_time_list)

In [None]:
import time

# Path where the scraped race card is stored temporarily
filepath = 'data/tmp/shutuba.pickle'
# Today's date as yyyy/mm/dd; to re-run for another day, assign a literal
# such as '2022/10/01' instead.
today = datetime.datetime.now().date().strftime('%Y/%m/%d')

for race_id, race_time in zip(target_race_id_list, target_race_time_list):
    # Pause between requests to keep the load on the scraped server low,
    # as the previous-day all-race prediction loop in this notebook does.
    time.sleep(1)

    # Fetch the race card
    preparing.scrape_shutuba_table(race_id, today, filepath)

    # Process the race card
    shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath)

    # Merge the tables
    shutuba_data_merger = preprocessing.ShutubaDataMerger(
        shutuba_table_processor,
        horse_results_processor,
        horse_info_processor,
        peds_processor,
        target_cols=TARGET_COLS,
        group_cols=GROUP_COLS
    )
    shutuba_data_merger.merge()

    # Feature engineering
    feature_enginnering_shutuba = (
        preprocessing.FeatureEngineering(shutuba_data_merger)
        .add_interval()
        .add_agedays()
        .dumminize_ground_state()
        .dumminize_race_type()
        .dumminize_sex()
        .dumminize_weather()
        .encode_horse_id()
        .encode_jockey_id()
        .encode_trainer_id()
        .encode_owner_id()
        .encode_breeder_id()
        .dumminize_kaisai()
        .dumminize_around()
        .dumminize_race_class()
    )

    # Features used for prediction
    X = feature_enginnering_shutuba.featured_data.drop(['date'], axis=1)

    # First row of the unprocessed race card, used for the header line.
    # Positional columns per the original notes in this cell:
    # 12 = race_type, 13 = around, 14 = weather, 15 = ground_state,
    # 16 = race_class. Column 10 is printed with an 'm' suffix, so it is
    # presumably the course length (TODO confirm).
    df_tmp = shutuba_table_processor.raw_data[:1]

    # Resolve the place name whose code equals race_id[4:6]; the header is
    # printed only when a matching place exists.
    place_name = next(
        (name for name, code in Master.PLACE_DICT.items() if code == race_id[4:6]),
        None,
    )
    if place_name is not None:
        print(place_name + race_id[10:12] + 'R ' + race_time + '発走 ' + str(df_tmp.iat[0, 12])
            + str(df_tmp.iat[0, 10]) + 'm ' + str(df_tmp.iat[0, 13]) + ' ' + str(df_tmp.iat[0, 15]))

    print(keiba_ai.calc_score(X, policies.StdScorePolicy).sort_values('score', ascending=False))

In [None]:
# 本番では予測してなかったけど、400万馬券が出たレースのスコアがどうなってたか確認する（セル163の単レース版）
import datetime
import time
from pathlib import Path

# 確認したいレースID（例: 202608010307）
race_id_debug = '202608010503'

# (任意) return_tables があれば「三連単 400万円以上」のレース候補を表示する
return_tables_path = None
for p in [Path('data/raw/return_tables.pickle'), Path('data/raw/return_tables.pickle.bak')]:
    if p.exists():
        return_tables_path = p
        break

if return_tables_path is not None:
    try:
        return_processor = preprocessing.ReturnProcessor(str(return_tables_path))
        return_dict = return_processor.preprocessed_data
        sanrentan = return_dict.get('sanrentan')
        if sanrentan is not None and 'return' in sanrentan.columns:
            high = sanrentan[sanrentan['return'] >= 4_000_000].sort_values('return', ascending=False)
            if len(high) > 0:
                print('=== 三連単 400万円以上のレース候補（上位10）===')
                display(high.head(10))
            if race_id_debug and str(race_id_debug) in sanrentan.index.astype(str):
                payout = sanrentan.loc[sanrentan.index.astype(str) == str(race_id_debug), 'return'].iloc[0]
                print(f"レース {race_id_debug} 三連単払戻: {int(payout):,}円")
    except Exception as e:
        print(f"return_tables 解析スキップ: {e}")

if not race_id_debug:
    raise ValueError('race_id_debug が空です。レースIDを設定してください。')

# The race date cannot be derived from race_id. This same cell reads
# race_id[4:6] as the place code and race_id[10:12] as the race number, so
# slicing [4:6] / [6:8] as month / day produced a bogus date (for example
# '2026/08/01' for '202608010503'). The digits in between are presumably the
# meeting number and meeting day (TODO confirm). Set the real race date here.
race_date_debug = ''  # actual race date as 'yyyy/mm/dd'
if race_date_debug:
    # strptime validates the value; strftime normalises it to zero-padded yyyy/mm/dd.
    debug_date = datetime.datetime.strptime(race_date_debug, '%Y/%m/%d').strftime('%Y/%m/%d')
else:
    # Fall back to today's date rather than an invented one, and say so.
    debug_date = datetime.date.today().strftime('%Y/%m/%d')
    print(f"⚠ race_date_debug が未設定のため、本日の日付 {debug_date} を使用します。")

# Fetch the race card (it may still be available for a past race)
filepath_debug = 'data/tmp/shutuba_debug.pickle'
time.sleep(1)  # keep the load on the server low (1 second interval)
preparing.scrape_shutuba_table(race_id_debug, debug_date, filepath_debug)

# 出馬表の加工 + マージ + 特徴量（セル163と同じ流れ）
shutuba_table_processor_debug = preprocessing.ShutubaTableProcessor(filepath_debug)
shutuba_data_merger_debug = preprocessing.ShutubaDataMerger(
    shutuba_table_processor_debug,
    horse_results_processor,
    horse_info_processor,
    peds_processor,
    target_cols=TARGET_COLS,
    group_cols=GROUP_COLS,
 )
shutuba_data_merger_debug.merge()

feature_enginnering_debug = (
    preprocessing.FeatureEngineering(shutuba_data_merger_debug)
    .add_interval()
    .add_agedays()
    .dumminize_ground_state()
    .dumminize_race_type()
    .dumminize_sex()
    .dumminize_weather()
    .encode_horse_id()
    .encode_jockey_id()
    .encode_trainer_id()
    .encode_owner_id()
    .encode_breeder_id()
    .dumminize_kaisai()
    .dumminize_around()
    .dumminize_race_class()
 )

X_debug = feature_enginnering_debug.featured_data.drop(['date'], axis=1)

# レース概要表示（列名ベースで安全に）
info_row = shutuba_table_processor_debug.raw_data.iloc[0]
place_code = str(race_id_debug)[4:6]
place_name = next((k for k, v in Master.PLACE_DICT.items() if v == place_code), place_code)
race_no = int(str(race_id_debug)[10:12])
print(f"=== {race_id_debug} ({debug_date}) {place_name}{race_no}R ===")
print(
    f"{info_row.get('race_type','')} {info_row.get('course_len','')}m "
    f"{info_row.get('around','')} {info_row.get('ground_state','')} / "
    f"{info_row.get('weather','')} / {info_row.get('race_class','')}"
 )

# 予測スコア（馬ごと）
score_df_debug = keiba_ai.calc_score(X_debug, policies.StdScorePolicy).sort_values('score', ascending=False)
print('=== 予測スコア ===')
display(score_df_debug)

# (任意) 実際の着順との突合（results.pickle があれば）
results_path = None
for p in [Path('data/raw/results.pickle'), Path('data/raw/results.pickle.bak')]:
    if p.exists():
        results_path = p
        break

if results_path is not None:
    try:
        import pandas as pd
        results_raw = pd.read_pickle(results_path)
        if str(race_id_debug) in results_raw.index.astype(str):
            actual = results_raw.loc[str(race_id_debug)].copy()
            # race_id 単位で複数行になる想定（出走頭数ぶん）
            if isinstance(actual, pd.Series):
                actual = actual.to_frame().T
            cols_want = [c for c in ['着順', '馬番', '馬名', '単勝', '人気'] if c in actual.columns]
            if '着順' in actual.columns:
                actual['着順'] = pd.to_numeric(actual['着順'], errors='coerce')
                actual = actual.sort_values('着順')
            merged = actual[cols_want].merge(
                score_df_debug[['馬番', 'score']],
                on='馬番',
                how='left',
            )
            print('=== 実着順 × 予測スコア（先頭10頭）===')
            display(merged.head(10))
    except Exception as e:
        print(f"results 突合スキップ: {e}")

## 6.4. レース直前データ処理（旧方式）

In [None]:
filepath = 'data/tmp/shutuba.pickle' #一時的に出馬表を保存するパスを指定
preparing.scrape_shutuba_table(race_id_list[0], '2025/9/21', filepath) #馬体重が発表されたら、出馬表を取得
shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath) #出馬表の加工

In [None]:
#テーブルのマージ
shutuba_data_merger = preprocessing.ShutubaDataMerger(
    shutuba_table_processor,
    horse_results_processor,
    horse_info_processor,
    peds_processor,
    target_cols=TARGET_COLS,
    group_cols=GROUP_COLS
)

shutuba_data_merger.merge()

In [None]:
#特徴量エンジニアリング
feature_enginnering_shutuba = preprocessing.FeatureEngineering(shutuba_data_merger)\
    .add_interval()\
    .add_agedays()\
    .dumminize_ground_state()\
    .dumminize_race_type()\
    .dumminize_sex()\
    .dumminize_weather()\
    .encode_horse_id()\
    .encode_jockey_id()\
    .encode_trainer_id()\
    .encode_owner_id()\
    .encode_breeder_id()\
    .dumminize_kaisai()\
    .dumminize_around()\
    .dumminize_race_class()

In [None]:
# 予測（学習列に完全整列＋NaN防止）
from modules.constants import ResultsCols
import numpy as np

# 1) Columns the model was actually trained on.
train_cols = keiba_ai.datasets.X_train.columns

# 2)-3) Drop the non-feature columns (date / rank), then align to the training
#       columns: missing columns are filled with 0 and extra columns are dropped.
X_feat = (
    feature_enginnering_shutuba.featured_data
    .drop(['date', 'rank'], axis=1, errors='ignore')
    .reindex(columns=train_cols, fill_value=0)
)

# 4) Make everything numeric: categoricals become their integer codes, then
#    inf / -inf / NaN are replaced by 0.
category_cols = [
    col for col in X_feat.columns
    if getattr(X_feat[col].dtype, 'name', '') == 'category'
]
for col in category_cols:
    X_feat[col] = X_feat[col].cat.codes
X_feat = X_feat.astype(float)
X_feat = X_feat.replace([np.inf, -np.inf], 0)
X_feat = X_feat.fillna(0)

# 5) Re-attach the horse number for display (per the original note, the score
#    policy excludes it from the model input automatically).
X_for_policy = X_feat.copy()
featured_for_display = feature_enginnering_shutuba.featured_data
if ResultsCols.UMABAN in featured_for_display.columns:
    X_for_policy[ResultsCols.UMABAN] = featured_for_display[ResultsCols.UMABAN].values

# 6) Score and rank, highest score first.
score_result = keiba_ai.calc_score(X_for_policy, policies.StdScorePolicy)
score_result = score_result.sort_values('score', ascending=False)
score_result.head()

## 付録
騎手勝率無し VS 有りの比較

In [None]:
# Load the previously saved "no_jockey" threshold sweep.
old_returns_df = pd.read_pickle('models/20251226/tansho_no_jockey_std_0_3p5.pickle')

# Overlay old_returns_df and returns_df in one plot for comparison.
plot_single_threshold_compare(
    old_returns_df,
    returns_df,
    N_SAMPLES,
    label1='no_jockey(std,0-3.5)',
    label2='with_jockey(std,0-3.5)',
)




In [None]:
# n_bets / n_races がどの閾値から崩れるか確認（特徴量なし vs あり）

import matplotlib.pyplot as plt



def _plot_counts(df, label, ax_bets, ax_races):
    """Plot the ``n_bets`` and ``n_races`` columns of ``df`` against its index.

    ``n_bets`` is drawn on ``ax_bets`` and ``n_races`` on ``ax_races``; both
    lines are labelled with ``label``.
    """
    for axis, column in ((ax_bets, 'n_bets'), (ax_races, 'n_races')):
        axis.plot(df.index, df[column], label=label)



# Two stacked panels sharing the threshold axis: n_bets on top, n_races below.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 6), dpi=100, sharex=True)

for frame, name in ((old_returns_df, 'no_jockey'), (returns_df, 'with_jockey')):
    _plot_counts(frame, name, ax1, ax2)

ax1.set_ylabel('n_bets')
ax1.grid(True)
ax1.legend()

ax2.set_ylabel('n_races')
ax2.set_xlabel('threshold')
ax2.grid(True)
ax2.legend()

plt.show()

# Show the last 10 rows of each sweep to see where the counts collapse.
tail_cols = ['n_bets', 'n_races', 'return_rate']
print('--- tail(10): no_jockey ---')
display(old_returns_df.tail(10)[tail_cols])
print('--- tail(10): with_jockey ---')
display(returns_df.tail(10)[tail_cols])


In [None]:
# n_races >= 100 に限定した return_rate の最大値（特徴量なし vs あり）

import numpy as np



MIN_RACES = 100



def best_under_constraint(df, min_races: int):
    """Find the index label with the highest ``return_rate`` among eligible rows.

    Rows are eligible when ``n_races >= min_races``.

    Returns:
        ``(best_threshold, best_return_rate, eligible_rows)``. The first two
        are ``None`` when no row is eligible; ``eligible_rows`` is always the
        filtered copy of ``df``.
    """
    eligible = df.loc[df['n_races'] >= min_races].copy()
    if eligible.empty:
        return None, None, eligible
    top_threshold = float(eligible['return_rate'].idxmax())
    top_rate = float(eligible.loc[top_threshold, 'return_rate'])
    return top_threshold, top_rate, eligible



thr0, rr0, d0 = best_under_constraint(old_returns_df, MIN_RACES)
thr1, rr1, d1 = best_under_constraint(returns_df, MIN_RACES)

print(f'MIN_RACES = {MIN_RACES}')

report_cols = ['n_bets', 'n_races', 'return_rate', 'std']

# Best threshold per model (or a notice when no threshold meets the constraint).
for header, frame, thr, rr in (
    ('--- no_jockey ---', old_returns_df, thr0, rr0),
    ('--- with_jockey ---', returns_df, thr1, rr1),
):
    print(header)
    if thr is None:
        print('条件を満たすthresholdがありません')
        continue
    print('best threshold:', thr)
    print('best return_rate:', rr)
    display(frame.loc[[thr], report_cols])

# For reference: the five best eligible rows per model.
for header, eligible in (('top5(no_jockey)', d0), ('top5(with_jockey)', d1)):
    if len(eligible) > 0:
        print(header)
        display(eligible.sort_values('return_rate', ascending=False).head(5)[report_cols])


In [None]:
# 上位帯の安定性チェック: MIN_RACES を変えてベストを比較

import pandas as pd



MIN_RACES_LIST = [100, 200, 500]



def best_row(df: pd.DataFrame, min_races: int, label: str) -> dict:
    """Summarise the best-``return_rate`` row among rows with enough races.

    Rows are eligible when ``n_races >= min_races``. The result is a dict with
    keys ``model``, ``min_races``, ``best_threshold``, ``best_return_rate``,
    ``n_races``, ``n_bets`` and ``std``. When nothing is eligible the
    threshold / rate / std entries are ``None`` and the counts are 0.
    """
    summary = {
        'model': label,
        'min_races': min_races,
        'best_threshold': None,
        'best_return_rate': None,
        'n_races': 0,
        'n_bets': 0,
        'std': None,
    }
    eligible = df[df['n_races'] >= min_races]
    if eligible.empty:
        return summary

    thr = float(eligible['return_rate'].idxmax())
    best = df.loc[thr]
    summary.update(
        best_threshold=thr,
        best_return_rate=float(best['return_rate']),
        n_races=int(best['n_races']),
        n_bets=int(best['n_bets']),
        std=float(best['std']),
    )
    return summary



# One summary row per (min_races, model) pair.
rows = [
    best_row(frame, m, name)
    for m in MIN_RACES_LIST
    for frame, name in ((old_returns_df, 'no_jockey'), (returns_df, 'with_jockey'))
]

stability_df = pd.DataFrame(rows)
stability_df = stability_df.sort_values(['min_races', 'model']).reset_index(drop=True)
display(stability_df)


In [None]:
# 運用向けチェック: min_races=500 の範囲で return_rate > 1.0 は存在するか

import pandas as pd



MIN_RACES_OPS = 500

RR_TARGET = 1.0



def points_over_1(df: pd.DataFrame, label: str, min_races: int, rr_target: float):
    """Report the rows with enough races whose ``return_rate`` exceeds a target.

    Prints how many rows satisfy ``n_races >= min_races`` and how many of those
    have ``return_rate > rr_target``. If any do, it also prints the min/max
    index label of those rows and displays the ten with the highest
    ``return_rate``. Returns ``None``.
    """
    eligible = df[df['n_races'] >= min_races].copy()
    profitable = eligible[eligible['return_rate'] > rr_target].copy()
    print(f'[{label}] min_races>={min_races} の点数: {len(eligible)}')
    print(f'[{label}] return_rate>{rr_target} の点数: {len(profitable)}')
    if len(profitable) > 0:
        print(f'[{label}] threshold 範囲: {float(profitable.index.min())} 〜 {float(profitable.index.max())}')
        ranked = profitable.sort_values('return_rate', ascending=False).head(10)
        display(ranked[['n_bets','n_races','return_rate','std']])



# Run the same check on both sweeps so the two models can be compared directly.
points_over_1(old_returns_df, 'no_jockey', MIN_RACES_OPS, RR_TARGET)

points_over_1(returns_df, 'with_jockey', MIN_RACES_OPS, RR_TARGET)


In [None]:
# === 旧モデル(特徴量なし)を新仕様(StdScorePolicy)で再計算して保存 ===

import os

import traceback

import pandas as pd

from tqdm import tqdm



from modules import training, policies, preprocessing, simulation

from modules.constants import LocalPaths



# Load the old model (the one without the extra feature).
keiba_ai_no_jockey = training.KeibaAIFactory.load('models/20251223/basemodel_2020_2025.pickle')
keiba_ai_no_jockey.set_params(keiba_ai_no_jockey.get_params())

# Create the ReturnProcessor / Simulator only if no `simulator` exists yet.
try:
    simulator
except NameError:
    return_processor = preprocessing.ReturnProcessor(filepath=LocalPaths.RAW_RETURN_TABLES_PATH)
    simulator = simulation.Simulator(return_processor)

# Sweep the threshold over 0.0 .. 3.5, both ends included.
T_RANGE_OLD = [0.0, 3.5]
N_SAMPLES_OLD = 100

score_table_old = keiba_ai_no_jockey.calc_score(keiba_ai_no_jockey.datasets.X_test, policies.StdScorePolicy)

returns_old = {}
for i in tqdm(range(N_SAMPLES_OLD)):
    # Evenly spaced thresholds; a single sample falls back to the lower bound.
    threshold = (
        T_RANGE_OLD[0] + (T_RANGE_OLD[1] - T_RANGE_OLD[0]) * i / (N_SAMPLES_OLD - 1)
        if N_SAMPLES_OLD > 1
        else T_RANGE_OLD[0]
    )
    try:
        actions_old = keiba_ai_no_jockey.decide_action(
            score_table_old,
            policies.BetPolicyTansho,
            threshold=threshold,
        )
        returns_old[threshold] = simulator.calc_returns(actions_old)
    except Exception:
        # Stop the sweep at the first failure; results collected so far are
        # still saved below.
        traceback.print_exc()
        break

returns_old_df = pd.DataFrame.from_dict(returns_old, orient='index').sort_index()
returns_old_df.index.name = 'threshold'

returns_old_path = 'models/20251226/tansho_no_jockey_std_0_3p5.pickle'
os.makedirs('models/20251226', exist_ok=True)
returns_old_df.to_pickle(returns_old_path)

print('saved:', returns_old_path)
print('index min/max/len:', float(returns_old_df.index.min()), float(returns_old_df.index.max()), len(returns_old_df))
returns_old_df.head()


## 6.5. 過去日（2025/12/21）の当日予想→券種別回収率シミュレーション

このセクションは 6.3 の「当日予想」セルと同じ処理（出馬表→結合→特徴量→スコア）を、**過去日**の指定 race_id に対して実行し、指定ルールで馬券を買ったと仮定した回収率を計算します。

注意:
- `DataMerger` 側の `date < 対象日` フィルタ（馬の過去成績集計）に依存してリークを避けます。
- ただし **使用モデルが対象日のデータを学習に含んでいる場合**、評価は楽観的になり得ます。
- 払戻テーブル（return_tables）に race_id が無いと、そのレースは集計から除外されます。

In [None]:
import os
import time
import math
import numpy as np
import pandas as pd

from modules import preparing, preprocessing, policies, training, simulation
from modules.constants import LocalPaths, ResultsCols

# --- Target races (2025/12/21: 12 races x 3 venues = 36 races) ---
SIM_DATE_STR = '2025/12/21'  # date argument of scrape_shutuba_table (yyyy/mm/dd)
BASE_RACE_IDS = [
    '202506050601',
    '202509050601',
    '202507050601',
]

# Replace the trailing "01" of each base id with 01..12.
race_id_list = sorted({
    base[:-2] + f'{i:02d}'
    for base in BASE_RACE_IDS
    for i in range(1, 13)
})
print('race_id_list size:', len(race_id_list))

# --- Model to use ---
MODEL_PATH = 'models/20251226/basemodel_2020_2025.pickle'
if not os.path.exists(MODEL_PATH):
    # Fall back to an existing basemodel so a renamed file still works:
    # the lexicographically last basemodel_*.pickle in the directory is used.
    model_dir = os.path.join('models', '20251226')
    candidates = [
        os.path.join(model_dir, f)
        for f in os.listdir(model_dir)
        if f.startswith('basemodel_') and f.endswith('.pickle')
    ]
    if not candidates:
        raise FileNotFoundError('basemodel_*.pickle が models/20251226 に見つかりません')
    MODEL_PATH = max(candidates)
    print('[WARN] 指定モデルが無いため自動選択:', MODEL_PATH)

keiba_ai = training.KeibaAIFactory.load(MODEL_PATH)
score_policy = policies.StdScorePolicy

# --- Processors built from the latest raw tables, plus the simulator ---
horse_results_processor = preprocessing.HorseResultsProcessor(filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
horse_info_processor = preprocessing.HorseInfoProcessor(filepath=LocalPaths.RAW_HORSE_INFO_PATH)
peds_processor = preprocessing.PedsProcessor(filepath=LocalPaths.RAW_PEDS_PATH)
return_processor = preprocessing.ReturnProcessor(filepath=LocalPaths.RAW_RETURN_TABLES_PATH)
simulator = simulation.Simulator(return_processor)

# If the chapter-6 variables are missing, continue with empty lists
# (this only reduces the number of features).
if 'TARGET_COLS' not in globals():
    TARGET_COLS = []
    print('[WARN] TARGET_COLS が未定義なので空で進めます（特徴量が減ります）。')
if 'GROUP_COLS' not in globals():
    GROUP_COLS = []
    print('[WARN] GROUP_COLS が未定義なので空で進めます（特徴量が減ります）。')

In [None]:
# --- 1) Scrape the race cards (36 races) ---
out_dir = os.path.join(LocalPaths.TMP_DIR, 'shutuba_20251221')
os.makedirs(out_dir, exist_ok=True)

created = skipped = 0
for rid in race_id_list:
    out_path = os.path.join(out_dir, f'{rid}.pickle')
    if not os.path.exists(out_path):
        time.sleep(1)  # be gentle with the server
        preparing.scrape_shutuba_table(rid, SIM_DATE_STR, out_path)
        created += 1
    else:
        # Already scraped in an earlier run; keep the existing pickle.
        skipped += 1

print('scrape done. created=', created, 'skipped(existing)=', skipped, 'dir=', out_dir)

In [None]:
# --- 2) Concatenate race-card pickles -> preprocess -> merge -> features ---
paths = [
    p
    for p in (os.path.join(out_dir, f'{rid}.pickle') for rid in race_id_list)
    if os.path.exists(p)
]
print('available shutuba pickles:', len(paths), '/', len(race_id_list))

if not paths:
    raise RuntimeError('出馬表pickleが1件もありません。先にスクレイピングセルを実行してください。')

# Stack all per-race tables, keeping their original index.
raw_list = [pd.read_pickle(p) for p in paths]
shutuba_raw = pd.concat(raw_list, axis=0, ignore_index=False)
shutuba_all_path = os.path.join(LocalPaths.TMP_DIR, 'shutuba_20251221_all.pickle')
shutuba_raw.to_pickle(shutuba_all_path)
print('saved:', shutuba_all_path, 'rows=', len(shutuba_raw))

# Race-card preprocessing (per the original note, the processor has been
# changed to keep the race_id meta column).
shutuba_table_processor = preprocessing.ShutubaTableProcessor(shutuba_all_path)

# Table merge
shutuba_data_merger = preprocessing.ShutubaDataMerger(
    shutuba_table_processor,
    horse_results_processor,
    horse_info_processor,
    peds_processor,
    target_cols=TARGET_COLS,
    group_cols=GROUP_COLS,
)
shutuba_data_merger.merge()

# Feature engineering
feature_enginnering_shutuba = (
    preprocessing.FeatureEngineering(shutuba_data_merger)
    .add_interval()
    .add_agedays()
    .dumminize_ground_state()
    .dumminize_race_type()
    .dumminize_sex()
    .dumminize_weather()
    .encode_horse_id()
    .encode_jockey_id()
    .encode_trainer_id()
    .encode_owner_id()
    .encode_breeder_id()
    .dumminize_kaisai()
    .dumminize_around()
    .dumminize_race_class()
)

X_shutuba = feature_enginnering_shutuba.featured_data
print('X_shutuba shape:', X_shutuba.shape)

In [None]:
# --- 2.5) Pick up code edits (module reload) ---
import importlib
import modules.policies._score_policy as _score_policy_mod
import modules.policies as _policies_mod

# Reload the submodule first, then the package that re-exports from it.
importlib.reload(_score_policy_mod)
importlib.reload(_policies_mod)

from modules import policies as policies  # re-import
# Rebind score_policy so it points at the class from the reloaded module.
score_policy = policies.StdScorePolicy
print('reloaded policies._score_policy')

In [None]:
# --- 3) Compute scores (standardised within each race) ---
score_table_20251221 = keiba_ai.calc_score(X_shutuba, score_policy)
print('score_table shape:', score_table_20251221.shape)
display(score_table_20251221.head())

# Normalise the dtypes of the convenience columns:
# horse number -> nullable integer, race_id -> str.
# NOTE(review): assumes calc_score returns `race_id` as a column, not as the index - confirm.
if ResultsCols.UMABAN in score_table_20251221.columns:
    score_table_20251221[ResultsCols.UMABAN] = pd.to_numeric(score_table_20251221[ResultsCols.UMABAN], errors='coerce').astype('Int64')
score_table_20251221['race_id'] = score_table_20251221['race_id'].astype(str)

In [None]:
# --- 4) 券種別ルールで actions を作る（新ルール準拠） ---
def _umaban_col(df: pd.DataFrame) -> str:
    """Return the name of the horse-number column present in ``df``.

    ``ResultsCols.UMABAN`` is preferred; the literal ``'umaban'`` is the
    fallback.

    Raises:
        KeyError: if neither column exists.
    """
    for candidate in (ResultsCols.UMABAN, 'umaban'):
        if candidate in df.columns:
            return candidate
    raise KeyError('umaban列が見つかりません')

def _sorted_umaban_by_score(df_1race: pd.DataFrame) -> list[int]:
    """Return the horse numbers of one race ordered by descending score.

    Rows whose score or horse number cannot be parsed as a number are dropped.

    Raises:
        KeyError: if the ``score`` column (or a horse-number column) is missing.
    """
    if 'score' not in df_1race.columns:
        raise KeyError('score列が見つかりません')
    ranked = df_1race.copy()
    ranked['score'] = pd.to_numeric(ranked['score'], errors='coerce')
    ucol = _umaban_col(ranked)
    ranked[ucol] = pd.to_numeric(ranked[ucol], errors='coerce')
    ranked = ranked.dropna(subset=['score', ucol])
    ranked = ranked.sort_values('score', ascending=False)
    return ranked[ucol].astype(int).tolist()

def _pick_W(df_1race: pd.DataFrame, low: float = 2.0, high: float = 2.7) -> list[int]:
    """Return horse numbers whose score lies in ``[low, high]``, best first.

    Both bounds are inclusive. Rows with a non-numeric score or horse number
    are ignored.
    """
    pool = df_1race.copy()
    pool['score'] = pd.to_numeric(pool['score'], errors='coerce')
    ucol = _umaban_col(pool)
    pool[ucol] = pd.to_numeric(pool[ucol], errors='coerce')
    pool = pool.dropna(subset=['score', ucol])
    in_band = pool['score'].between(float(low), float(high))
    picked = pool[in_band].sort_values('score', ascending=False)
    return picked[ucol].astype(int).tolist()

def _pick_P(df_1race: pd.DataFrame, min_score: float = 1.6, cap_top: int = 3) -> list[int]:
    """Return horse numbers with ``score >= min_score``, best first, capped.

    Rows with a non-numeric score or horse number are ignored. At most
    ``cap_top`` horses are returned.

    The cap used to be applied only behind a hard-coded ``len >= 4`` check,
    which matched the default ``cap_top=3`` but ignored smaller caps (for
    example ``cap_top=2`` with three qualifying horses returned all three).
    The cap is now honoured for every ``cap_top``; results for the default are
    unchanged.
    """
    df = df_1race.copy()
    df['score'] = pd.to_numeric(df['score'], errors='coerce')
    ucol = _umaban_col(df)
    df[ucol] = pd.to_numeric(df[ucol], errors='coerce')
    df = df.dropna(subset=['score', ucol])
    df = df[df['score'] >= float(min_score)].sort_values('score', ascending=False)
    uma = df[ucol].astype(int).tolist()
    # Keep only the top `cap_top` horses when more than that qualify.
    cap = int(cap_top)
    if len(uma) > cap:
        uma = uma[:cap]
    return uma

def _unique_keep_order(xs: list[int]) -> list[int]:
    """Return ``xs`` without duplicates, keeping each first occurrence in order."""
    # dict preserves insertion order, so its keys are the de-duplicated sequence.
    return list(dict.fromkeys(xs))

def _fill_from_sorted(sorted_all: list[int], exclude: list[int], need: int, cap: int | None = None) -> list[int]:
    ex = set(exclude)
    out: list[int] = []
    for u in sorted_all:
        if u in ex:
            continue
        out.append(u)
        ex.add(u)
        if cap is not None and len(out) >= cap:
            break
        if len(out) >= need:
            break
    return out

def build_actions_by_ticket(score_table: pd.DataFrame) -> dict[str, dict]:
    """Build per-race bet selections for seven ticket types from a score table.

    ``score_table`` is grouped by its ``race_id`` column. For every race two
    candidate lists are derived with the helper defaults:

    W (win-axis candidates): ``_pick_W`` -> 2.0 <= score <= 2.7
    P (place candidates): ``_pick_P`` -> score >= 1.6 (top 3 only when 4 or more)

    Selections (an empty list means "no bet"; per the original note, bets are
    expressed as BOX / plain lists because of how the Simulator takes them):

    tansho (win): W
    fukusho (place): P
    umaren (quinella): P as a BOX, only when len(P) >= 2
    wide: P as a BOX; when len(P) == 2 the next-best horse is added to make 3
    umatan (exacta): only when W exists; [top W + up to 2 horses of P], BOX
    sanrenpuku (trio): P (first 3) topped up from the overall ranking to at
        most 5 horses, BOX; needs at least 3 horses
    sanrentan (trifecta): only when W exists; [top W + P] topped up from the
        overall ranking to at most 4 horses, BOX; needs at least 3 horses

    Returns:
        ``{ticket: {race_id: {ticket: [horse numbers]}}}``.
    """
    actions_by_ticket: dict[str, dict] = {
        'tansho': {},
        'fukusho': {},
        'umaren': {},
        'umatan': {},
        'wide': {},
        'sanrenpuku': {},
        'sanrentan': {},
    }
    for rid, df_r in score_table.groupby('race_id'):
        sorted_all = _sorted_umaban_by_score(df_r)
        W = _pick_W(df_r)
        P = _pick_P(df_r)

        # Win
        actions_by_ticket['tansho'][rid] = {'tansho': W}

        # Place
        actions_by_ticket['fukusho'][rid] = {'fukusho': P}

        # Quinella (BOX)
        actions_by_ticket['umaren'][rid] = {'umaren': P if len(P) >= 2 else []}

        # Wide (BOX): when P has exactly 2 horses, add the next-best one to make 3
        wide_candidates = P
        if len(wide_candidates) == 2:
            fill = _fill_from_sorted(sorted_all, exclude=wide_candidates, need=1, cap=1)
            wide_candidates = _unique_keep_order(wide_candidates + fill)
        actions_by_ticket['wide'][rid] = {'wide': wide_candidates if len(wide_candidates) >= 2 else []}

        # Exacta (BOX = permutations): skipped when W is empty
        if len(W) >= 1:
            umatan_candidates = [W[0]]
            umatan_candidates += [u for u in P if u != W[0]][:2]
            umatan_candidates = _unique_keep_order(umatan_candidates)
            actions_by_ticket['umatan'][rid] = {'umatan': umatan_candidates if len(umatan_candidates) >= 2 else []}
        else:
            actions_by_ticket['umatan'][rid] = {'umatan': []}

        # Trio (BOX): start from P, fill the shortfall, at most 5 horses.
        # Note: the fill draws from the overall ranking, so this ticket is bet
        # whenever the race has at least 3 scored horses, even if P is empty.
        tri_candidates = _unique_keep_order(P[:3])
        if len(tri_candidates) < 3:
            tri_candidates += _fill_from_sorted(sorted_all, exclude=tri_candidates, need=(3 - len(tri_candidates)))
        if len(tri_candidates) < 3:
            actions_by_ticket['sanrenpuku'][rid] = {'sanrenpuku': []}
        else:
            # Upper limit of 5 horses (5C3 = 10 combinations)
            if len(tri_candidates) < 5:
                tri_candidates += _fill_from_sorted(sorted_all, exclude=tri_candidates, need=(5 - len(tri_candidates)), cap=(5 - len(tri_candidates)))
            tri_candidates = tri_candidates[:5]
            actions_by_ticket['sanrenpuku'][rid] = {'sanrenpuku': tri_candidates}

        # Trifecta (BOX = permutations): skipped when W is empty, at most 4 horses (4P3 = 24 combinations)
        if len(W) >= 1:
            tri1_candidates = [W[0]] + [u for u in P if u != W[0]]
            tri1_candidates = _unique_keep_order(tri1_candidates)
            if len(tri1_candidates) < 4:
                tri1_candidates += _fill_from_sorted(sorted_all, exclude=tri1_candidates, need=(4 - len(tri1_candidates)), cap=(4 - len(tri1_candidates)))
            tri1_candidates = tri1_candidates[:4]
            actions_by_ticket['sanrentan'][rid] = {'sanrentan': tri1_candidates if len(tri1_candidates) >= 3 else []}
        else:
            actions_by_ticket['sanrentan'][rid] = {'sanrentan': []}
    return actions_by_ticket

# Build the selections for the 2025/12/21 score table and print a quick sanity
# summary: the ticket types produced and the number of races covered.
actions_by_ticket = build_actions_by_ticket(score_table_20251221)
print('tickets:', list(actions_by_ticket.keys()))
print('races in actions:', len(actions_by_ticket['tansho']))

In [None]:
# --- 5) Aggregate returns per ticket type (per the original note, race_ids
#        missing from the return tables are skipped automatically) ---
rows = []
detail_by_ticket = {}
for ticket, actions_ticket in actions_by_ticket.items():
    returns_per_race = simulator.calc_returns_per_race(actions_ticket)
    returns = simulator.calc_returns(actions_ticket)
    detail_by_ticket[ticket] = returns_per_race.sort_index()

    # Races we targeted versus races that actually appear in the per-race result.
    n_target = len(actions_ticket)
    n_found = returns_per_race.index.nunique()
    skipped_races = n_target - n_found

    row = {
        'ticket': ticket,
        'n_races_target': n_target,
        'n_races_in_return_tables': n_found,
        'n_races_skipped': skipped_races,
    }
    row.update(returns)
    rows.append(row)

summary_20251221 = pd.DataFrame(rows).sort_values('ticket').reset_index(drop=True)
display(summary_20251221)

# Example: per-race detail for the win ticket
display(detail_by_ticket['tansho'].head())

In [None]:
# --- 6.5) 払戻（return_tables）欠損の補完（欠損race_idのみ取得） ---
import pandas as pd

from modules.constants import LocalPaths
from modules.preparing._scrape_html import scrape_html_race
from modules.preparing._get_rawdata import get_rawdata_return, update_rawdata
from modules.preprocessing._return_processor import ReturnProcessor
from modules.simulation._simulator import Simulator

# Collect the race_ids already present in return_tables (MultiIndex-aware:
# the first level is used when the index has several levels).
raw_return_tables = return_processor.raw_data
return_index = raw_return_tables.index
if getattr(return_index, 'nlevels', 1) > 1:
    return_index = return_index.get_level_values(0)
existing_race_ids = set(return_index.astype(str))

missing_race_ids = sorted({str(r) for r in race_id_list} - existing_race_ids)
print(f'missing race_id in return_tables: {len(missing_race_ids)}')
if not missing_race_ids:
    print('欠損はありません（このまま回収率集計結果を採用できます）。')
else:
    display(pd.Series(missing_race_ids, name='missing_race_id').head(20))

    # 1) Fetch the race html for the missing races only
    updated_html_paths = scrape_html_race(missing_race_ids, skip=False)

    # 2) Build raw return tables and append them to the existing pickle
    new_return_df = get_rawdata_return(updated_html_paths)
    _ = update_rawdata(LocalPaths.RAW_RETURN_TABLES_PATH, new_return_df, mode='update')

    # 3) Recreate ReturnProcessor / Simulator on the updated data
    return_processor = ReturnProcessor(LocalPaths.RAW_RETURN_TABLES_PATH)
    simulator = Simulator(return_processor)

    print('return_tables updated. 再集計したい場合は、直前の回収率集計セル（6.5）を再実行してください。')


## 6.6. 過去日（2026/01/04）の当日予想→券種別回収率シミュレーション

- 対象: 2026/01/04（日）
- race_id: 202606010501~12 / 202608010501~12 （合計24R）
- 6.5 と同じルールで actions を生成して回収率を集計

In [None]:
# --- 0) 対象レース設定（2026/01/04 全12R x 2開催 = 24レース） ---
import os
import time
import numpy as np
import pandas as pd

from modules import preparing, preprocessing, policies, training, simulation
from modules.constants import LocalPaths, ResultsCols

SIM_DATE_STR_20260104 = '2026/01/04'  # date argument of scrape_shutuba_table (yyyy/mm/dd)
BASE_RACE_IDS_20260104 = [
    '202606010501',
    '202608010501',
]

# Replace the trailing "01" of each base id with 01..12.
race_id_list_20260104: list[str] = sorted({
    base[:-2] + f'{i:02d}'
    for base in BASE_RACE_IDS_20260104
    for i in range(1, 13)
})
print('race_id_list_20260104 size:', len(race_id_list_20260104))

# Minimal initialisation so this section also runs when 6.5 was not executed.
# An already loaded `keiba_ai` is reused as-is.
if 'keiba_ai' not in globals():
    MODEL_PATH = 'models/20260103/basemodel_2020_2025.pickle'
    if not os.path.exists(MODEL_PATH):
        # Fall back to the lexicographically last basemodel_*.pickle.
        model_dir = os.path.join('models', '20260103')
        candidates = [
            os.path.join(model_dir, f)
            for f in os.listdir(model_dir)
            if f.startswith('basemodel_') and f.endswith('.pickle')
        ]
        if not candidates:
            raise FileNotFoundError('basemodel_*.pickle が models/20260103 に見つかりません')
        MODEL_PATH = max(candidates)
        print('[WARN] 指定モデルが無いため自動選択:', MODEL_PATH)

    keiba_ai = training.KeibaAIFactory.load(MODEL_PATH)

if 'score_policy' not in globals():
    score_policy = policies.StdScorePolicy

# raw processors / simulator
horse_results_processor = preprocessing.HorseResultsProcessor(filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
horse_info_processor = preprocessing.HorseInfoProcessor(filepath=LocalPaths.RAW_HORSE_INFO_PATH)
peds_processor = preprocessing.PedsProcessor(filepath=LocalPaths.RAW_PEDS_PATH)
return_processor = preprocessing.ReturnProcessor(filepath=LocalPaths.RAW_RETURN_TABLES_PATH)
simulator = simulation.Simulator(return_processor)

# Continue with empty lists when the chapter-6 variables are undefined.
if 'TARGET_COLS' not in globals():
    TARGET_COLS = []
    print('[WARN] TARGET_COLS が未定義なので空で進めます（特徴量が減ります）。')
if 'GROUP_COLS' not in globals():
    GROUP_COLS = []
    print('[WARN] GROUP_COLS が未定義なので空で進めます（特徴量が減ります）。')

out_dir_20260104 = os.path.join(LocalPaths.TMP_DIR, 'shutuba_20260104')
os.makedirs(out_dir_20260104, exist_ok=True)
print('out_dir_20260104:', out_dir_20260104)

In [None]:
# --- 1) Scrape the race cards (24 races) ---
created = skipped = 0
for rid in race_id_list_20260104:
    out_path = os.path.join(out_dir_20260104, f'{rid}.pickle')
    if not os.path.exists(out_path):
        time.sleep(1)  # be gentle with the server
        preparing.scrape_shutuba_table(rid, SIM_DATE_STR_20260104, out_path)
        created += 1
    else:
        # Already scraped in an earlier run; keep the existing pickle.
        skipped += 1

print('scrape done. created=', created, 'skipped(existing)=', skipped, 'dir=', out_dir_20260104)

In [None]:
# --- 2) Concatenate race-card pickles -> preprocess -> merge -> features ---
paths = [
    p
    for p in (os.path.join(out_dir_20260104, f'{rid}.pickle') for rid in race_id_list_20260104)
    if os.path.exists(p)
]
print('available shutuba pickles:', len(paths), '/', len(race_id_list_20260104))

if not paths:
    raise RuntimeError('出馬表pickleが1件もありません。先にスクレイピングセルを実行してください。')

# Stack all per-race tables, keeping their original index.
raw_list = [pd.read_pickle(p) for p in paths]
shutuba_raw_20260104 = pd.concat(raw_list, axis=0, ignore_index=False)
shutuba_all_path_20260104 = os.path.join(LocalPaths.TMP_DIR, 'shutuba_20260104_all.pickle')
shutuba_raw_20260104.to_pickle(shutuba_all_path_20260104)
print('saved:', shutuba_all_path_20260104, 'rows=', len(shutuba_raw_20260104))

# Race-card preprocessing
shutuba_table_processor_20260104 = preprocessing.ShutubaTableProcessor(shutuba_all_path_20260104)

# Table merge
shutuba_data_merger_20260104 = preprocessing.ShutubaDataMerger(
    shutuba_table_processor_20260104,
    horse_results_processor,
    horse_info_processor,
    peds_processor,
    target_cols=TARGET_COLS,
    group_cols=GROUP_COLS,
)
shutuba_data_merger_20260104.merge()

# Feature engineering
feature_enginnering_shutuba_20260104 = (
    preprocessing.FeatureEngineering(shutuba_data_merger_20260104)
    .add_interval()
    .add_agedays()
    .dumminize_ground_state()
    .dumminize_race_type()
    .dumminize_sex()
    .dumminize_weather()
    .encode_horse_id()
    .encode_jockey_id()
    .encode_trainer_id()
    .encode_owner_id()
    .encode_breeder_id()
    .dumminize_kaisai()
    .dumminize_around()
    .dumminize_race_class()
)

X_shutuba_20260104 = feature_enginnering_shutuba_20260104.featured_data
print('X_shutuba_20260104 shape:', X_shutuba_20260104.shape)

In [None]:
# --- 3) Compute scores (standardised within each race) ---
score_table_20260104 = keiba_ai.calc_score(X_shutuba_20260104, score_policy)
print('score_table_20260104 shape:', score_table_20260104.shape)
display(score_table_20260104.head())

# Normalise dtypes: horse number -> nullable integer, race_id -> str.
# NOTE(review): assumes calc_score returns `race_id` as a column, not as the index - confirm.
if ResultsCols.UMABAN in score_table_20260104.columns:
    score_table_20260104[ResultsCols.UMABAN] = pd.to_numeric(score_table_20260104[ResultsCols.UMABAN], errors='coerce').astype('Int64')
score_table_20260104['race_id'] = score_table_20260104['race_id'].astype(str)

In [None]:
# --- 4) 券種別ルールで actions を作る → 回収率を集計 ---
# 6.5で定義済みなら再利用、無ければここで定義
if 'build_actions_by_ticket' not in globals():
    def _umaban_col(df: pd.DataFrame) -> str:
        if ResultsCols.UMABAN in df.columns:
            return ResultsCols.UMABAN
        if 'umaban' in df.columns:
            return 'umaban'
        raise KeyError('umaban列が見つかりません')

    def _sorted_umaban_by_score(df_1race: pd.DataFrame) -> list[int]:
        df = df_1race.copy()
        if 'score' not in df.columns:
            raise KeyError('score列が見つかりません')
        df['score'] = pd.to_numeric(df['score'], errors='coerce')
        ucol = _umaban_col(df)
        df[ucol] = pd.to_numeric(df[ucol], errors='coerce')
        df = df.dropna(subset=['score', ucol]).sort_values('score', ascending=False)
        return df[ucol].astype(int).tolist()

    def _pick_W(df_1race: pd.DataFrame, low: float = 2.0, high: float = 2.7) -> list[int]:
        df = df_1race.copy()
        df['score'] = pd.to_numeric(df['score'], errors='coerce')
        ucol = _umaban_col(df)
        df[ucol] = pd.to_numeric(df[ucol], errors='coerce')
        df = df.dropna(subset=['score', ucol])
        df = df[(df['score'] >= float(low)) & (df['score'] <= float(high))]
        df = df.sort_values('score', ascending=False)
        return df[ucol].astype(int).tolist()

    def _pick_P(df_1race: pd.DataFrame, min_score: float = 1.6, cap_top: int = 3) -> list[int]:
        df = df_1race.copy()
        df['score'] = pd.to_numeric(df['score'], errors='coerce')
        ucol = _umaban_col(df)
        df[ucol] = pd.to_numeric(df[ucol], errors='coerce')
        df = df.dropna(subset=['score', ucol])
        df = df[df['score'] >= float(min_score)].sort_values('score', ascending=False)
        uma = df[ucol].astype(int).tolist()
        # 4頭以上いる場合は上位3頭に絞る（あなたのルール）
        if len(uma) >= 4:
            uma = uma[: int(cap_top)]
        return uma

    def _unique_keep_order(xs: list[int]) -> list[int]:
        seen = set()
        out: list[int] = []
        for x in xs:
            if x in seen:
                continue
            seen.add(x)
            out.append(x)
        return out

    def _fill_from_sorted(sorted_all: list[int], exclude: list[int], need: int, cap: int | None = None) -> list[int]:
        ex = set(exclude)
        out: list[int] = []
        for u in sorted_all:
            if u in ex:
                continue
            out.append(u)
            ex.add(u)
            if cap is not None and len(out) >= cap:
                break
            if len(out) >= need:
                break
        return out

    def build_actions_by_ticket(score_table: pd.DataFrame) -> dict[str, dict]:
        """新ルール準拠の買い目生成（Simulatorの仕様上、BOX/リストで表現）。

        W（勝ち軸候補）: 2.0 <= score <= 2.7
        P（複勝候補）: score >= 1.6（ただし4頭以上なら上位3頭にcap）

        単勝: W がいれば W に賭ける（いなければ見送り）
        複勝: P に賭ける（いなければ見送り）
        馬連: P のBOX（len(P) >= 2）
        ワイド: P のBOX（len(P)==2のときは次点を1頭補完して3頭に）
        馬単: W がいれば、[W上位1頭 + P上位2頭] を候補にしてBOX（順列）
        三連複: P を土台に不足分を次点で補完して最大5頭までBOX（len>=3）
        三連単: W がいれば候補最大4頭まででBOX（順列）（len>=3）
        """
        actions_by_ticket: dict[str, dict] = {
            'tansho': {},
            'fukusho': {},
            'umaren': {},
            'umatan': {},
            'wide': {},
            'sanrenpuku': {},
            'sanrentan': {},
        }
        for rid, df_r in score_table.groupby('race_id'):
            sorted_all = _sorted_umaban_by_score(df_r)
            W = _pick_W(df_r)
            P = _pick_P(df_r)

            # 単勝
            actions_by_ticket['tansho'][rid] = {'tansho': W}

            # 複勝
            actions_by_ticket['fukusho'][rid] = {'fukusho': P}

            # 馬連（BOX）
            actions_by_ticket['umaren'][rid] = {'umaren': P if len(P) >= 2 else []}

            # ワイド（BOX）: Pが2頭なら次点を1頭補完して3頭に
            wide_candidates = P
            if len(wide_candidates) == 2:
                fill = _fill_from_sorted(sorted_all, exclude=wide_candidates, need=1, cap=1)
                wide_candidates = _unique_keep_order(wide_candidates + fill)
            actions_by_ticket['wide'][rid] = {'wide': wide_candidates if len(wide_candidates) >= 2 else []}

            # 馬単（BOX=順列）: Wが無ければ見送り
            if len(W) >= 1:
                umatan_candidates = [W[0]]
                umatan_candidates += [u for u in P if u != W[0]][:2]
                umatan_candidates = _unique_keep_order(umatan_candidates)
                actions_by_ticket['umatan'][rid] = {'umatan': umatan_candidates if len(umatan_candidates) >= 2 else []}
            else:
                actions_by_ticket['umatan'][rid] = {'umatan': []}

            # 三連複（BOX）: Pを土台に不足分を補完、上限5頭
            tri_candidates = _unique_keep_order(P[:3])
            if len(tri_candidates) < 3:
                tri_candidates += _fill_from_sorted(sorted_all, exclude=tri_candidates, need=(3 - len(tri_candidates)))
            if len(tri_candidates) < 3:
                actions_by_ticket['sanrenpuku'][rid] = {'sanrenpuku': []}
            else:
                # 上限5頭（5C3=10点）
                if len(tri_candidates) < 5:
                    tri_candidates += _fill_from_sorted(sorted_all, exclude=tri_candidates, need=(5 - len(tri_candidates)), cap=(5 - len(tri_candidates)))
                tri_candidates = tri_candidates[:5]
                actions_by_ticket['sanrenpuku'][rid] = {'sanrenpuku': tri_candidates}

            # 三連単（BOX=順列）: Wが無ければ見送り、上限4頭（4P3=24点）
            if len(W) >= 1:
                tri1_candidates = [W[0]] + [u for u in P if u != W[0]]
                tri1_candidates = _unique_keep_order(tri1_candidates)
                if len(tri1_candidates) < 4:
                    tri1_candidates += _fill_from_sorted(sorted_all, exclude=tri1_candidates, need=(4 - len(tri1_candidates)), cap=(4 - len(tri1_candidates)))
                tri1_candidates = tri1_candidates[:4]
                actions_by_ticket['sanrentan'][rid] = {'sanrentan': tri1_candidates if len(tri1_candidates) >= 3 else []}
            else:
                actions_by_ticket['sanrentan'][rid] = {'sanrentan': []}
        return actions_by_ticket

actions_by_ticket_20260104 = build_actions_by_ticket(score_table_20260104)
print('tickets:', list(actions_by_ticket_20260104.keys()))
print('races in actions:', len(actions_by_ticket_20260104['tansho']))

# Aggregate returns per ticket type and keep the per-race detail.
rows = []
detail_by_ticket_20260104 = {}
for ticket, actions_ticket in actions_by_ticket_20260104.items():
    returns_per_race = simulator.calc_returns_per_race(actions_ticket)
    returns = simulator.calc_returns(actions_ticket)
    detail_by_ticket_20260104[ticket] = returns_per_race.sort_index()

    # Races we targeted versus races that actually appear in the per-race result.
    n_target = len(actions_ticket)
    n_found = returns_per_race.index.nunique()
    skipped_races = n_target - n_found

    row = {
        'ticket': ticket,
        'n_races_target': n_target,
        'n_races_in_return_tables': n_found,
        'n_races_skipped': skipped_races,
    }
    row.update(returns)
    rows.append(row)

summary_20260104 = pd.DataFrame(rows).sort_values('ticket').reset_index(drop=True)
display(summary_20260104)

In [None]:
# --- 4.1) Debug: consistency check between actions and return_tables (safe to delete) ---
rid = str(race_id_list_20260104[0])
print('sample race_id:', rid)

# Inspect the actions for the sample race (are they empty?)
try:
    print('tansho action sample:', actions_by_ticket_20260104['tansho'][rid])
    print('fukusho action sample:', actions_by_ticket_20260104['fukusho'][rid])
except Exception as e:
    print('actions sample check failed:', e)

# Inspect return_tables for the sample race (are the ticket labels present?)
try:
    raw_rt = pd.read_pickle(LocalPaths.RAW_RETURN_TABLES_PATH)
    one = raw_rt.loc[rid]
    print('return_tables rows for sample:', len(one) if hasattr(one, '__len__') else '1')
    # The first column (0) is assumed to hold the ticket-type name
    if isinstance(one, pd.Series):
        # .loc returned a single row
        labels = [one.get(0)]
    else:
        labels = one[0].astype(str).unique().tolist()
    print('ticket labels in return_tables sample:', labels)
except Exception as e:
    print('return_tables sample check failed:', e)

In [None]:
# --- 4.5) Reload edited modules (so the notebook picks up the changes) ---
import importlib

# Reload the constants side (UrlPaths etc.) as well
import modules.constants._url_paths as _url_paths
import modules.constants as _constants

import modules.preparing._get_rawdata as _get_rawdata
import modules.preparing._scrape_html as _scrape_html

# Constants are reloaded before the preparing modules.
importlib.reload(_url_paths)
importlib.reload(_constants)
importlib.reload(_get_rawdata)
importlib.reload(_scrape_html)

# Re-import so the names used in this notebook point at the reloaded functions
from modules.preparing._get_rawdata import get_rawdata_return, update_rawdata
from modules.preparing._scrape_html import scrape_html_race

print('reloaded:', _url_paths.__name__, _constants.__name__, _get_rawdata.__name__, _scrape_html.__name__)

In [None]:
# --- 5) 払戻（return_tables）欠損の確認＆必要なら補完 ---
import re
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup

from modules.constants import LocalPaths
from modules.preprocessing._return_processor import ReturnProcessor
from modules.simulation._simulator import Simulator

# NOTE: get_rawdata_return / scrape_html_race / update_rawdata were already
# re-imported by the preceding "reload" cell.
# Collect the race_ids already present in the raw return tables. For a
# multi-level index only the first level is used as the race_id.
raw_return_tables = return_processor.raw_data
if getattr(raw_return_tables.index, 'nlevels', 1) > 1:
    existing_race_ids = set(raw_return_tables.index.get_level_values(0).astype(str))
else:
    existing_race_ids = set(raw_return_tables.index.astype(str))

# Target race_ids (as strings) that have no entry in the return tables.
missing_race_ids_20260104 = sorted(set(map(str, race_id_list_20260104)) - existing_race_ids)
print(f'missing race_id in return_tables (20260104): {len(missing_race_ids_20260104)}')

# Set to True when existing data should be replaced (e.g. to fix mojibake).
FORCE_REFRESH_RETURN_20260104 = False  # True re-fetches and replaces all 24 races
target_race_ids = list(map(str, race_id_list_20260104)) if FORCE_REFRESH_RETURN_20260104 else missing_race_ids_20260104
update_mode = 'replace' if FORCE_REFRESH_RETURN_20260104 else 'update'

if len(target_race_ids) > 0:
    if not FORCE_REFRESH_RETURN_20260104:
        display(pd.Series(target_race_ids, name='missing_race_id').head(20))
    else:
        print(f'FORCE_REFRESH_RETURN_20260104=True: 対象 {len(target_race_ids)}R を再取得して置換します')

    # Re-scrape the race pages (skip=False) for the target races.
    updated_html_paths = scrape_html_race(target_race_ids, skip=False)
    print('scraped html files:', len(updated_html_paths))
    if len(updated_html_paths) > 0:
        display(pd.Series(updated_html_paths, name='updated_html_path').head(5))

    try:
        new_return_df = get_rawdata_return(updated_html_paths)
    except Exception as e:
        print('get_rawdata_return failed.')
        print(str(e))

        # Extra information on failure: race_ids inferred from the file names
        # and a quick diagnosis of the HTML files.
        rids = []
        for p in updated_html_paths:
            m = re.findall(r'race\W(\d+)\.bin', str(p))
            if m:
                rids.append(m[0])
        print('race_id inferred from updated_html_paths (sample):', rids[:20])

        # Inspect only the first three files to keep the output short.
        for p in updated_html_paths[:3]:
            path = Path(p)
            b = path.read_bytes() if path.exists() else b''
            lower = b.lower()
            # Decode as EUC-JP when the raw bytes mention that charset, otherwise as UTF-8.
            if (b'euc-jp' in lower) or (b'euc_jp' in lower):
                s = b.decode('euc-jp', errors='ignore')
            else:
                s = b.decode('utf-8', errors='ignore')
            soup = BeautifulSoup(s, 'lxml')
            # Text of the first <h1> inside <div class="data_intro">, if any.
            data_intro = soup.find('div', attrs={'class': 'data_intro'})
            h1 = ''
            if data_intro:
                hs = data_intro.find_all('h1')
                if hs:
                    h1 = hs[0].get_text(strip=True)
            # Content of the og:title meta tag, if any.
            og_title = ''
            og = soup.find('meta', attrs={'property': 'og:title'})
            if og and og.get('content'):
                og_title = og.get('content')
            print('--- html diagnose ---')
            print('path:', str(path))
            print('size:', len(b))
            print('data_intro h1:', h1)
            print('og:title:', og_title)

        # Diagnostics are informational only; the original error still propagates,
        # so the update below runs only when parsing succeeded.
        raise
    _ = update_rawdata(LocalPaths.RAW_RETURN_TABLES_PATH, new_return_df, mode=update_mode)

    # Rebuild the processor and simulator so they read the updated pickle.
    return_processor = ReturnProcessor(LocalPaths.RAW_RETURN_TABLES_PATH)
    simulator = Simulator(return_processor)

    print('return_tables updated. 必要なら、上の回収率集計セルを再実行してください。')
else:
    print('欠損はありません（このまま回収率集計結果を採用できます）。')

In [None]:
# --- Diagnostics: list the differences between training features (train_cols) and inference features (X_debug) ---
import pandas as pd
from modules.constants import ResultsCols

# Pick the X to compare against (X_debug by default), falling back to the
# other inference frames that may exist in the notebook namespace.
if 'X_debug' in globals():
    X_cmp = X_debug.copy()
elif 'X_shutuba_20260104' in globals():
    X_cmp = X_shutuba_20260104.copy()
elif 'X_shutuba' in globals():
    X_cmp = X_shutuba.copy()
else:
    raise NameError('X_debug も X_shutuba_20260104 も X_shutuba も見つかりません')

# Training-time columns (taken from the datasets held by keiba_ai).
train_cols = list(keiba_ai.datasets.X_train.columns)
x_cols = list(X_cmp.columns)

# Order-preserving differences in both directions.
missing_in_inference = [c for c in train_cols if c not in x_cols]
extra_in_inference = [c for c in x_cols if c not in train_cols]

print('=== Feature diff ===')
print('train cols:', len(train_cols))
print('inference cols:', len(x_cols))
print('missing_in_inference (train - inference):', len(missing_in_inference))
print('extra_in_inference (inference - train):', len(extra_in_inference))

if len(missing_in_inference) > 0:
    display(pd.DataFrame({'missing_in_inference': missing_in_inference}))
if len(extra_in_inference) > 0:
    display(pd.DataFrame({'extra_in_inference': extra_in_inference}))

# For reference: state of the feature names stored on the LightGBM model.
# A `model` already present in the notebook namespace is reused as-is;
# otherwise it is read from keiba_ai's name-mangled private wrapper attribute.
if 'model' not in globals():
    try:
        model = getattr(keiba_ai, '_KeibaAI__model_wrapper').lgb_model
    except Exception:
        model = None
if model is not None:
    fn_attr = getattr(model, 'feature_name_', None)
    fn_booster = None
    try:
        fn_booster = model.booster_.feature_name()
    except Exception:
        # Best effort: leave fn_booster as None when the booster is unavailable.
        pass
    print('\n=== Model feature names ===')
    print('feature_name_ exists:', fn_attr is not None, 'len:', (len(fn_attr) if fn_attr is not None else None))
    if fn_attr is not None:
        print('feature_name_ head:', list(fn_attr)[:10])
    print('booster_.feature_name() exists:', fn_booster is not None, 'len:', (len(fn_booster) if fn_booster is not None else None))
    if fn_booster is not None:
        print('booster feature_name head:', list(fn_booster)[:10])

# For reference: reminder that columns missing on the inference side can be zero-filled.
# NOTE(review): this only prints a note; no check of the zero-fill is performed here.
if len(missing_in_inference) > 0:
    print('\n[NOTE] 欠けている列は推論時に0埋めして整列可能です。')

## 旧モデル vs 新モデル（同条件）: 閾値スイープ比較

- **目的**: 旧モデル（20260103）と新モデル（20260108）を、**同じ評価データ**（新モデルの `X_test`）に対してスコア→単勝/複勝の閾値スイープを回し、回収率曲線を比較する。
- **注意**: `simulator` と `return_processor`、および `T_RANGE`/`N_SAMPLES` が事前に作成済みである前提。

In [None]:
# Model paths (absolute paths as supplied by the user).
# NOTE(review): these are machine-specific Windows paths; adjust them when
# running the notebook on another machine.
OLD_MODEL_PATH = r"C:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\models\20260103\basemodel_2020_2025.pickle"
NEW_MODEL_PATH = r"C:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\models\20260108\basemodel_2020_2025.pickle"

print('OLD_MODEL_PATH:', OLD_MODEL_PATH)
print('NEW_MODEL_PATH:', NEW_MODEL_PATH)

In [None]:
# Load the old and the new model.
# Assumes `training` / `policies` are already imported (run the upstream cells if not).
keiba_ai_old = training.KeibaAIFactory.load(OLD_MODEL_PATH)
keiba_ai_new = training.KeibaAIFactory.load(NEW_MODEL_PATH)

print('loaded old:', type(keiba_ai_old))
print('loaded new:', type(keiba_ai_new))

# Report how many input features each underlying LightGBM model was fitted with
# (None when the attribute is absent).
old_lgb_model = keiba_ai_old._KeibaAI__model_wrapper.lgb_model
print('old model features:', getattr(old_lgb_model, 'n_features_in_', None))
new_lgb_model = keiba_ai_new._KeibaAI__model_wrapper.lgb_model
print('new model features:', getattr(new_lgb_model, 'n_features_in_', None))

In [None]:
# Build the common evaluation input X (based on the new model's test set).
# * The full-period test_data has a very large number of rows, which makes the
#   threshold sweep heavy. Compare on the most recent races first and lift the
#   limit only if needed.

from modules.constants import ResultsCols
import pandas as pd

# Parameters that restrict the evaluation to recent races (change as needed).
EVAL_MAX_RACES = 2000   # None uses every race (very heavy)
EVAL_MIN_DATE = None    # e.g. '2025-01-01' (a plain string is fine)

# Start from the new model's test_data.
_eval_df = keiba_ai_new.datasets.test_data.copy()

# Optional lower bound on the race date (applied only when a 'date' column exists).
if EVAL_MIN_DATE is not None and 'date' in _eval_df.columns:
    _eval_df = _eval_df[_eval_df['date'] >= pd.to_datetime(EVAL_MIN_DATE)]

# NOTE(review): when test_data has no 'race_id' column the EVAL_MAX_RACES limit
# is silently not applied — confirm that race_id is a column and not the index.
if EVAL_MAX_RACES is not None:
    if 'race_id' in _eval_df.columns:
        # Walk the rows in date order and keep the most recent race_ids
        # (duplicates are dropped so each race appears once).
        if 'date' in _eval_df.columns:
            race_order = (
                _eval_df.reset_index()
                .sort_values('date')
                .drop_duplicates('race_id', keep='last')['race_id']
            )
        else:
            # Without dates, fall back to order of first appearance.
            race_order = _eval_df['race_id'].drop_duplicates()

        keep_race_ids = set(race_order.tail(int(EVAL_MAX_RACES)))
        _eval_df = _eval_df[_eval_df['race_id'].isin(keep_race_ids)]

# Pull the rows with the same index labels out of the new model's X_test.
# NOTE(review): this is a label-based lookup; if the index labels are not unique
# it can return more rows than _eval_df has, and the positional column
# assignment below would then fail on a length mismatch — confirm uniqueness.
X_eval = keiba_ai_new.datasets.X_test.loc[_eval_df.index].copy()

# Attach race_id / umaban (required for building the score table and by the Simulator).
# Values are copied positionally via to_numpy(), not aligned on the index.
meta_cols = [c for c in [ResultsCols.UMABAN, 'race_id'] if c in _eval_df.columns]
for c in meta_cols:
    X_eval[c] = _eval_df[c].to_numpy()

print('EVAL_MAX_RACES:', EVAL_MAX_RACES)
print('EVAL_MIN_DATE:', EVAL_MIN_DATE)
print('X_eval shape:', X_eval.shape)
print('X_eval has race_id:', 'race_id' in X_eval.columns)
print('X_eval has umaban:', ResultsCols.UMABAN in X_eval.columns)
if 'race_id' in X_eval.columns:
    print('n_eval_races:', pd.Series(X_eval['race_id']).nunique())
print('n_eval_rows:', len(X_eval))

In [None]:
# 閾値スイープ（単勝/複勝）を旧/新モデルで並走
import numpy as np
import pandas as pd
import traceback

# Constraints imposed when choosing the best threshold (user-specified):
# a threshold qualifies only with at least this many races and this many bets.
MIN_RACES: int = 200
MIN_BETS: int = 1_000


def _thresholds(N_SAMPLES, T_RANGE):
    if N_SAMPLES is None or N_SAMPLES <= 1:
        return [float(T_RANGE[0])]
    lo, hi = float(T_RANGE[0]), float(T_RANGE[1])
    return [lo + (hi - lo) * i / (N_SAMPLES - 1) for i in range(int(N_SAMPLES))]


def _ensure_returns_schema(df: pd.DataFrame) -> pd.DataFrame:
    # Simulator.calc_returns が空dictを返した場合でも列を揃える
    cols = ['n_bets', 'n_races', 'n_hits', 'total_bet_amount', 'return_rate', 'std']
    for c in cols:
        if c not in df.columns:
            df[c] = 0
    return df[cols].fillna(0)


def sweep_returns(keiba_ai_obj, X_input: pd.DataFrame, bet_policy, label: str) -> pd.DataFrame:
    """Simulate returns for every threshold of the sweep.

    The score table is computed once with ``policies.StdScorePolicy``; then,
    for each threshold from ``_thresholds(N_SAMPLES, T_RANGE)``, actions are
    decided with *bet_policy* and passed to the module-level ``simulator``.
    A threshold that raises is reported (message plus traceback) and
    recorded as an empty result instead of aborting the sweep.

    Args:
        keiba_ai_obj: Object providing ``calc_score`` and ``decide_action``.
        X_input: Evaluation features handed to ``calc_score``.
        bet_policy: Betting policy forwarded to ``decide_action``.
        label: Tag used in the failure message.

    Returns:
        DataFrame indexed by ``threshold`` (ascending) with the columns
        guaranteed by ``_ensure_returns_schema``.
    """
    scores = keiba_ai_obj.calc_score(X_input, policies.StdScorePolicy)

    per_threshold = {}
    for threshold in _thresholds(N_SAMPLES, T_RANGE):
        key = float(threshold)
        try:
            chosen = keiba_ai_obj.decide_action(scores, bet_policy, threshold=key)
            outcome = simulator.calc_returns(chosen)
        except Exception:
            # Best effort: keep sweeping, leaving an empty record for this threshold.
            print(f'[{label}] threshold={threshold} failed')
            traceback.print_exc()
            outcome = {}
        per_threshold[key] = outcome

    table = pd.DataFrame.from_dict(per_threshold, orient='index').sort_index()
    table.index.name = 'threshold'
    return _ensure_returns_schema(table)


def summarize_best(returns_df: pd.DataFrame) -> pd.Series:
    """Return the sweep row with the highest ``return_rate``.

    Args:
        returns_df: Sweep results indexed by threshold.

    Returns:
        A copy of the winning row with an extra ``best_threshold`` entry
        holding its index label as a float, or an empty float Series when
        *returns_df* is empty.
    """
    if not returns_df.empty:
        rates = returns_df['return_rate'].astype(float)
        top_threshold = rates.idxmax()
        top_row = returns_df.loc[top_threshold].copy()
        top_row['best_threshold'] = float(top_threshold)
        return top_row
    return pd.Series(dtype=float)


def summarize_best_constrained(returns_df: pd.DataFrame, min_races: int, min_bets: int) -> pd.Series:
    """Return the best ``return_rate`` row among sufficiently sampled thresholds.

    Only rows with ``n_races >= min_races`` and ``n_bets >= min_bets`` are
    considered.

    Args:
        returns_df: Sweep results indexed by threshold.
        min_races: Minimum number of races a threshold must cover.
        min_bets: Minimum number of bets a threshold must place.

    Returns:
        A copy of the winning row plus a ``best_threshold`` entry. When no
        row satisfies the constraints, an all-zero row whose
        ``best_threshold`` is NaN. When *returns_df* is empty, an empty
        float Series.
    """
    if returns_df.empty:
        return pd.Series(dtype=float)

    enough_races = returns_df['n_races'].astype(float) >= float(min_races)
    enough_bets = returns_df['n_bets'].astype(float) >= float(min_bets)
    eligible = returns_df.loc[enough_races & enough_bets]

    if eligible.empty:
        # Nothing qualifies: report zeros and mark the threshold as undefined.
        placeholder = dict.fromkeys(
            ['n_bets', 'n_races', 'n_hits', 'total_bet_amount', 'return_rate', 'std'],
            0,
        )
        placeholder['best_threshold'] = np.nan
        return pd.Series(placeholder)

    top_threshold = eligible['return_rate'].astype(float).idxmax()
    top_row = eligible.loc[top_threshold].copy()
    top_row['best_threshold'] = float(top_threshold)
    return top_row


# --- Win (tansho): sweep the old model, then the new one ---
old_tansho_df = sweep_returns(keiba_ai_old, X_eval, policies.BetPolicyTansho, label='old_tansho')
new_tansho_df = sweep_returns(keiba_ai_new, X_eval, policies.BetPolicyTansho, label='new_tansho')

# --- Place (fukusho): sweep the old model, then the new one ---
old_fukusho_df = sweep_returns(keiba_ai_old, X_eval, policies.BetPolicyFukusho, label='old_fukusho')
new_fukusho_df = sweep_returns(keiba_ai_new, X_eval, policies.BetPolicyFukusho, label='new_fukusho')

# Row label -> sweep result, in the order the summary rows are listed.
_sweep_results = {
    'old_tansho': old_tansho_df,
    'new_tansho': new_tansho_df,
    'old_fukusho': old_fukusho_df,
    'new_fukusho': new_fukusho_df,
}

# Summary (unconstrained): best threshold per sweep.
summary_unconstrained = pd.DataFrame(
    [summarize_best(frame).rename(name) for name, frame in _sweep_results.items()]
)

# Summary (constrained): best threshold meeting the minimum races / bets.
summary_constrained = pd.DataFrame(
    [
        summarize_best_constrained(frame, MIN_RACES, MIN_BETS).rename(name)
        for name, frame in _sweep_results.items()
    ]
)

print(f'Constraints: n_races>={MIN_RACES}, n_bets>={MIN_BETS}')
print('--- Unconstrained best ---')
display(summary_unconstrained)
print('--- Constrained best ---')
display(summary_constrained)

In [None]:
# Overlay the return-rate curves of the old and new models (win / place).
# * Assumes plot_single_threshold_compare was defined in an earlier cell.
# Each call compares one old/new pair of sweep results; the labels name the two curves.

plot_single_threshold_compare(old_tansho_df, new_tansho_df, N_SAMPLES, label1='old_tansho', label2='new_tansho')

plot_single_threshold_compare(old_fukusho_df, new_fukusho_df, N_SAMPLES, label1='old_fukusho', label2='new_fukusho')