# 1. モジュールインポート

In [1]:
import pandas as pd
import glob
import os
import datetime
from tqdm.auto import tqdm
from modules.constants import Master
from modules.constants import LocalPaths
from modules.constants import HorseResultsCols
from modules.constants import ResultsCols
from modules import preparing
from modules import preprocessing
from modules import training
from modules import simulation
from modules import policies
%load_ext autoreload

In [None]:
# エンコーディング設定の確認
import sys
import locale

print("=== エンコーディング設定確認 ===")
print(f"sys.getdefaultencoding(): {sys.getdefaultencoding()}")
print(f"sys.getfilesystemencoding(): {sys.getfilesystemencoding()}")
print(f"locale.getpreferredencoding(): {locale.getpreferredencoding()}")
print(f"PYTHONUTF8 環境変数: {sys.flags.utf8_mode}")

標準的な土日競馬開催時の運用スケジュールを以下の表の通り。

|曜日|時刻|内容|実行する main.ipynb の項番|備考|
|:-:|:--|:--|:--|:--|
|月|||||
|火|||||
|水|16:30過ぎ|先週土日の馬の過去成績ページ確定<BR>（netkeiba.comﾌﾟﾚﾐｱｻｰﾋﾞｽのﾀｲﾑ指数・ﾚｰｽ分析・注目馬 ﾚｰｽ後の短評情報確定）|2. データ取得 ～ 5. シミュレーション|3日間開催の場合も、水曜日|
|木|||||
|金|10:05過ぎ<BR>19:25過ぎ|土曜の出馬表確定<BR>土曜の天候・馬場状態更新|6.1. 前日準備 ～ 6.2. 前日全レース予想（天候・馬場状態は手動設定）<BR>6.1. 前日準備 ～ 6.2. 前日全レース予想||
|土|09:00～17:00<BR>10:05過ぎ<BR>19:25過ぎ| レース時刻<BR>日曜の出馬表確定<BR>日曜の天候・馬場状態更新|6.3. レース直前データ処理（当日レース予想）<BR>6.1. 前日準備 ～ 6.2. 前日全レース予想（天候・馬場状態は手動設定）<BR>6.1. 前日準備 ～ 6.2. 前日全レース予想||
|日|09:00～17:00|レース時刻|6.3. レース直前データ処理（当日レース予想）||

# 2. データ取得

## 2.1. レースID取得
例として、2020年のレースデータを取得する場合を考える

In [2]:
%autoreload

In [None]:
#開催日取得。to_の月は含まないので注意。
kaisai_date_2025 = preparing.scrape_kaisai_date(from_="2025-12-01", to_="2026-01-01")
len(kaisai_date_2025)

In [None]:
# 開催日からレースIDの取得
race_id_list = preparing.scrape_race_id_list(kaisai_date_2025)
len(race_id_list)

## 2.2. /race/ディレクトリのデータ取得

In [None]:
#https://db.netkeiba.com/race/のhtml(binファイル)をスクレイピングして保存
html_files_race = preparing.scrape_html_race(race_id_list, skip=True)
html_files_race[:5]

In [None]:
# data/html/race/に保存されているhtml(binファイル)をリストにする
import glob
import os

# LocalPathsからHTMLレースディレクトリを取得
race_html_dir = LocalPaths.HTML_RACE_DIR
print(f"レースHTMLディレクトリ: {race_html_dir}")

# globでbinファイルを検索
html_files_race = glob.glob(os.path.join(race_html_dir, "*.bin"))
print(f"見つかったHTMLファイル数: {len(html_files_race)}")

# 最初の5ファイルを表示
html_files_race[:5]
html_files_race[:5]

In [None]:
# 変数確認
print(f"html_files_race変数の状態:")
print(f"  タイプ: {type(html_files_race)}")
print(f"  サイズ: {len(html_files_race)}")
print(f"  範囲: {html_files_race[0].split('\\')[-1]} ～ {html_files_race[-1].split('\\')[-1]}")

# これで次のセルでget_rawdata_results関数を正常に実行できます

In [None]:
results_new = preparing.get_rawdata_results(html_files_race) #レース結果テーブルの作成
race_info_new = preparing.get_rawdata_info(html_files_race) #レース情報テーブルの作成
return_tables_new = preparing.get_rawdata_return(html_files_race) #払戻テーブルの作成

In [None]:
# テーブルの更新。元々のテーブルが存在しない場合は、新たに作成される。
preparing.update_rawdata(filepath=LocalPaths.RAW_RESULTS_PATH, new_df=results_new)
preparing.update_rawdata(filepath=LocalPaths.RAW_RACE_INFO_PATH, new_df=race_info_new)
preparing.update_rawdata(filepath=LocalPaths.RAW_RETURN_TABLES_PATH, new_df=return_tables_new)

## 2.x. 生成済み raw テーブル確認
`data/raw` に保存された各pickleの基本情報を表示します。存在しない場合はスキップします。

In [None]:
# data/raw 配下の pickle テーブル概要確認 + null率集計
import os
import pathlib
import pandas as pd
import datetime as dt
from modules.constants import LocalPaths

RAW_DIR = pathlib.Path('data/raw')

if not RAW_DIR.exists():
    print(f'ディレクトリが存在しません: {RAW_DIR.resolve()}')
else:
    pickle_files = sorted(RAW_DIR.glob('*.pickle'))
    if not pickle_files:
        print('pickleファイルが見つかりません。先に取得処理を実行してください。')
    else:
        summaries = []
        null_detail_rows = []  # 列単位 null 率詳細
        for p in pickle_files:
            info = {
                'file': p.name,
                'size_MB': round(p.stat().st_size / 1_000_000, 3)
            }
            try:
                df = pd.read_pickle(p)
                info['rows'] = len(df)
                info['cols'] = df.shape[1]
                info['memory_MB'] = round(df.memory_usage(deep=True).sum() / 1_000_000, 3)
                # 代表的なカラムサンプル（最大5件）
                info['sample_cols'] = ', '.join(list(df.columns[:5]))
                # 日付らしき列から範囲を取得
                date_cols = [c for c in df.columns if 'date' in c.lower()]
                date_range = ''
                for dc in date_cols:
                    try:
                        s = pd.to_datetime(df[dc], errors='coerce')
                        if s.notna().any():
                            date_range = f"{dc}:{s.min().date()}→{s.max().date()}"
                            break
                    except Exception:
                        pass
                info['date_range'] = date_range
                # 全体 null 率（セル全体）
                total_cells = df.shape[0] * (df.shape[1] if df.shape[0] else 0)
                info['overall_null_pct'] = round((df.isna().sum().sum() / total_cells) * 100, 2) if total_cells else 0.0
                # 列ごとの null 率
                col_null_pct = (df.isna().mean() * 100).sort_values(ascending=False)
                # 上位10列を詳細に保存（列が10未満なら全て）
                for col, pct in col_null_pct.head(10).items():
                    null_detail_rows.append({
                        'file': p.name,
                        'column': col,
                        'null_pct': round(pct, 2)
                    })
                # 列単位統計（最大値/平均値/中央値）
                info['max_col_null_pct'] = round(col_null_pct.iloc[0], 2) if not col_null_pct.empty else 0.0
                info['mean_col_null_pct'] = round(col_null_pct.mean(), 2) if not col_null_pct.empty else 0.0
                info['median_col_null_pct'] = round(col_null_pct.median(), 2) if not col_null_pct.empty else 0.0
            except Exception as e:
                info['rows'] = 'ERR'
                info['cols'] = 'ERR'
                info['memory_MB'] = 'ERR'
                info['sample_cols'] = f'load error: {e.__class__.__name__}'
                info['date_range'] = ''
                info['overall_null_pct'] = 'ERR'
                info['max_col_null_pct'] = 'ERR'
                info['mean_col_null_pct'] = 'ERR'
                info['median_col_null_pct'] = 'ERR'
            summaries.append(info)
        summary_df = pd.DataFrame(summaries)
        # 表示順を調整
        summary_cols_order = [
            'file','rows','cols','size_MB','memory_MB','overall_null_pct',
            'max_col_null_pct','mean_col_null_pct','median_col_null_pct',
            'sample_cols','date_range'
        ]
        summary_df = summary_df[summary_cols_order]
        display(summary_df)

        if null_detail_rows:
            null_detail_df = pd.DataFrame(null_detail_rows)
            # ファイル毎に null の高い列を横持ち要約（pivot）するオプション（必要であれば）
            display(null_detail_df)

        # 主要パスが指すファイルの存在と行数確認（存在しない場合も出力）
        main_paths = {
            'RAW_RESULTS_PATH': getattr(LocalPaths, 'RAW_RESULTS_PATH', None),
            'RAW_RACE_INFO_PATH': getattr(LocalPaths, 'RAW_RACE_INFO_PATH', None),
            'RAW_RETURN_TABLES_PATH': getattr(LocalPaths, 'RAW_RETURN_TABLES_PATH', None),
            'RAW_HORSE_INFO_PATH': getattr(LocalPaths, 'RAW_HORSE_INFO_PATH', None),
            'RAW_HORSE_RESULTS_PATH': getattr(LocalPaths, 'RAW_HORSE_RESULTS_PATH', None),
            'RAW_PEDS_PATH': getattr(LocalPaths, 'RAW_PEDS_PATH', None)
        }
        path_rows = []
        for key, path in main_paths.items():
            if path is None:
                path_rows.append({'name': key, 'path': None, 'exists': False, 'rows': None})
                continue
            exists = os.path.isfile(path)
            rows = None
            if exists:
                try:
                    rows = len(pd.read_pickle(path))
                except Exception:
                    rows = 'ERR'
            path_rows.append({'name': key, 'path': path, 'exists': exists, 'rows': rows})
        display(pd.DataFrame(path_rows))

In [None]:
# 既存のresultsデータを読み込んでテスト用horse_idリストを取得
results_new = pd.read_pickle(LocalPaths.RAW_RESULTS_PATH)
print(f"results_new loaded: {results_new.shape}")

# 先頭10頭のテスト用リスト作成
horse_id_list = results_new['horse_id'].unique()
horse_id_test_list = horse_id_list[:10]
print(f"テスト用horse_id: {horse_id_test_list}")

## 2.3. /horse/ディレクトリのデータ取得

In [None]:
%autoreload

In [None]:
# マスターファイルにNaN値が含まれているものだけ再スクレイピングする
import pandas as pd
import os
from modules.constants import LocalPaths

# マスターファイルからNaN値を持つhorse_idを特定
master_files = {
    'horse_id': 'horse_id.csv',
    'jockey_id': 'jockey_id.csv', 
    'trainer_id': 'trainer_id.csv',
    'owner_id': 'owner_id.csv',
    'breeder_id': 'breeder_id.csv'
}

nan_horse_ids = set()
print("=== マスターファイルのNaN値チェック ===")

# horse_info.pickleを読み込み
try:
    horse_info = pd.read_pickle(os.path.join(LocalPaths.RAW_DIR, 'horse_info.pickle'))
    print(f"horse_info.pickle読み込み完了: {len(horse_info)}頭の馬データ")
except Exception as e:
    print(f"horse_info.pickleの読み込みエラー: {e}")
    horse_info = None

# 各マスターファイルをチェック
for master_type, filename in master_files.items():
    filepath = os.path.join(LocalPaths.MASTER_DIR, filename)
    if os.path.exists(filepath):
        df = pd.read_csv(filepath)
        id_col = df.columns[0]
        
        # NaN値を持つ行を特定
        nan_rows = df[df[id_col].isna()]
        if len(nan_rows) > 0:
            print(f"\n{master_type}: {len(nan_rows)}個のNaN値を発見")
            
            if horse_info is not None:
                if master_type == 'horse_id':
                    # horse_idが直接NaNの場合（これは通常起こらない）
                    print(f"  -> horse_idが直接NaNになっている行: {len(nan_rows)}")
                else:
                    # 他のIDがNaNの馬を特定
                    col_mapping = {
                        'jockey_id': '騎手',
                        'trainer_id': '調教師', 
                        'owner_id': '馬主',
                        'breeder_id': '生産者'
                    }
                    if master_type in col_mapping:
                        target_col = col_mapping[master_type]
                        if target_col in horse_info.columns:
                            # 該当する列がNaNまたは空文字の馬を特定
                            nan_horses = horse_info[
                                (horse_info[target_col].isna()) | 
                                (horse_info[target_col] == '') |
                                (horse_info[target_col] == 'nan')
                            ]
                            for horse_id in nan_horses.index:
                                nan_horse_ids.add(horse_id)
                            print(f"  -> {target_col}がNaN/空の馬: {len(nan_horses)}頭")
                            if len(nan_horses) > 0:
                                print(f"      例: {list(nan_horses.index)[:5]}")
    else:
        print(f"{master_type}: ファイルが存在しません")

print(f"\n=== 再スクレイピング対象の特定結果 ===")
print(f"再スクレイピングが必要な馬ID数: {len(nan_horse_ids)}")

if len(nan_horse_ids) > 0:
    nan_horse_ids_list = sorted(list(nan_horse_ids))
    print(f"対象馬ID例: {nan_horse_ids_list[:10]}{'...' if len(nan_horse_ids_list) > 10 else ''}")
    
    print(f"\n=== 再スクレイピング実行オプション ===")
    print("以下の変数を設定して次のセルで実行してください：")
    print("re_scrape_horses = True  # この行のコメントアウトを外して実行")
    print("target_horse_ids = nan_horse_ids_list  # 対象馬IDリスト")
    
    # 変数を保存（次のセルで使用）
    globals()['nan_horse_ids_list'] = nan_horse_ids_list
    globals()['re_scrape_needed'] = True
else:
    print("再スクレイピングが必要な馬は見つかりませんでした。")
    globals()['re_scrape_needed'] = False

In [None]:
# horse_id_listのうち先頭10頭の馬のリストを作成し、スクレイピングテストする
horse_id_list = results_new['horse_id'].unique()
horse_id_test_list = horse_id_list[:10]  # 先頭10頭でテスト

print(f"全体の馬数: {len(horse_id_list)}")
print(f"テスト対象の馬数: {len(horse_id_test_list)}")
print(f"テスト対象馬ID: {horse_id_test_list}")

#htmlをスクレイピング
#すでにスクレイピングしてある馬をスキップしたい場合はskip=Trueにする
#すでにスクレイピングしてある馬でも、新たに出走した成績を更新したい場合はskip=Falseにする
html_files_horse = preparing.scrape_html_horse_with_master(
    horse_id_test_list, skip=False
    )

In [None]:
horse_id_list = results_new['horse_id'].unique()
#htmlをスクレイピング
#すでにスクレイピングしてある馬をスキップしたい場合はskip=Trueにする
#すでにスクレイピングしてある馬でも、新たに出走した成績を更新したい場合はskip=Falseにする
html_files_horse = preparing.scrape_html_horse_with_master(
    horse_id_list, skip=True
    )

In [None]:
#追加で新たにスクレイピングされた数
len(html_files_horse)

In [None]:
### scrape関数を実行せずに、保存してあるhtmlのパスを取得する場合、以下を実行 ###

target_date = '2025-09-20' #スクレイピングした日付を指定
# マスタの読み込み
update_master = pd.read_csv(
    LocalPaths.MASTER_RAW_HORSE_RESULTS_PATH,
    dtype=object
    )
# target_dateにスクレイピングしたhorse_idに絞り込む
filter = pd.to_datetime(update_master['updated_at']).dt.strftime('%Y-%m-%d') == target_date
horse_id_list = update_master[filter]['horse_id']

# binファイルのパスを取得
html_files_horse = []
for horse_id in tqdm(horse_id_list):
    file = glob.glob(os.path.join(LocalPaths.HTML_HORSE_DIR, horse_id+'*.bin'))[0]
    html_files_horse.append(file)
html_files_horse[:5]

In [None]:
# マスターファイルにNaN値が含まれているものだけ再スクレイピングする


In [None]:
# 再スクレイピング実行（上のセルで特定されたNaN馬のみ）
# 実行する場合は下の行のコメントアウトを外してください
# re_scrape_horses = True

if 're_scrape_horses' in locals() and re_scrape_horses and 're_scrape_needed' in globals() and re_scrape_needed:
    print("=== NaN値を持つ馬の再スクレイピング開始 ===")
    print(f"対象馬数: {len(nan_horse_ids_list)}")
    
    # 進捗表示用
    from tqdm import tqdm
    import time
    
    success_count = 0
    failed_ids = []
    
    for i, horse_id in enumerate(tqdm(nan_horse_ids_list, desc="再スクレイピング")):
        try:
            # 馬情報をスクレイピング
            scrape_horse_html(horse_id)
            success_count += 1
            
            # サーバー負荷軽減のため少し待機
            if i % 10 == 0 and i > 0:
                time.sleep(1)
                
        except Exception as e:
            print(f"エラー - 馬ID {horse_id}: {e}")
            failed_ids.append(horse_id)
            continue
    
    print(f"\n=== 再スクレイピング完了 ===")
    print(f"成功: {success_count}頭")
    print(f"失敗: {len(failed_ids)}頭")
    
    if failed_ids:
        print(f"失敗した馬ID: {failed_ids[:10]}{'...' if len(failed_ids) > 10 else ''}")
    
    print("\n再スクレイピング完了後は、該当セクションのデータ処理を再実行してください。")
    
elif 're_scrape_needed' in globals() and not re_scrape_needed:
    print("再スクレイピングが必要な馬は見つかりませんでした。")
    
else:
    print("再スクレイピングを実行するには以下を設定してください：")
    print("1. 上のセルを実行してNaN馬を特定")
    print("2. 're_scrape_horses = True' のコメントアウトを外す")
    print("3. このセルを再実行")

In [None]:
# 再スクレイピング後のデータ再処理
# 上の再スクレイピングが完了した後に実行してください
# reprocess_data = True  # コメントアウトを外して実行

if 'reprocess_data' in locals() and reprocess_data:
    print("=== 再スクレイピング後のデータ再処理開始 ===")
    
    # 1. 新しくスクレイピングしたHTMLから馬情報を再抽出
    print("1. 馬情報の再抽出...")
    from modules.preprocessing._horse_info_processor import HorseInfoProcessor
    
    # 新しいHTMLファイルのみ処理
    html_files_horse_new = []
    for horse_id in nan_horse_ids_list:
        html_file = os.path.join(LocalPaths.HTML_HORSE_DIR, f"{horse_id}.bin")
        if os.path.exists(html_file):
            html_files_horse_new.append(html_file)
    
    print(f"再処理対象HTMLファイル数: {len(html_files_horse_new)}")
    
    if len(html_files_horse_new) > 0:
        # 馬情報を再処理
        horse_info_processor = HorseInfoProcessor(html_files_horse_new)
        horse_info_new = horse_info_processor.scrape_horse_info()
        
        # 既存の馬情報に新しい情報をマージ
        try:
            horse_info_existing = pd.read_pickle(os.path.join(LocalPaths.RAW_DIR, 'horse_info.pickle'))
            # 新しい情報で既存の情報を更新
            horse_info_updated = horse_info_existing.copy()
            for horse_id in horse_info_new.index:
                horse_info_updated.loc[horse_id] = horse_info_new.loc[horse_id]
            
            # バックアップ作成
            horse_info_existing.to_pickle(os.path.join(LocalPaths.RAW_DIR, 'horse_info.pickle.bak'))
            
            # 更新されたデータを保存
            horse_info_updated.to_pickle(os.path.join(LocalPaths.RAW_DIR, 'horse_info.pickle'))
            
            print(f"馬情報更新完了: {len(horse_info_new)}頭の情報を更新")
            
        except Exception as e:
            print(f"馬情報の更新エラー: {e}")
    
    # 2. マスターファイルの再生成
    print("\n2. マスターファイルの再生成...")
    
    # horse_info.pickleから各種IDを抽出してマスターファイルを更新
    try:
        horse_info_updated = pd.read_pickle(os.path.join(LocalPaths.RAW_DIR, 'horse_info.pickle'))
        
        # 各マスターファイルを更新
        id_columns = {
            'horse_id.csv': ('index', 'horse_id'),  # インデックスがhorse_id
            'jockey_id.csv': ('騎手', 'jockey_id'),
            'trainer_id.csv': ('調教師', 'trainer_id'),
            'owner_id.csv': ('馬主', 'owner_id'),
            'breeder_id.csv': ('生産者', 'breeder_id')
        }
        
        for master_file, (col_name, id_type) in id_columns.items():
            master_path = os.path.join(LocalPaths.MASTER_DIR, master_file)
            
            if col_name == 'index':
                # horse_idの場合
                unique_ids = horse_info_updated.index.dropna().unique()
            else:
                # その他のIDの場合
                if col_name in horse_info_updated.columns:
                    unique_ids = horse_info_updated[col_name].dropna().unique()
                    unique_ids = [str(x) for x in unique_ids if str(x) not in ['nan', 'NaN', '']]
                else:
                    continue
            
            # 既存のマスターファイルを読み込み
            if os.path.exists(master_path):
                existing_master = pd.read_csv(master_path)
                existing_ids = set(existing_master.iloc[:, 0].dropna().astype(str))
            else:
                existing_ids = set()
                existing_master = pd.DataFrame(columns=[id_type, 'encoded_id'])
            
            # 新しいIDを追加
            new_ids = [id for id in unique_ids if str(id) not in existing_ids]
            
            if new_ids:
                # 新しいエンコードIDを生成
                max_encoded = existing_master['encoded_id'].max() if len(existing_master) > 0 else -1
                new_encoded = list(range(max_encoded + 1, max_encoded + 1 + len(new_ids)))
                
                # 新しいエントリを作成
                new_entries = pd.DataFrame({
                    id_type: new_ids,
                    'encoded_id': new_encoded
                })
                
                # マスターファイルを更新
                updated_master = pd.concat([existing_master, new_entries], ignore_index=True)
                updated_master.to_csv(master_path, index=False)
                
                print(f"{master_file}: {len(new_ids)}個の新しいIDを追加")
        
        print("\nマスターファイル再生成完了！")
        print("これで特徴量エンジニアリングを再実行できます。")
        
    except Exception as e:
        print(f"マスターファイル再生成エラー: {e}")
        
else:
    print("データ再処理を実行するには 'reprocess_data = True' を設定してください")

## 再スクレイピング機能の使い方

### 手順:
1. **NaN値検出**: セル24を実行してマスターファイル内のNaN値を特定
2. **再スクレイピング実行**: セル25で `re_scrape_horses = True` のコメントアウトを外して実行
3. **データ再処理**: セル26で `reprocess_data = True` のコメントアウトを外して実行

### 注意事項:
- 再スクレイピングには時間がかかる場合があります
- サーバー負荷軽減のため、適度な間隔で実行されます
- バックアップファイルが自動生成されます

In [None]:
# data/html/horse/に保存されているhtml(binファイル)をリストにする

import glob
import os

horse_html_dir = LocalPaths.HTML_HORSE_DIR
print(f"馬HTMLディレクトリ: {horse_html_dir}")

html_files_horse = glob.glob(os.path.join(horse_html_dir, "*.bin"))
print(f"見つかったHTMLファイル数: {len(html_files_horse)}")

# 最初の5ファイルを表示
html_files_horse[:5]


In [None]:
# 馬の基本情報テーブルの作成（修正済み関数使用）
%autoreload
horse_info_new = preparing.get_rawdata_horse_info(html_files_horse)

In [None]:
# get_rawdata_horse_info関数のデバッグ: 1つのHTMLファイルで詳細確認
import pandas as pd
import re
from bs4 import BeautifulSoup
from numpy import nan as NaN

# テスト用に1つのHTMLファイルを選択
test_html_file = html_files_horse[0]
print(f"テスト対象ファイル: {test_html_file}")

# ファイルを読み込み
with open(test_html_file, 'rb') as f:
    html = f.read()

# horse_idを取得
horse_id = re.findall(r'horse\W(\d+)\.bin', test_html_file)[0]
print(f"horse_id: {horse_id}")

# HTMLファイルサイズとデコード確認
print(f"HTMLファイルサイズ: {len(html)} bytes")

# EUC-JPでデコード試行
for enc in ('euc-jp', 'cp932', 'utf-8'):
    try:
        text = html.decode(enc)
        print(f"デコード成功: {enc}")
        break
    except Exception as e:
        print(f"デコード失敗 {enc}: {e}")
        text = html.decode(enc, errors='ignore')
        print(f"エラー無視でデコード: {enc}")
        break

# BeautifulSoupで解析
soup = BeautifulSoup(text, 'lxml')
print(f"BeautifulSoup解析完了")

# プロフィールテーブルの確認
prof_table = (
    soup.select_one('table.db_prof_table[summary*="プロフィール"]')
    or soup.find('table', attrs={'summary': re.compile('プロフィール')})
)

if prof_table:
    print("プロフィールテーブル発見!")
    print(f"テーブルHTML（最初の500文字）: {str(prof_table)[:500]}")
    
    # テーブルを読み込む
    try:
        df = pd.read_html(str(prof_table), flavor='lxml')[0]
        print(f"DataFrame形状: {df.shape}")
        print(f"DataFrame列名: {list(df.columns)}")
        print("DataFrame内容:")
        display(df)
        
        # 2列形式の確認
        if df.shape[1] >= 2:
            df = df.iloc[:, :2]
            df.columns = ['項目', '値']
            df_info = df.set_index('項目').T
            print("転置後:")
            display(df_info)
        else:
            print(f"プロフィール表の列数が想定外: {df.shape[1]}")
            
    except Exception as e:
        print(f"pd.read_htmlエラー: {e}")
else:
    print("プロフィールテーブルが見つかりません")
    # 代替手段: すべてのテーブルを確認
    print("すべてのテーブルを確認:")
    tables = soup.find_all('table')
    for i, table in enumerate(tables):
        attrs = table.attrs
        print(f"テーブル {i+1}: {attrs}")
        if i < 3:  # 最初の3テーブルの内容を確認
            try:
                df_temp = pd.read_html(str(table))[0]
                print(f"  形状: {df_temp.shape}, 列名: {list(df_temp.columns[:3])}")
                if len(df_temp) > 0:
                    print(f"  最初の行: {df_temp.iloc[0].values[:3]}")
            except Exception as e:
                print(f"  読み込みエラー: {e}")

# ID抽出の確認
def extract_id(selector, pattern):
    a = soup.select_one(selector)
    if a and a.has_attr('href'):
        m = re.search(pattern, a['href'])
        if m:
            return m.group(1)
    return NaN

trainer_id = extract_id('a[href^="/trainer/"]', r'/trainer/([^/]+)/')
owner_id   = extract_id('a[href^="/owner/"]',   r'/owner/([^/]+)/')
breeder_id = extract_id('a[href^="/breeder/"]', r'/breeder/([^/]+)/')

print(f"trainer_id: {trainer_id}")
print(f"owner_id: {owner_id}")
print(f"breeder_id: {breeder_id}")

# 関連リンクの確認
trainer_links = soup.select('a[href^="/trainer/"]')
owner_links = soup.select('a[href^="/owner/"]')
breeder_links = soup.select('a[href^="/breeder/"]')

print(f"調教師リンク数: {len(trainer_links)}")
print(f"馬主リンク数: {len(owner_links)}")
print(f"生産者リンク数: {len(breeder_links)}")

if trainer_links:
    print(f"調教師リンク例: {trainer_links[0].get('href')}")
if owner_links:
    print(f"馬主リンク例: {owner_links[0].get('href')}")
if breeder_links:
    print(f"生産者リンク例: {breeder_links[0].get('href')}")

In [None]:
# エンコーディング問題の詳細調査（chardetなし）
# 手動で各エンコーディングを試行

encodings_to_try = ['utf-8', 'euc-jp', 'cp932', 'shift_jis', 'iso-2022-jp']

successful_encoding = None
for encoding in encodings_to_try:
    try:
        text_decoded = html.decode(encoding)
        print(f"デコード成功: {encoding}")
        
        # BeautifulSoupで解析
        soup_test = BeautifulSoup(text_decoded, 'lxml')
        
        # プロフィールテーブルの検索
        prof_table_test = (
            soup_test.find('table', class_='db_prof_table') or
            soup_test.find('table', attrs={'summary': re.compile('プロフィール')}) or
            soup_test.select_one('table[summary*="プロフィール"]')
        )
        
        if prof_table_test:
            print(f"  → プロフィールテーブル発見！エンコーディング: {encoding}")
            print(f"  → テーブル属性: {prof_table_test.attrs}")
            
            # テーブル内容を確認
            try:
                df_test = pd.read_html(str(prof_table_test))[0]
                print(f"  → DataFrame形状: {df_test.shape}")
                print("  → DataFrame内容:")
                display(df_test)
                
                # 2列形式に変換してみる
                if df_test.shape[1] >= 2:
                    df_test = df_test.iloc[:, :2]
                    df_test.columns = ['項目', '値']
                    df_info_test = df_test.set_index('項目').T
                    print("  → 転置後:")
                    display(df_info_test)
                    
                successful_encoding = encoding
                break
                
            except Exception as e:
                print(f"  → pd.read_htmlエラー: {e}")
        else:
            print(f"  → プロフィールテーブル見つからず")
            
    except UnicodeDecodeError as e:
        print(f"デコード失敗: {encoding} - {str(e)[:100]}")
        continue
    except Exception as e:
        print(f"その他のエラー {encoding}: {e}")
        continue

if successful_encoding:
    print(f"\n成功したエンコーディング: {successful_encoding}")
    
    # 正しいエンコーディングでの最終確認
    text_final = html.decode(successful_encoding)
    soup_final = BeautifulSoup(text_final, 'lxml')
    
    # すべてのテーブルを確認
    tables_final = soup_final.find_all('table')
    print(f"\n全テーブル数: {len(tables_final)}")
    for i, table in enumerate(tables_final):
        attrs = table.attrs
        summary = attrs.get('summary', '')
        class_names = attrs.get('class', [])
        print(f"テーブル {i+1}: class={class_names}, summary='{summary}'")
        
        # プロフィールらしいテーブルを詳しく確認
        if 'db_prof_table' in class_names or 'プロフィール' in summary:
            try:
                df_detail = pd.read_html(str(table))[0]
                print(f"  → 詳細形状: {df_detail.shape}")
                print(f"  → 列名: {list(df_detail.columns)}")
                if len(df_detail) > 0:
                    print(f"  → 最初の行: {df_detail.iloc[0].values}")
            except Exception as e:
                print(f"  → 読み込みエラー: {e}")
                
else:
    print("\nどのエンコーディングでもプロフィールテーブルが見つかりませんでした")
    
    # HTMLの先頭部分を確認
    print(f"\nHTML先頭500文字（バイナリ）:")
    print(html[:500])
    
    # UTF-8での強制解析
    try:
        text_force = html.decode('utf-8', errors='ignore')
        print(f"\nUTF-8強制デコード後の先頭1000文字:")
        print(text_force[:1000])
    except Exception as e:
        print(f"UTF-8強制デコードエラー: {e}")

In [None]:
# get_rawdata_horse_info関数の修正版を作成・テスト
def get_rawdata_horse_info_fixed(html_path_list: list):
    """
    horseページのhtmlを受け取って、馬の基本情報のDataFrameに変換する関数（修正版）。
    - UTF-8優先でデコード
    - プロフィールテーブルを確実に特定
    - 調教師/馬主/生産者IDを確実に抽出
    """
    print('preparing raw horse_info table (fixed version)')
    out_rows = []

    for html_path in tqdm(html_path_list):
        try:
            with open(html_path, 'rb') as f:
                raw = f.read()

            # 1) エンコーディング優先順位: UTF-8 → EUC-JP → CP932
            text = None
            for encoding in ['utf-8', 'euc-jp', 'cp932']:
                try:
                    text = raw.decode(encoding)
                    break
                except UnicodeDecodeError:
                    continue
            
            if text is None:
                print(f'エンコーディング失敗: {html_path}')
                continue

            soup = BeautifulSoup(text, 'lxml')

            # 2) プロフィールテーブルの確実な特定
            prof_table = (
                soup.find('table', class_='db_prof_table') or
                soup.find('table', attrs={'summary': re.compile('プロフィール')}) or
                soup.select_one('table[summary*="プロフィール"]')
            )
            
            if prof_table is None:
                print(f'プロフィールテーブル見つからず: {html_path}')
                continue

            # 3) テーブルを読み込む（StringIOを使用して警告を回避）
            from io import StringIO
            df = pd.read_html(StringIO(str(prof_table)))[0]
            
            # 左列を項目名、右列を値として転置（1行化）
            if df.shape[1] >= 2:
                df = df.iloc[:, :2]
                df.columns = ['項目', '値']
                df_info = df.set_index('項目').T
            else:
                print(f'プロフィールテーブルの列数が想定外: {html_path}')
                continue

            # 4) 各IDをより確実に抽出
            def extract_id(selector, pattern):
                a = soup.select_one(selector)
                if a and a.has_attr('href'):
                    m = re.search(pattern, a['href'])
                    if m:
                        return m.group(1)
                return NaN

            trainer_id = extract_id('a[href^="/trainer/"]', r'/trainer/([^/]+)/')
            owner_id   = extract_id('a[href^="/owner/"]',   r'/owner/([^/]+)/')
            breeder_id = extract_id('a[href^="/breeder/"]', r'/breeder/([^/]+)/')

            df_info['trainer_id'] = trainer_id
            df_info['owner_id']   = owner_id
            df_info['breeder_id'] = breeder_id

            # 5) インデックスを horse_id に
            horse_id_m = re.search(r'horse\W(\d+)\.bin', html_path)
            if horse_id_m:
                horse_id = horse_id_m.group(1)
                df_info.index = [horse_id]
                out_rows.append(df_info)
            else:
                print(f'horse_id抽出失敗: {html_path}')
                
        except Exception as e:
            print(f'処理エラー {html_path}: {e}')
            continue

    if not out_rows:
        print('処理できたhorse_infoデータがありません')
        return pd.DataFrame()

    horse_info_df = pd.concat(out_rows, axis=0)
    print(f'horse_info処理完了: {horse_info_df.shape}')
    return horse_info_df

# 修正版でテスト実行
horse_info_new_fixed = get_rawdata_horse_info_fixed(html_files_horse)
print(f"\n修正版の結果: {horse_info_new_fixed.shape}")
if len(horse_info_new_fixed) > 0:
    print(f"列名: {list(horse_info_new_fixed.columns[:5])}")
    print(f"最初の数行:")
    display(horse_info_new_fixed.head())

In [None]:
# horse_info.pickleを読み込み
try:
    horse_info = pd.read_pickle(os.path.join(LocalPaths.RAW_DIR, 'horse_info.pickle'))
    print(f"horse_info.pickle読み込み完了: {len(horse_info)}頭の馬データ")
except Exception as e:
    print(f"horse_info.pickleの読み込みエラー: {e}")
    horse_info = None

In [None]:
# 馬の基本情報テーブルの更新
preparing.update_rawdata(LocalPaths.RAW_HORSE_INFO_PATH, horse_info)

In [None]:
# 馬の過去成績テーブルの作成
horse_results_new = preparing.get_rawdata_horse_results(html_files_horse)

In [None]:
# 結果確認
print(f"horse_results_new shape: {horse_results_new.shape}")
print(f"horse_results_new type: {type(horse_results_new)}")
if len(horse_results_new) > 0:
    print(f"列名: {list(horse_results_new.columns[:5])}")
    print(f"最初の数行:")
    display(horse_results_new.head())

In [None]:
# 問題診断: 1つのHTMLファイルでpd.read_htmlの動作確認
import pandas as pd
import re
from bs4 import BeautifulSoup

# テスト用に1つのHTMLファイルを選択
test_html_file = html_files_horse[0]
print(f"テスト対象ファイル: {test_html_file}")

# ファイルを読み込み
with open(test_html_file, 'rb') as f:
    html = f.read()

# horse_idを取得
horse_id = re.findall(r'horse\W(\d+)\.bin', test_html_file)[0]
print(f"horse_id: {horse_id}")

# BeautifulSoupで解析
soup = BeautifulSoup(html, "lxml")
print(f"HTMLファイルサイズ: {len(html)} bytes")

# テーブル要素の確認
tables = soup.find_all('table')
print(f"テーブル数: {len(tables)}")

for i, table in enumerate(tables):
    class_names = table.get('class', [])
    print(f"テーブル {i+1}: class={class_names}")

# pd.read_htmlでテーブル読み込みテスト
try:
    dfs = pd.read_html(html)
    print(f"pd.read_htmlで読み込めたテーブル数: {len(dfs)}")
    
    for i, df in enumerate(dfs):
        print(f"DataFrame {i+1}: 形状={df.shape}, 列名={list(df.columns[:3])}")
        if len(df) > 0:
            print(f"  最初の行: {df.iloc[0].values[:3]}")
        print()
        
except Exception as e:
    print(f"pd.read_htmlエラー: {e}")

# 過去成績テーブルの直接確認
race_results_table = soup.find('table', class_='db_h_race_results')
if race_results_table:
    print("過去成績テーブルが見つかりました!")
    rows = race_results_table.find_all('tr')
    print(f"行数: {len(rows)}")
    if len(rows) > 1:  # ヘッダー行を除く
        first_data_row = rows[1]
        cells = first_data_row.find_all(['td', 'th'])
        print(f"最初のデータ行のセル数: {len(cells)}")
        print(f"最初のセルの内容: {cells[0].get_text(strip=True) if cells else 'なし'}")
else:
    print("過去成績テーブルが見つかりませんでした")

In [None]:
# get_rawdata_horse_resultsの修正版をテスト
import pandas as pd
import re
from tqdm.auto import tqdm

def get_rawdata_horse_results_fixed(html_path_list: list):
    """
    horseページのhtmlを受け取って、馬の過去成績のDataFrameに変換する関数。
    AJAX実装対応版: 過去成績テーブルはインデックス1（2番目）にある
    """
    print('preparing raw horse_results table (fixed version)')
    horse_results = {}
    for html_path in tqdm(html_path_list):
        with open(html_path, 'rb') as f:
            try:
                # 保存してあるbinファイルを読み込む
                html = f.read()

                # AJAX実装では、過去成績テーブルは2番目（インデックス1）
                dfs = pd.read_html(html)
                
                # テーブル数の確認
                if len(dfs) < 2:
                    print(f'horse_results insufficient tables: {len(dfs)} tables in {html_path}')
                    continue
                
                # 過去成績テーブルは2番目（インデックス1）
                df = dfs[1]
                
                # 新馬の競走馬レビューが付いた場合、
                # 列名に0が付与されるため、次のhtmlへ飛ばす
                if df.columns[0] == 0:
                    print('horse_results empty case1 {}'.format(html_path))
                    continue

                horse_id = re.findall(r'horse\W(\d+)\.bin', html_path)[0]

                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
                print(f'Successfully processed {horse_id}: {df.shape}')

            # 競走データが無い場合（新馬）を飛ばす
            except IndexError:
                print('horse_results empty case2 {}'.format(html_path))
                continue
            except Exception as e:
                print(f'horse_results error in {html_path}: {e}')
                continue

    if not horse_results:
        print("警告: 処理できた過去成績データがありません")
        return pd.DataFrame()

    # pd.DataFrame型にして一つのデータにまとめる
    horse_results_df = pd.concat([horse_results[key] for key in horse_results])

    # 列名に半角スペースがあれば除去する
    horse_results_df = horse_results_df.rename(columns=lambda x: x.replace(' ', ''))

    return horse_results_df

# 修正版でテスト実行
horse_results_new_fixed = get_rawdata_horse_results_fixed(html_files_horse)
print(f"\n修正版の結果: {horse_results_new_fixed.shape}")
if len(horse_results_new_fixed) > 0:
    print(f"列名: {list(horse_results_new_fixed.columns[:5])}")
    print(f"最初の数行:")
    display(horse_results_new_fixed.head())

In [None]:
# テーブルの更新
preparing.update_rawdata(LocalPaths.RAW_HORSE_RESULTS_PATH, horse_results_new)

In [None]:
display(horse_info_new)

# 馬の基本情報テーブルの行数を取得
print(f"馬の基本情報テーブルの行数: {len(horse_info_new)}")


In [None]:
display(horse_results_new)

### 2.3.2 特定期間の再スクレイピング

In [None]:
import pandas as pd

from modules import preparing
from modules.constants import LocalPaths

# 1) 対象期間
START = "2025-12-01"
END   = "2026-01-01"  # この日付は含めない想定（必要なら調整）

# 2) 開催日(yyyymmdd) -> レースID取得
# scrape_kaisai_date は yyyy-mm を受け取り「to_月は含まない」仕様
kaisai_date_list = preparing.scrape_kaisai_date("2025-12", "2026-01")

# 念のため日付でフィルタ（文字列比較でOK: yyyymmdd）
start_yyyymmdd = START.replace("-", "")
end_yyyymmdd   = END.replace("-", "")
kaisai_date_list = [d for d in kaisai_date_list if start_yyyymmdd <= d < end_yyyymmdd]

race_id_list = preparing.scrape_race_id_list(kaisai_date_list)
race_id_list = sorted(set(race_id_list))
print("race_id count:", len(race_id_list))

# 3) レースHTMLを再取得（skip=False で上書き）
race_html_paths = preparing.scrape_html_race(race_id_list, skip=False)
print("race html updated:", len(race_html_paths))

# 4) レース結果→出走馬ID抽出
results_new = preparing.get_rawdata_results(race_html_paths)
horse_id_list = sorted(set(results_new["horse_id"].dropna().astype(str).tolist()))
print("horse_id count:", len(horse_id_list))

# （任意だが推奨）レース系rawも同時に更新して整合を取る
race_info_new = preparing.get_rawdata_info(race_html_paths)
return_new = preparing.get_rawdata_return(race_html_paths)

preparing.update_rawdata(LocalPaths.RAW_RESULTS_PATH, results_new, mode="replace")
preparing.update_rawdata(LocalPaths.RAW_RACE_INFO_PATH, race_info_new, mode="replace")
preparing.update_rawdata(LocalPaths.RAW_RETURN_TABLES_PATH, return_new, mode="replace")

# 5) 馬ページ（戦績断片を挿入する実装）を再取得して、更新日時マスタも更新
horse_html_paths = preparing.scrape_html_horse_with_master(horse_id_list, skip=False)
print("horse html updated:", len(horse_html_paths))

# 6) horse_info / horse_results を作ってraw pickle更新
horse_info_new = preparing.get_rawdata_horse_info(horse_html_paths)
horse_results_new = preparing.get_rawdata_horse_results(horse_html_paths)

preparing.update_rawdata(LocalPaths.RAW_HORSE_INFO_PATH, horse_info_new, mode="replace")
preparing.update_rawdata(LocalPaths.RAW_HORSE_RESULTS_PATH, horse_results_new, mode="replace")

print("done")

## 2.4. /ped/ディレクトリのデータ取得

In [None]:
html_files_peds = preparing.scrape_html_ped(horse_id_list, skip=True) #htmlをスクレイピング

In [None]:
# === 血統データ完全スクレイピングの実装 ===
import os
from modules.preparing import scrape_html_ped

# 1. 全てのHTMLファイルから完全な馬IDリストを作成
complete_horse_id_list = [os.path.splitext(os.path.basename(f))[0] for f in html_files_horse]
print(f"完全な馬IDリスト作成完了: {len(complete_horse_id_list)}件")

# 2. 既存の血統データを確認
existing_peds_ids = set([os.path.splitext(os.path.basename(f))[0] for f in html_files_peds])
print(f"既存の血統データ: {len(existing_peds_ids)}件")

# 3. 不足している血統データのIDを特定
missing_ped_ids = set(complete_horse_id_list) - existing_peds_ids
missing_ped_ids_list = sorted(list(missing_ped_ids))
print(f"不足している血統データ: {len(missing_ped_ids_list)}件")

# 4. 年代別の不足状況確認
missing_by_year = {}
for horse_id in missing_ped_ids_list:
    year = horse_id[:4] if horse_id[:4].isdigit() else "unknown"
    missing_by_year[year] = missing_by_year.get(year, 0) + 1

print(f"\n年代別の不足血統データ:")
for year in sorted(missing_by_year.keys()):
    print(f"  {year}年: {missing_by_year[year]}件")

print(f"\n=== スクレイピング実行 ===")
print(f"対象馬数: {len(missing_ped_ids_list)}件")
print(f"推定時間: {len(missing_ped_ids_list) * 2 / 60:.1f}分")

# 5. 不足している血統データをスクレイピング実行
# 警告: これは時間がかかる処理です（約163分）
# バッチ処理で実行することを推奨
print("\n血統データスクレイピングを開始...")
new_ped_files = scrape_html_ped(missing_ped_ids_list, skip=False)
print(f"スクレイピング完了: {len(new_ped_files)}件の新しい血統データを取得")

# 6. 結果確認
import glob
updated_html_files_peds = glob.glob(os.path.join(LocalPaths.HTML_PED_DIR, "*.bin"))
print(f"更新後の血統データファイル数: {len(updated_html_files_peds)}")
print(f"馬情報ファイル数: {len(html_files_horse)}")
print(f"差分: {len(html_files_horse) - len(updated_html_files_peds)}")

In [None]:
# まず少数でテスト（最初の100件）
test_missing_ids = missing_ped_ids_list[:100]
print(f"テスト対象: {len(test_missing_ids)}件")
test_ped_files = scrape_html_ped(test_missing_ids, skip=False)
print(f"テスト完了: {len(test_ped_files)}件")

# 成功を確認後、残りを実行
remaining_ids = missing_ped_ids_list[100:]
remaining_ped_files = scrape_html_ped(remaining_ids, skip=False)

In [None]:
# 血統データスクレイピングのテスト: 少数の馬でテスト実行
# 先頭5頭の馬IDを使用してテスト
test_horse_ids = horse_id_test_list[:5]  # 最初の5頭でテスト
print(f"テスト対象馬ID: {test_horse_ids}")

%autoreload

In [None]:
# 血統HTMLファイルのスクレイピングテスト（5頭のみ）
print("血統HTMLファイルのスクレイピングを開始...")
html_files_peds_test = preparing.scrape_html_ped(test_horse_ids, skip=False)
print(f"スクレイピング完了: {len(html_files_peds_test)}件のHTMLファイル")

# 取得されたファイルパスの確認
if html_files_peds_test:
    print("取得されたファイル:")
    for i, file_path in enumerate(html_files_peds_test[:3]):  # 最初の3件表示
        print(f"  {i+1}: {file_path}")
    if len(html_files_peds_test) > 3:
        print(f"  ... 他{len(html_files_peds_test)-3}件")
else:
    print("取得されたHTMLファイルがありません（既存ファイルがスキップされた可能性）")

In [None]:
# 血統テーブルの作成テスト
print("血統テーブルの作成を開始...")
peds_test = preparing.get_rawdata_peds(html_files_peds_test)

print(f"血統テーブル作成完了: {peds_test.shape}")
if len(peds_test) > 0:
    print(f"列名: {list(peds_test.columns)}")
    print("血統データサンプル:")
    display(peds_test.head())
    
    # 各列のnull値確認
    print("\n各列のnull値の数:")
    print(peds_test.isnull().sum())
else:
    print("血統テーブルが空です。HTMLファイルの構造を確認します。")

In [None]:
# 血統HTMLファイルの構造解析
import re
from bs4 import BeautifulSoup

# テスト用に1つの血統HTMLファイルを詳しく調べる
test_ped_file = html_files_peds_test[0]
print(f"解析対象ファイル: {test_ped_file}")

# ファイルを読み込み
with open(test_ped_file, 'rb') as f:
    ped_html = f.read()

print(f"HTMLファイルサイズ: {len(ped_html)} bytes")

# エンコーディングテスト
encodings = ['utf-8', 'euc-jp', 'cp932']
ped_text = None
successful_encoding = None

for encoding in encodings:
    try:
        ped_text = ped_html.decode(encoding)
        successful_encoding = encoding
        print(f"デコード成功: {encoding}")
        break
    except UnicodeDecodeError:
        print(f"デコード失敗: {encoding}")
        continue

if ped_text:
    # BeautifulSoupで解析
    ped_soup = BeautifulSoup(ped_text, 'lxml')
    
    # HTMLの基本情報
    title = ped_soup.find('title')
    print(f"ページタイトル: {title.text if title else 'なし'}")
    
    # テーブル要素の確認
    tables = ped_soup.find_all('table')
    print(f"テーブル数: {len(tables)}")
    
    for i, table in enumerate(tables[:5]):  # 最初の5テーブルを確認
        attrs = table.attrs
        class_names = attrs.get('class', [])
        summary = attrs.get('summary', '')
        print(f"テーブル {i+1}: class={class_names}, summary='{summary}'")
        
        # テーブル内容の簡単な確認
        try:
            df_ped = pd.read_html(str(table))[0]
            print(f"  形状: {df_ped.shape}")
            if len(df_ped) > 0 and df_ped.shape[1] > 0:
                print(f"  列名: {list(df_ped.columns[:3])}")
                print(f"  最初の行: {df_ped.iloc[0].values[:3]}")
        except Exception as e:
            print(f"  読み込みエラー: {e}")
        
        # 血統らしいキーワードを含むテーブルかチェック
        table_text = table.get_text()
        if any(keyword in table_text for keyword in ['父', '母', '祖父', '血統']):
            print(f"  → 血統関連テーブルの可能性あり")
    
    # 血統ツリー構造の確認（td要素のクラス名など）
    print("\n血統ツリー関連の要素:")
    bloodline_elements = ped_soup.find_all(['td', 'div'], class_=re.compile(r'(blood|ped|pedigree)', re.I))
    print(f"血統関連要素数: {len(bloodline_elements)}")
    
    if bloodline_elements:
        for elem in bloodline_elements[:3]:
            print(f"  要素: {elem.name}, class: {elem.get('class')}, text: {elem.get_text()[:50]}...")
            
else:
    print("すべてのエンコーディングでデコードに失敗しました")

In [None]:
# get_rawdata_peds関数の修正版を作成・テスト
def get_rawdata_peds_fixed(html_path_list: list):
    """
    horse/pedページのhtmlを受け取って、血統のDataFrameに変換する関数（修正版）。
    - EUC-JP優先でデコード
    - 血統テーブルを確実に特定
    - 血統horse_idを確実に抽出
    """
    print('preparing raw peds table (fixed version)')
    peds = {}
    
    for html_path in tqdm(html_path_list):
        try:
            with open(html_path, 'rb') as f:
                raw = f.read()

            # 1) エンコーディング優先順位: EUC-JP → UTF-8 → CP932
            text = None
            for encoding in ['euc-jp', 'utf-8', 'cp932']:
                try:
                    text = raw.decode(encoding)
                    break
                except UnicodeDecodeError:
                    continue
            
            if text is None:
                print(f'エンコーディング失敗: {html_path}')
                continue

            # horse_idを取得
            horse_id = re.findall(r'ped\W(\d+)\.bin', html_path)[0]

            # htmlをsoupオブジェクトに変換
            soup = BeautifulSoup(text, "lxml")

            peds_id_list = []

            # 2) 血統データからhorse_idを取得する
            blood_table = soup.find("table", attrs={"summary": "5代血統表"})
            if blood_table is None:
                # 代替検索
                blood_table = soup.find("table", class_="blood_table")
                
            if blood_table is None:
                print(f'血統テーブル見つからず: {html_path}')
                continue

            horse_a_list = blood_table.find_all("a", attrs={"href": re.compile(r"^/horse/\w{10}")})

            for a in horse_a_list:
                # 血統データのhorse_idを抜き出す
                work_peds_id = re.findall(r'/horse/(\w{10})', a["href"])[0]
                peds_id_list.append(work_peds_id)

            peds[horse_id] = peds_id_list
            print(f'血統ID取得成功 {horse_id}: {len(peds_id_list)}個')

        except Exception as e:
            print(f'処理エラー {html_path}: {e}')
            continue

    if not peds:
        print('処理できた血統データがありません')
        return pd.DataFrame()

    # pd.DataFrame型にして一つのデータにまとめて、列名をpeds_0, ..., peds_61にする
    peds_df = pd.DataFrame.from_dict(peds, orient='index').add_prefix('peds_')
    print(f'血統データ処理完了: {peds_df.shape}')
    return peds_df

# 修正版でテスト実行
peds_test_fixed = get_rawdata_peds_fixed(html_files_peds_test)
print(f"\n修正版の結果: {peds_test_fixed.shape}")
if len(peds_test_fixed) > 0:
    print(f"列名サンプル: {list(peds_test_fixed.columns[:10])}")
    print("血統データサンプル:")
    display(peds_test_fixed.head())

In [None]:
# 血統テーブル内のリンク構造を詳しく調査
test_ped_file = html_files_peds_test[0]

with open(test_ped_file, 'rb') as f:
    raw = f.read()

# EUC-JPでデコード
text = raw.decode('euc-jp')
soup = BeautifulSoup(text, "lxml")

# 血統テーブルを取得
blood_table = soup.find("table", attrs={"summary": "5代血統表"})
if blood_table:
    print("血統テーブル発見!")
    
    # テーブル内のすべてのリンクを確認
    all_links = blood_table.find_all("a")
    print(f"血統テーブル内の全リンク数: {len(all_links)}")
    
    # 最初の10リンクを詳しく確認
    for i, link in enumerate(all_links[:10]):
        href = link.get('href', '')
        text_content = link.get_text()
        print(f"リンク {i+1}: href='{href}', text='{text_content}'")
    
    # horse関連のリンクパターンを確認
    horse_patterns = [
        r"^/horse/\w{10}",  # 現在のパターン
        r"/horse/",         # より広いパターン
        r"horse",           # 最も広いパターン
    ]
    
    for pattern in horse_patterns:
        matches = blood_table.find_all("a", attrs={"href": re.compile(pattern)})
        print(f"パターン '{pattern}' にマッチするリンク数: {len(matches)}")
        if matches:
            for j, match in enumerate(matches[:3]):
                print(f"  例{j+1}: {match.get('href')}")
    
    # 実際のhorse_idの抽出テスト
    print("\n実際のhorse_id抽出テスト:")
    for link in all_links[:5]:
        href = link.get('href', '')
        if '/horse/' in href:
            print(f"リンク: {href}")
            # 異なる抽出パターンをテスト
            patterns = [
                r'/horse/(\w{10})',
                r'/horse/(\w+)',
                r'horse/(\w+)',
            ]
            for p in patterns:
                matches = re.findall(p, href)
                if matches:
                    print(f"  パターン '{p}': {matches}")
else:
    print("血統テーブルが見つかりません")

In [None]:
# 修正版のhorse_id抽出パターンを作成・テスト
import re

# 正しいパターンで抽出テスト
correct_pattern = r'https://db\.netkeiba\.com/horse/(\w{10})/$'

test_ped_file = html_files_peds_test[0]
with open(test_ped_file, 'rb') as f:
    raw = f.read()

text = raw.decode('euc-jp')
soup = BeautifulSoup(text, "lxml")
blood_table = soup.find("table", attrs={"summary": "5代血統表"})

if blood_table:
    # 馬の詳細ページのみを抽出
    horse_links = blood_table.find_all("a", attrs={"href": re.compile(correct_pattern)})
    print(f"馬の詳細ページリンク数: {len(horse_links)}")
    
    # horse_idを抽出
    horse_ids = []
    for link in horse_links:
        href = link.get('href')
        match = re.search(correct_pattern, href)
        if match:
            horse_id = match.group(1)
            horse_ids.append(horse_id)
            horse_name = link.get_text().strip()
            print(f"  {horse_id}: {horse_name}")
    
    print(f"\n抽出されたhorse_id数: {len(horse_ids)}")
    print(f"ユニークなhorse_id数: {len(set(horse_ids))}")
else:
    print("血統テーブルが見つかりません")

In [None]:
# 修正版get_rawdata_peds関数を作成・テスト
def get_rawdata_peds_fixed(horse_id_list):
    """
    血統データを取得する修正版関数
    """
    
    peds_new = pd.DataFrame()
    
    for horse_id in tqdm(horse_id_list):
        try:
            # ファイルパスを作成
            html_path = f"data/html/ped/{horse_id}.bin"
            
            # ファイルが存在するかチェック
            if not os.path.exists(html_path):
                print(f"ファイルが見つかりません: {html_path}")
                continue
            
            # HTMLファイルを読み込み
            with open(html_path, 'rb') as f:
                raw = f.read()
            
            # エンコーディングを試行（UTF-8 → EUC-JP → CP932）
            for encoding in ['utf-8', 'euc-jp', 'cp932']:
                try:
                    text = raw.decode(encoding)
                    break
                except UnicodeDecodeError:
                    continue
            else:
                print(f"デコードに失敗しました: {horse_id}")
                continue
            
            # BeautifulSoupでパース
            soup = BeautifulSoup(text, "lxml")
            
            # 血統テーブルを検索
            blood_table = soup.find("table", attrs={"summary": "5代血統表"})
            
            if blood_table is None:
                print(f"血統テーブルが見つかりません: {horse_id}")
                continue
            
            # 修正された正規表現パターンで血統IDを抽出
            pattern = r'https://db\.netkeiba\.com/horse/(\w{10})/$'
            horse_links = blood_table.find_all("a", attrs={"href": re.compile(pattern)})
            
            # horse_idを抽出
            peds_horse_ids = []
            for link in horse_links:
                href = link.get('href')
                match = re.search(pattern, href)
                if match:
                    peds_horse_ids.append(match.group(1))
            
            # 結果をDataFrameに追加
            if peds_horse_ids:
                temp_df = pd.DataFrame({
                    'horse_id': [horse_id] * len(peds_horse_ids),
                    'peds_horse_id': peds_horse_ids
                })
                peds_new = pd.concat([peds_new, temp_df], ignore_index=True)
                print(f"血統ID取得成功 {horse_id}: {len(peds_horse_ids)}個")
            else:
                print(f"血統IDが取得できませんでした: {horse_id}")
                
        except Exception as e:
            print(f"エラーが発生しました {horse_id}: {e}")
            continue
    
    return peds_new

# テスト用の血統HTMLファイルから馬IDを抽出
test_horse_ids = []
for file_path in html_files_peds_test:
    file_name = os.path.basename(file_path)
    horse_id = file_name.replace('.bin', '')
    test_horse_ids.append(horse_id)

print(f"テスト用馬ID: {test_horse_ids}")

# テスト実行
print("修正版get_rawdata_peds関数をテスト中...")
peds_new_fixed = get_rawdata_peds_fixed(test_horse_ids)
print(f"\n修正版関数の結果: {peds_new_fixed.shape}")
print(peds_new_fixed.head(10))

In [None]:
# モジュールをリロードして修正版をテスト
import importlib
import sys

# モジュールをリロード
if 'modules.preparing._get_rawdata' in sys.modules:
    del sys.modules['modules.preparing._get_rawdata']

from modules.preparing._get_rawdata import get_rawdata_peds

# 修正版の関数でテスト
print("元ファイルの修正版get_rawdata_peds関数をテスト中...")
peds_from_module = get_rawdata_peds(html_files_peds_test)
print(f"\nモジュール関数の結果: {peds_from_module.shape}")
print(peds_from_module.head())

In [None]:
# 血統データスクレイピングテストの結果確認
print("=== 血統データスクレイピングテスト結果 ===")
print(f"テスト馬数: {len(test_horse_ids)}頭")
print(f"取得血統データ: {peds_from_module.shape[0]}頭 × {peds_from_module.shape[1]}血統ID")
print(f"総血統ID数: {peds_from_module.shape[0] * peds_from_module.shape[1]}個")

# 各馬の血統ID数を確認
print(f"\n各馬の血統ID数:")
for horse_id in test_horse_ids:
    if horse_id in peds_from_module.index:
        non_null_count = peds_from_module.loc[horse_id].notna().sum()
        print(f"  {horse_id}: {non_null_count}個")

# サンプル血統データを表示
print(f"\n血統データサンプル（馬ID: {test_horse_ids[0]}）:")
sample_row = peds_from_module.loc[test_horse_ids[0]]
sample_peds = sample_row.dropna().head(10)
for i, (col, horse_id) in enumerate(sample_peds.items()):
    print(f"  {col}: {horse_id}")

print("\n血統データテーブル作成テスト: ✅ 成功")

In [None]:
import glob
import os

# LocalPathsからHTML血統ディレクトリを取得
ped_html_dir = LocalPaths.HTML_PED_DIR
print(f"血統HTMLディレクトリ: {ped_html_dir}")

# globでbinファイルを検索
html_files_peds = glob.glob(os.path.join(ped_html_dir, "*.bin"))
print(f"見つかったHTMLファイル数: {len(html_files_peds)}")

# 最初の5ファイルを表示
html_files_peds[:5]

In [None]:
# HTML数の詳細分析：馬の基本情報と血統データの差異調査
import os

# 馬の基本情報のIDリストを作成
horse_ids = set([os.path.splitext(os.path.basename(f))[0] for f in html_files_horse])
print(f"馬の基本情報のユニークID数: {len(horse_ids)}")

# 血統データのIDリストを作成
peds_ids = set([os.path.splitext(os.path.basename(f))[0] for f in html_files_peds])
print(f"血統データのユニークID数: {len(peds_ids)}")

print(f"\n数の違い: {len(horse_ids) - len(peds_ids)} 件")

# 血統データがない馬IDを特定
missing_peds = horse_ids - peds_ids
print(f"\n血統データがない馬IDの数: {len(missing_peds)}")

if len(missing_peds) > 0:
    print("\n血統データがない馬IDの例（最初の10件）:")
    missing_list = sorted(list(missing_peds))
    for horse_id in missing_list[:10]:
        print(f"  {horse_id}")
        
    # 年代別の分析
    print("\n年代別の分析（血統データ不足）:")
    year_analysis = {}
    for horse_id in missing_peds:
        year = horse_id[:4]  # 馬IDの最初の4文字が年
        year_analysis[year] = year_analysis.get(year, 0) + 1
    
    for year in sorted(year_analysis.keys()):
        print(f"  {year}年: {year_analysis[year]}件")

# 逆に馬の基本情報がない血統データがあるかも確認
missing_horse_info = peds_ids - horse_ids
print(f"\n馬の基本情報がない血統データの数: {len(missing_horse_info)}")
if len(missing_horse_info) > 0:
    print("馬の基本情報がない血統IDの例（最初の5件）:")
    for ped_id in list(missing_horse_info)[:5]:
        print(f"  {ped_id}")

In [None]:
# スクレイピングプロセスの差異調査
print("=== スクレイピングプロセス差異分析 ===")

# 1. データ収集時期の違いを確認
print("\n1. 収集データの時期分析:")
horse_years = {}
peds_years = {}

for f in html_files_horse:
    horse_id = os.path.splitext(os.path.basename(f))[0]
    year = horse_id[:4] if horse_id[:4].isdigit() else "unknown"
    horse_years[year] = horse_years.get(year, 0) + 1

for f in html_files_peds:
    ped_id = os.path.splitext(os.path.basename(f))[0]
    year = ped_id[:4] if ped_id[:4].isdigit() else "unknown"
    peds_years[year] = peds_years.get(year, 0) + 1

print("年別収集数の比較:")
all_years = sorted(set(horse_years.keys()) | set(peds_years.keys()))
for year in all_years:
    horse_count = horse_years.get(year, 0)
    peds_count = peds_years.get(year, 0)
    diff = horse_count - peds_count
    print(f"  {year}年: 馬情報{horse_count:5d}件, 血統{peds_count:5d}件, 差分{diff:5d}件")

# 2. ファイルサイズの分析（スクレイピング成功率の推定）
print("\n2. ファイルサイズ分析（サンプル）:")
import random

sample_horse_files = random.sample(html_files_horse, min(50, len(html_files_horse)))
sample_peds_files = random.sample(html_files_peds, min(50, len(html_files_peds)))

horse_sizes = [os.path.getsize(f) for f in sample_horse_files if os.path.exists(f)]
peds_sizes = [os.path.getsize(f) for f in sample_peds_files if os.path.exists(f)]

if horse_sizes and peds_sizes:
    print(f"馬情報ファイル平均サイズ: {sum(horse_sizes)/len(horse_sizes):.0f} bytes")
    print(f"血統ファイル平均サイズ: {sum(peds_sizes)/len(peds_sizes):.0f} bytes")
    print(f"小さすぎるファイル（<1000 bytes）の割合:")
    small_horse = sum(1 for s in horse_sizes if s < 1000) / len(horse_sizes) * 100
    small_peds = sum(1 for s in peds_sizes if s < 1000) / len(peds_sizes) * 100
    print(f"  馬情報: {small_horse:.1f}%")
    print(f"  血統: {small_peds:.1f}%")

In [None]:
peds_new = preparing.get_rawdata_peds(html_files_peds) #血統テーブルの作成
preparing.update_rawdata(LocalPaths.RAW_PEDS_PATH, peds_new) #テーブルの更新

In [None]:
display(peds_new)

# 3. データ加工

In [2]:
#モジュールを更新した際、notebookに反映させるために使用。
#すでにインポートしてあるモジュールの更新が反映される。
%autoreload

In [3]:
#前処理
results_processor = preprocessing.ResultsProcessor(filepath=LocalPaths.RAW_RESULTS_PATH)
race_info_processor = preprocessing.RaceInfoProcessor(filepath=LocalPaths.RAW_RACE_INFO_PATH)
return_processor = preprocessing.ReturnProcessor(filepath=LocalPaths.RAW_RETURN_TABLES_PATH)
horse_info_processor = preprocessing.HorseInfoProcessor(
    filepath=LocalPaths.RAW_HORSE_INFO_PATH)
horse_results_processor = preprocessing.HorseResultsProcessor(
    filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
peds_processor = preprocessing.PedsProcessor(filepath=LocalPaths.RAW_PEDS_PATH)

              着順  枠番  馬番         馬名  性齢    斤量    騎手     タイム     着差     単勝  \
202301010101   1   5   5    サトミノキラリ  牡2  55.0  横山武史  1:09.5    NaN    1.2   
202301010101   2   8   8     ベアゴーゴー  牝2  55.0   浜中俊  1:09.5     クビ    4.1   
202301010101   3   6   6  ハピアーザンエバー  牡2  55.0  藤岡佑介  1:10.0  2.1/2   59.9   
202301010101   4   4   4  デビルシズカチャン  牝2  55.0  ルメール  1:10.2  1.1/2   16.6   
202301010101   5   1   1   ウィスピースノー  牝2  55.0  吉田隼人  1:10.3    1/2   23.9   
...           ..  ..  ..        ...  ..   ...   ...     ...    ...    ...   
202509050612  10   4   6   ウルトラソニック  牡7  58.0  田口貫太  1:22.1    3/4  186.3   
202509050612  11   3   4      ミストレス  牝3  55.0  北村友一  1:22.4  1.1/2   24.1   
202509050612  12   6   9  ウイントワイライト  牝3  55.0    武豊  1:22.4     ハナ    4.5   
202509050612  13   4   5    グランデサラス  牡5  58.0  池添謙一  1:22.5    1/2   25.8   
202509050612  14   5   7  エヴァンスウィート  牝4  56.0  ルメール  1:22.5    1/2   19.5   

                人気       馬体重       調教師    horse_id jockey_id trainer_id  \


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[Cols.PRIZE].fillna(0, inplace=True)


In [None]:
# 騎手成績特徴量（直近10・50レース複勝率）の作成
from modules.constants import LocalPaths
from modules import preprocessing
import os
import pandas as pd

# data/tmp ディレクトリを作成
os.makedirs(LocalPaths.TMP_DIR, exist_ok=True)

# RAW_HORSE_RESULTS から騎手複勝率特徴量を作成して保存
jockey_stats_processor = preprocessing.JockeyStatsProcessor(filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
jockey_stats = jockey_stats_processor.preprocessed_data
jockey_stats.to_pickle(LocalPaths.JOCKEY_STATS_PATH)
jockey_stats.head()

馬の過去成績を集計しつつ、前処理の済みの全てのテーブルをマージする処理

In [4]:
# ターゲットエンコーディング時に「馬の成績」として扱う項目
TARGET_COLS = [
        HorseResultsCols.RANK,
        HorseResultsCols.PRIZE,
        HorseResultsCols.RANK_DIFF, 
        'first_corner',
        'final_corner',
        'first_to_rank',
        'first_to_final',
        'final_to_rank',
        'time_seconds'
        ]
# horse_id列と共に、ターゲットエンコーディングの対象にする列
GROUP_COLS = [
        'course_len',
        'race_type',
        HorseResultsCols.PLACE
        ]

data_merger = preprocessing.DataMerger(
        results_processor,
        race_info_processor,
        horse_results_processor,
        horse_info_processor,
        peds_processor,
        target_cols=TARGET_COLS,
        group_cols=GROUP_COLS
)
# 処理実行
data_merger.merge()

separating horse results by date


  0%|          | 0/639 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/639 [00:00<?, ?it/s]

In [5]:
#カテゴリ変数の処理
feature_enginnering = preprocessing.FeatureEngineering(data_merger)\
    .add_interval()\
    .add_agedays()\
    .dumminize_ground_state()\
    .dumminize_race_type()\
    .dumminize_sex()\
    .dumminize_weather()\
    .encode_horse_id()\
    .encode_jockey_id()\
    .encode_trainer_id()\
    .encode_owner_id()\
    .encode_breeder_id()\
    .dumminize_kaisai()\
    .dumminize_around()\
    .dumminize_race_class()

In [None]:
#保存
#tmpは一時保存用のディレクトリ
feature_enginnering.featured_data.to_pickle('data/tmp/featured_data_20251225.pickle')

# 4. 学習

In [29]:
keiba_ai = training.KeibaAIFactory.create(feature_enginnering.featured_data) #モデル作成
keiba_ai.train_with_tuning() #パラメータチューニングをして学習

[I 2025-12-27 01:26:06,580] A new study created in memory with name: no-name-2e743245-5aa2-4699-b9d1-944a95184901
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021359 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


feature_fraction, val_score: 0.288220:  14%|#4        | 1/7 [00:21<02:06, 21.16s/it][I 2025-12-27 01:26:28,013] Trial 0 finished with value: 0.28822048850714405 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.28822048850714405.
feature_fraction, val_score: 0.288220:  14%|#4        | 1/7 [00:21<02:06, 21.16s/it]

Early stopping, best iteration is:
[72]	valid_0's binary_logloss: 0.138454	valid_1's binary_logloss: 0.28822
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.221479 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


feature_fraction, val_score: 0.288220:  29%|##8       | 2/7 [00:38<01:35, 19.07s/it][I 2025-12-27 01:26:45,604] Trial 1 finished with value: 0.2895829757905281 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.28822048850714405.
feature_fraction, val_score: 0.288220:  29%|##8       | 2/7 [00:38<01:35, 19.07s/it]

Early stopping, best iteration is:
[56]	valid_0's binary_logloss: 0.143124	valid_1's binary_logloss: 0.289583
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.205241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


feature_fraction, val_score: 0.288220:  43%|####2     | 3/7 [00:58<01:16, 19.22s/it][I 2025-12-27 01:27:04,995] Trial 2 finished with value: 0.2882235055123501 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.28822048850714405.
feature_fraction, val_score: 0.288220:  43%|####2     | 3/7 [00:58<01:16, 19.22s/it]

Early stopping, best iteration is:
[70]	valid_0's binary_logloss: 0.134533	valid_1's binary_logloss: 0.288224
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.196964 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


feature_fraction, val_score: 0.287443:  57%|#####7    | 4/7 [01:17<00:58, 19.38s/it][I 2025-12-27 01:27:24,643] Trial 3 finished with value: 0.2874434867420682 and parameters: {'feature_fraction': 0.8}. Best is trial 3 with value: 0.2874434867420682.
feature_fraction, val_score: 0.287443:  57%|#####7    | 4/7 [01:17<00:58, 19.38s/it]

Early stopping, best iteration is:
[84]	valid_0's binary_logloss: 0.130317	valid_1's binary_logloss: 0.287443
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.191931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


feature_fraction, val_score: 0.287443:  71%|#######1  | 5/7 [01:35<00:37, 18.75s/it][I 2025-12-27 01:27:42,253] Trial 4 finished with value: 0.28876709951369856 and parameters: {'feature_fraction': 0.7}. Best is trial 3 with value: 0.2874434867420682.
feature_fraction, val_score: 0.287443:  71%|#######1  | 5/7 [01:35<00:37, 18.75s/it]

Early stopping, best iteration is:
[53]	valid_0's binary_logloss: 0.143205	valid_1's binary_logloss: 0.288767
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042974 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


feature_fraction, val_score: 0.287443:  86%|########5 | 6/7 [01:55<00:19, 19.08s/it][I 2025-12-27 01:28:01,983] Trial 5 finished with value: 0.28814259725684077 and parameters: {'feature_fraction': 1.0}. Best is trial 3 with value: 0.2874434867420682.
feature_fraction, val_score: 0.287443:  86%|########5 | 6/7 [01:55<00:19, 19.08s/it]

Early stopping, best iteration is:
[75]	valid_0's binary_logloss: 0.132587	valid_1's binary_logloss: 0.288143
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.180446 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286787


feature_fraction, val_score: 0.286536: 100%|##########| 7/7 [02:14<00:00, 19.04s/it][I 2025-12-27 01:28:20,936] Trial 6 finished with value: 0.2865361111344026 and parameters: {'feature_fraction': 0.4}. Best is trial 6 with value: 0.2865361111344026.
feature_fraction, val_score: 0.286536: 100%|##########| 7/7 [02:14<00:00, 19.16s/it]


Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286536


num_leaves, val_score: 0.286536:   0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015872 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.0674387	valid_1's binary_logloss: 0.291936


num_leaves, val_score: 0.286536:   5%|5         | 1/20 [00:23<07:18, 23.09s/it][I 2025-12-27 01:28:44,077] Trial 7 finished with value: 0.2918310687164613 and parameters: {'num_leaves': 140}. Best is trial 7 with value: 0.2918310687164613.
num_leaves, val_score: 0.286536:   5%|5         | 1/20 [00:23<07:18, 23.09s/it]

Early stopping, best iteration is:
[94]	valid_0's binary_logloss: 0.0707538	valid_1's binary_logloss: 0.291831
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.181616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.0997267	valid_1's binary_logloss: 0.290662


num_leaves, val_score: 0.286536:  10%|#         | 2/20 [00:42<06:15, 20.89s/it][I 2025-12-27 01:29:03,407] Trial 8 finished with value: 0.2905781656735913 and parameters: {'num_leaves': 72}. Best is trial 8 with value: 0.2905781656735913.
num_leaves, val_score: 0.286536:  10%|#         | 2/20 [00:42<06:15, 20.89s/it]

Early stopping, best iteration is:
[106]	valid_0's binary_logloss: 0.0970037	valid_1's binary_logloss: 0.290578
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.179677 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536:  15%|#5        | 3/20 [01:00<05:35, 19.75s/it][I 2025-12-27 01:29:21,807] Trial 9 finished with value: 0.29267044171217066 and parameters: {'num_leaves': 110}. Best is trial 8 with value: 0.2905781656735913.
num_leaves, val_score: 0.286536:  15%|#5        | 3/20 [01:00<05:35, 19.75s/it]

Early stopping, best iteration is:
[67]	valid_0's binary_logloss: 0.102051	valid_1's binary_logloss: 0.29267
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536:  20%|##        | 4/20 [01:23<05:35, 20.99s/it][I 2025-12-27 01:29:44,723] Trial 10 finished with value: 0.3002227648089663 and parameters: {'num_leaves': 251}. Best is trial 8 with value: 0.2905781656735913.
num_leaves, val_score: 0.286536:  20%|##        | 4/20 [01:23<05:35, 20.99s/it]

Early stopping, best iteration is:
[77]	valid_0's binary_logloss: 0.05297	valid_1's binary_logloss: 0.300223
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536:  25%|##5       | 5/20 [01:45<05:19, 21.27s/it][I 2025-12-27 01:30:06,465] Trial 11 finished with value: 0.29789754565555254 and parameters: {'num_leaves': 254}. Best is trial 8 with value: 0.2905781656735913.
num_leaves, val_score: 0.286536:  25%|##5       | 5/20 [01:45<05:19, 21.27s/it]

Early stopping, best iteration is:
[63]	valid_0's binary_logloss: 0.0641185	valid_1's binary_logloss: 0.297898
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021777 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.260245	valid_1's binary_logloss: 0.377938
[200]	valid_0's binary_logloss: 0.252895	valid_1's binary_logloss: 0.372058
[300]	valid_0's binary_logloss: 0.249941	valid_1's binary_logloss: 0.369272
[400]	valid_0's binary_logloss: 0.24

num_leaves, val_score: 0.286536:  30%|###       | 6/20 [02:24<06:24, 27.46s/it][I 2025-12-27 01:30:45,950] Trial 12 finished with value: 0.3653142953483841 and parameters: {'num_leaves': 2}. Best is trial 8 with value: 0.2905781656735913.
num_leaves, val_score: 0.286536:  30%|###       | 6/20 [02:25<06:24, 27.46s/it]

Early stopping, best iteration is:
[601]	valid_0's binary_logloss: 0.245918	valid_1's binary_logloss: 0.365314
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017381 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536:  35%|###5      | 7/20 [02:47<05:35, 25.78s/it][I 2025-12-27 01:31:08,265] Trial 13 finished with value: 0.29544814128671626 and parameters: {'num_leaves': 181}. Best is trial 8 with value: 0.2905781656735913.
num_leaves, val_score: 0.286536:  35%|###5      | 7/20 [02:47<05:35, 25.78s/it]

Early stopping, best iteration is:
[81]	valid_0's binary_logloss: 0.0660403	valid_1's binary_logloss: 0.295448
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.15551	valid_1's binary_logloss: 0.289197


num_leaves, val_score: 0.286536:  40%|####      | 8/20 [03:11<05:02, 25.24s/it][I 2025-12-27 01:31:32,339] Trial 14 finished with value: 0.28721946524948483 and parameters: {'num_leaves': 8}. Best is trial 14 with value: 0.28721946524948483.
num_leaves, val_score: 0.286536:  40%|####      | 8/20 [03:11<05:02, 25.24s/it]

[200]	valid_0's binary_logloss: 0.146347	valid_1's binary_logloss: 0.287251
Early stopping, best iteration is:
[191]	valid_0's binary_logloss: 0.146971	valid_1's binary_logloss: 0.287219
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018512 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.154145	valid_1's binary_logloss: 0.288828


num_leaves, val_score: 0.286536:  45%|####5     | 9/20 [03:34<04:30, 24.61s/it][I 2025-12-27 01:31:55,588] Trial 15 finished with value: 0.2871673909394905 and parameters: {'num_leaves': 9}. Best is trial 15 with value: 0.2871673909394905.
num_leaves, val_score: 0.286536:  45%|####5     | 9/20 [03:34<04:30, 24.61s/it]

Early stopping, best iteration is:
[170]	valid_0's binary_logloss: 0.146843	valid_1's binary_logloss: 0.287167
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536:  50%|#####     | 10/20 [03:55<03:55, 23.58s/it][I 2025-12-27 01:32:16,854] Trial 16 finished with value: 0.2899561599944775 and parameters: {'num_leaves': 59}. Best is trial 15 with value: 0.2871673909394905.
num_leaves, val_score: 0.286536:  50%|#####     | 10/20 [03:55<03:55, 23.58s/it]

[100]	valid_0's binary_logloss: 0.1082	valid_1's binary_logloss: 0.292635
Early stopping, best iteration is:
[90]	valid_0's binary_logloss: 0.113281	valid_1's binary_logloss: 0.289956
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.187857 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536:  55%|#####5    | 11/20 [04:13<03:15, 21.77s/it][I 2025-12-27 01:32:34,502] Trial 17 finished with value: 0.29130615430760254 and parameters: {'num_leaves': 55}. Best is trial 15 with value: 0.2871673909394905.
num_leaves, val_score: 0.286536:  55%|#####5    | 11/20 [04:13<03:15, 21.77s/it]

Early stopping, best iteration is:
[70]	valid_0's binary_logloss: 0.126687	valid_1's binary_logloss: 0.291306
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021326 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536:  60%|######    | 12/20 [04:35<02:55, 21.96s/it][I 2025-12-27 01:32:56,898] Trial 18 finished with value: 0.2958820742950891 and parameters: {'num_leaves': 188}. Best is trial 15 with value: 0.2871673909394905.
num_leaves, val_score: 0.286536:  60%|######    | 12/20 [04:35<02:55, 21.96s/it]

Early stopping, best iteration is:
[77]	valid_0's binary_logloss: 0.0674175	valid_1's binary_logloss: 0.295882
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.182455 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536:  65%|######5   | 13/20 [04:54<02:25, 20.79s/it][I 2025-12-27 01:33:15,011] Trial 19 finished with value: 0.29246716181277294 and parameters: {'num_leaves': 126}. Best is trial 15 with value: 0.2871673909394905.
num_leaves, val_score: 0.286536:  65%|######5   | 13/20 [04:54<02:25, 20.79s/it]

Early stopping, best iteration is:
[66]	valid_0's binary_logloss: 0.0964828	valid_1's binary_logloss: 0.292467
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.185827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.129863	valid_1's binary_logloss: 0.288449
Early stopping, best iteration is:
[96]	valid_0's binary_logloss: 0.13093	valid_1's binary_logloss: 0.288348


num_leaves, val_score: 0.286536:  70%|#######   | 14/20 [05:12<01:59, 19.94s/it][I 2025-12-27 01:33:32,988] Trial 20 finished with value: 0.28834848838518107 and parameters: {'num_leaves': 32}. Best is trial 15 with value: 0.2871673909394905.
num_leaves, val_score: 0.286536:  70%|#######   | 14/20 [05:12<01:59, 19.94s/it]

[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.1807	valid_1's binary_logloss: 0.308116
[200]	valid_0's binary_logloss: 0.168755	valid_1's binary_logloss: 0.300182
[300]	valid_0's binary_logloss: 0.164814	valid_1's binary_logloss: 0.297022


num_leaves, val_score: 0.286536:  75%|#######5  | 15/20 [05:42<01:55, 23.03s/it][I 2025-12-27 01:34:03,174] Trial 21 finished with value: 0.29639414946876796 and parameters: {'num_leaves': 3}. Best is trial 15 with value: 0.2871673909394905.
num_leaves, val_score: 0.286536:  75%|#######5  | 15/20 [05:42<01:55, 23.03s/it]

Early stopping, best iteration is:
[346]	valid_0's binary_logloss: 0.163622	valid_1's binary_logloss: 0.296394
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536:  80%|########  | 16/20 [06:01<01:27, 21.99s/it][I 2025-12-27 01:34:22,741] Trial 22 finished with value: 0.29308187674455494 and parameters: {'num_leaves': 89}. Best is trial 15 with value: 0.2871673909394905.
num_leaves, val_score: 0.286536:  80%|########  | 16/20 [06:01<01:27, 21.99s/it]

Early stopping, best iteration is:
[65]	valid_0's binary_logloss: 0.112157	valid_1's binary_logloss: 0.293082
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536:  85%|########5 | 17/20 [06:21<01:04, 21.45s/it][I 2025-12-27 01:34:42,927] Trial 23 finished with value: 0.28716591944778347 and parameters: {'num_leaves': 25}. Best is trial 23 with value: 0.28716591944778347.
num_leaves, val_score: 0.286536:  85%|########5 | 17/20 [06:21<01:04, 21.45s/it]

[100]	valid_0's binary_logloss: 0.136415	valid_1's binary_logloss: 0.289103
Early stopping, best iteration is:
[92]	valid_0's binary_logloss: 0.138612	valid_1's binary_logloss: 0.287166
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.189909 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536:  90%|######### | 18/20 [06:39<00:40, 20.29s/it][I 2025-12-27 01:35:00,517] Trial 24 finished with value: 0.28843052216649245 and parameters: {'num_leaves': 30}. Best is trial 23 with value: 0.28716591944778347.
num_leaves, val_score: 0.286536:  90%|######### | 18/20 [06:39<00:40, 20.29s/it]

Early stopping, best iteration is:
[89]	valid_0's binary_logloss: 0.135531	valid_1's binary_logloss: 0.288431
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.128067	valid_1's binary_logloss: 0.288449


num_leaves, val_score: 0.286536:  95%|#########5| 19/20 [07:00<00:20, 20.42s/it][I 2025-12-27 01:35:21,242] Trial 25 finished with value: 0.2883464677540201 and parameters: {'num_leaves': 34}. Best is trial 23 with value: 0.28716591944778347.
num_leaves, val_score: 0.286536:  95%|#########5| 19/20 [07:00<00:20, 20.42s/it]

Early stopping, best iteration is:
[107]	valid_0's binary_logloss: 0.125904	valid_1's binary_logloss: 0.288346
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.183750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.286536: 100%|##########| 20/20 [07:18<00:00, 19.75s/it][I 2025-12-27 01:35:39,433] Trial 26 finished with value: 0.2929732419050813 and parameters: {'num_leaves': 85}. Best is trial 23 with value: 0.28716591944778347.
num_leaves, val_score: 0.286536: 100%|##########| 20/20 [07:18<00:00, 21.92s/it]


Early stopping, best iteration is:
[77]	valid_0's binary_logloss: 0.105642	valid_1's binary_logloss: 0.292973


bagging, val_score: 0.286536:   0%|          | 0/10 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022230 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.286536:  10%|#         | 1/10 [00:19<02:53, 19.29s/it]

[100]	valid_0's binary_logloss: 0.131252	valid_1's binary_logloss: 0.29054
Early stopping, best iteration is:
[92]	valid_0's binary_logloss: 0.133478	valid_1's binary_logloss: 0.287797


[I 2025-12-27 01:35:58,762] Trial 27 finished with value: 0.2877973846322416 and parameters: {'bagging_fraction': 0.509934059998701, 'bagging_freq': 2}. Best is trial 27 with value: 0.2877973846322416.
bagging, val_score: 0.286536:  10%|#         | 1/10 [00:19<02:53, 19.29s/it]

[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.182297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.286536:  20%|##        | 2/10 [00:36<02:25, 18.14s/it][I 2025-12-27 01:36:16,097] Trial 28 finished with value: 0.28980306753287305 and parameters: {'bagging_fraction': 0.995516796793986, 'bagging_freq': 7}. Best is trial 27 with value: 0.2877973846322416.
bagging, val_score: 0.286536:  20%|##        | 2/10 [00:36<02:25, 18.14s/it]

Early stopping, best iteration is:
[70]	valid_0's binary_logloss: 0.142695	valid_1's binary_logloss: 0.289803
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024499 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130662	valid_1's binary_logloss: 0.287552


bagging, val_score: 0.286536:  30%|###       | 3/10 [00:57<02:15, 19.32s/it][I 2025-12-27 01:36:36,825] Trial 29 finished with value: 0.28743200583109313 and parameters: {'bagging_fraction': 0.8730732883159426, 'bagging_freq': 6}. Best is trial 29 with value: 0.28743200583109313.
bagging, val_score: 0.286536:  30%|###       | 3/10 [00:57<02:15, 19.32s/it]

Early stopping, best iteration is:
[105]	valid_0's binary_logloss: 0.129127	valid_1's binary_logloss: 0.287432
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.178444 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.286536:  40%|####      | 4/10 [01:14<01:50, 18.38s/it][I 2025-12-27 01:36:53,741] Trial 30 finished with value: 0.28715287133981865 and parameters: {'bagging_fraction': 0.43883386690550286, 'bagging_freq': 1}. Best is trial 30 with value: 0.28715287133981865.
bagging, val_score: 0.286536:  40%|####      | 4/10 [01:14<01:50, 18.38s/it]

[100]	valid_0's binary_logloss: 0.130675	valid_1's binary_logloss: 0.289152
Early stopping, best iteration is:
[96]	valid_0's binary_logloss: 0.131697	valid_1's binary_logloss: 0.287153
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130095	valid_1's binary_logloss: 0.287813


bagging, val_score: 0.286536:  50%|#####     | 5/10 [01:34<01:34, 18.87s/it][I 2025-12-27 01:37:13,506] Trial 31 finished with value: 0.28778942868624074 and parameters: {'bagging_fraction': 0.4090027183840687, 'bagging_freq': 1}. Best is trial 30 with value: 0.28715287133981865.
bagging, val_score: 0.286536:  50%|#####     | 5/10 [01:34<01:34, 18.87s/it]

Early stopping, best iteration is:
[101]	valid_0's binary_logloss: 0.129808	valid_1's binary_logloss: 0.287789
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.176521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.286536:  60%|######    | 6/10 [01:51<01:13, 18.33s/it][I 2025-12-27 01:37:30,799] Trial 32 finished with value: 0.28893593190902794 and parameters: {'bagging_fraction': 0.6633158926117646, 'bagging_freq': 4}. Best is trial 30 with value: 0.28715287133981865.
bagging, val_score: 0.286536:  60%|######    | 6/10 [01:51<01:13, 18.33s/it]

[100]	valid_0's binary_logloss: 0.130867	valid_1's binary_logloss: 0.295273
Early stopping, best iteration is:
[91]	valid_0's binary_logloss: 0.133818	valid_1's binary_logloss: 0.288936
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.286536:  70%|#######   | 7/10 [02:10<00:55, 18.63s/it][I 2025-12-27 01:37:50,043] Trial 33 finished with value: 0.28861373835359855 and parameters: {'bagging_fraction': 0.6284215432565106, 'bagging_freq': 3}. Best is trial 30 with value: 0.28715287133981865.
bagging, val_score: 0.286536:  70%|#######   | 7/10 [02:10<00:55, 18.63s/it]

Early stopping, best iteration is:
[79]	valid_0's binary_logloss: 0.138316	valid_1's binary_logloss: 0.288614
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.188050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.286536:  80%|########  | 8/10 [02:27<00:36, 18.15s/it][I 2025-12-27 01:38:07,165] Trial 34 finished with value: 0.2875549725423499 and parameters: {'bagging_fraction': 0.4195069157329794, 'bagging_freq': 1}. Best is trial 30 with value: 0.28715287133981865.
bagging, val_score: 0.286536:  80%|########  | 8/10 [02:27<00:36, 18.15s/it]

[100]	valid_0's binary_logloss: 0.130587	valid_1's binary_logloss: 0.288456
Early stopping, best iteration is:
[94]	valid_0's binary_logloss: 0.132435	valid_1's binary_logloss: 0.287555
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018737 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130822	valid_1's binary_logloss: 0.28847


bagging, val_score: 0.286536:  90%|######### | 9/10 [02:48<00:19, 19.08s/it][I 2025-12-27 01:38:28,295] Trial 35 finished with value: 0.2882246085873174 and parameters: {'bagging_fraction': 0.7615508416103894, 'bagging_freq': 5}. Best is trial 30 with value: 0.28715287133981865.
bagging, val_score: 0.286536:  90%|######### | 9/10 [02:48<00:19, 19.08s/it]

Early stopping, best iteration is:
[114]	valid_0's binary_logloss: 0.126927	valid_1's binary_logloss: 0.288225
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017230 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.286536: 100%|##########| 10/10 [03:08<00:00, 19.14s/it][I 2025-12-27 01:38:47,576] Trial 36 finished with value: 0.2886736983539845 and parameters: {'bagging_fraction': 0.5526463176795604, 'bagging_freq': 3}. Best is trial 30 with value: 0.28715287133981865.
bagging, val_score: 0.286536: 100%|##########| 10/10 [03:08<00:00, 18.81s/it]


Early stopping, best iteration is:
[80]	valid_0's binary_logloss: 0.13799	valid_1's binary_logloss: 0.288674


feature_fraction_stage2, val_score: 0.286536:   0%|          | 0/3 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


feature_fraction_stage2, val_score: 0.286536:  33%|###3      | 1/3 [00:19<00:38, 19.21s/it][I 2025-12-27 01:39:06,827] Trial 37 finished with value: 0.28926112639045026 and parameters: {'feature_fraction': 0.44800000000000006}. Best is trial 37 with value: 0.28926112639045026.
feature_fraction_stage2, val_score: 0.286536:  33%|###3      | 1/3 [00:19<00:38, 19.21s/it]

Early stopping, best iteration is:
[72]	valid_0's binary_logloss: 0.140102	valid_1's binary_logloss: 0.289261
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


feature_fraction_stage2, val_score: 0.286536:  67%|######6   | 2/3 [00:40<00:20, 20.19s/it][I 2025-12-27 01:39:27,701] Trial 38 finished with value: 0.28734550650676494 and parameters: {'feature_fraction': 0.48000000000000004}. Best is trial 38 with value: 0.28734550650676494.
feature_fraction_stage2, val_score: 0.286536:  67%|######6   | 2/3 [00:40<00:20, 20.19s/it]

[100]	valid_0's binary_logloss: 0.129518	valid_1's binary_logloss: 0.287428
Early stopping, best iteration is:
[90]	valid_0's binary_logloss: 0.132595	valid_1's binary_logloss: 0.287346
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019659 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


feature_fraction_stage2, val_score: 0.286536: 100%|##########| 3/3 [01:00<00:00, 20.18s/it][I 2025-12-27 01:39:47,879] Trial 39 finished with value: 0.28723129224219257 and parameters: {'feature_fraction': 0.41600000000000004}. Best is trial 39 with value: 0.28723129224219257.
feature_fraction_stage2, val_score: 0.286536: 100%|##########| 3/3 [01:00<00:00, 20.10s/it]


[100]	valid_0's binary_logloss: 0.130268	valid_1's binary_logloss: 0.288859
Early stopping, best iteration is:
[91]	valid_0's binary_logloss: 0.133131	valid_1's binary_logloss: 0.287231


regularization_factors, val_score: 0.286536:   0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022021 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286514:   5%|5         | 1/20 [00:21<06:52, 21.69s/it][I 2025-12-27 01:40:09,629] Trial 40 finished with value: 0.28651393615061904 and parameters: {'lambda_l1': 4.456674336746826e-07, 'lambda_l2': 4.2917482574824355e-07}. Best is trial 40 with value: 0.28651393615061904.
regularization_factors, val_score: 0.286514:   5%|5         | 1/20 [00:21<06:52, 21.69s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286514
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.180981 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286514:  10%|#         | 2/20 [00:40<06:01, 20.08s/it][I 2025-12-27 01:40:28,566] Trial 41 finished with value: 0.2865139362067682 and parameters: {'lambda_l1': 1.1183531548969373e-07, 'lambda_l2': 1.8251521766817335e-07}. Best is trial 40 with value: 0.28651393615061904.
regularization_factors, val_score: 0.286514:  10%|#         | 2/20 [00:40<06:01, 20.08s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286514
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.179790 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286514:  15%|#5        | 3/20 [00:59<05:34, 19.67s/it][I 2025-12-27 01:40:47,733] Trial 42 finished with value: 0.28651393620927557 and parameters: {'lambda_l1': 1.455032697305869e-07, 'lambda_l2': 1.6247733464520895e-07}. Best is trial 40 with value: 0.28651393615061904.
regularization_factors, val_score: 0.286514:  15%|#5        | 3/20 [00:59<05:34, 19.67s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286514
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286788


regularization_factors, val_score: 0.286514:  20%|##        | 4/20 [01:21<05:26, 20.43s/it][I 2025-12-27 01:41:09,341] Trial 43 finished with value: 0.2865365912633633 and parameters: {'lambda_l1': 8.757761439044632e-08, 'lambda_l2': 1.1068300320940581e-07}. Best is trial 40 with value: 0.28651393615061904.
regularization_factors, val_score: 0.286514:  20%|##        | 4/20 [01:21<05:26, 20.43s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286537
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286514:  25%|##5       | 5/20 [01:43<05:13, 20.91s/it][I 2025-12-27 01:41:31,103] Trial 44 finished with value: 0.28652651567297077 and parameters: {'lambda_l1': 1.6699915501672861e-07, 'lambda_l2': 1.2562409992184724e-07}. Best is trial 40 with value: 0.28651393615061904.
regularization_factors, val_score: 0.286514:  25%|##5       | 5/20 [01:43<05:13, 20.91s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286527
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021532 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286764


regularization_factors, val_score: 0.286513:  30%|###       | 6/20 [02:04<04:56, 21.17s/it][I 2025-12-27 01:41:52,781] Trial 45 finished with value: 0.2865126201728413 and parameters: {'lambda_l1': 1.6235743568735296e-07, 'lambda_l2': 1.5132360519323943e-07}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  30%|###       | 6/20 [02:04<04:56, 21.17s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286513
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.187200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286787


regularization_factors, val_score: 0.286513:  35%|###5      | 7/20 [02:23<04:25, 20.38s/it][I 2025-12-27 01:42:11,542] Trial 46 finished with value: 0.2865361111090838 and parameters: {'lambda_l1': 1.3891096946467157e-07, 'lambda_l2': 1.326063411135187e-07}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  35%|###5      | 7/20 [02:23<04:25, 20.38s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286536
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.179796 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286789


regularization_factors, val_score: 0.286513:  40%|####      | 8/20 [02:42<03:58, 19.91s/it][I 2025-12-27 01:42:30,464] Trial 47 finished with value: 0.28653742713723146 and parameters: {'lambda_l1': 1.2870120549530697e-07, 'lambda_l2': 1.8648812432783536e-07}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  40%|####      | 8/20 [02:42<03:58, 19.91s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286537
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018177 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286513:  45%|####5     | 9/20 [03:04<03:45, 20.48s/it][I 2025-12-27 01:42:52,175] Trial 48 finished with value: 0.2865139361795194 and parameters: {'lambda_l1': 1.9730096824669728e-07, 'lambda_l2': 3.1909342626672e-07}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  45%|####5     | 9/20 [03:04<03:45, 20.48s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286514
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.182614 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286513:  50%|#####     | 10/20 [03:23<03:19, 20.00s/it][I 2025-12-27 01:43:11,077] Trial 49 finished with value: 0.28651393613169135 and parameters: {'lambda_l1': 2.911257759227175e-07, 'lambda_l2': 5.530116149769132e-07}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  50%|#####     | 10/20 [03:23<03:19, 20.00s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286514
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286513:  55%|#####5    | 11/20 [03:44<03:04, 20.51s/it][I 2025-12-27 01:43:32,767] Trial 50 finished with value: 0.28652651107010213 and parameters: {'lambda_l1': 5.298550045380915e-05, 'lambda_l2': 1.6120563199735317e-05}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  55%|#####5    | 11/20 [03:44<03:04, 20.51s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286527
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.205263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286513:  60%|######    | 12/20 [04:03<02:40, 20.09s/it][I 2025-12-27 01:43:51,883] Trial 51 finished with value: 0.28651339595120273 and parameters: {'lambda_l1': 2.54024490029693e-07, 'lambda_l2': 4.93736375477315e-07}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  60%|######    | 12/20 [04:04<02:40, 20.09s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286513
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286513:  65%|######5   | 13/20 [04:25<02:24, 20.60s/it][I 2025-12-27 01:44:13,671] Trial 52 finished with value: 0.28652651510628363 and parameters: {'lambda_l1': 8.757206051958077e-07, 'lambda_l2': 3.1004701789235313e-06}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  65%|######5   | 13/20 [04:25<02:24, 20.60s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286527
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017828 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130566	valid_1's binary_logloss: 0.288092
Early stopping, best iteration is:
[93]	valid_0's binary_logloss: 0.132815	valid_1's binary_logloss: 0.287322


regularization_factors, val_score: 0.286513:  70%|#######   | 14/20 [04:46<02:03, 20.55s/it][I 2025-12-27 01:44:34,096] Trial 53 finished with value: 0.28732185241707187 and parameters: {'lambda_l1': 0.8626828130544224, 'lambda_l2': 0.11986694470027486}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  70%|#######   | 14/20 [04:46<02:03, 20.55s/it]

[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286786


regularization_factors, val_score: 0.286513:  75%|#######5  | 15/20 [05:08<01:44, 20.97s/it][I 2025-12-27 01:44:56,050] Trial 54 finished with value: 0.2865346481130212 and parameters: {'lambda_l1': 2.7141500445891904e-06, 'lambda_l2': 3.4334315365448993e-06}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  75%|#######5  | 15/20 [05:08<01:44, 20.97s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286535
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015868 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286513:  80%|########  | 16/20 [05:29<01:24, 21.21s/it][I 2025-12-27 01:45:17,798] Trial 55 finished with value: 0.28652651569638093 and parameters: {'lambda_l1': 2.254782153135192e-08, 'lambda_l2': 1.0331878317039147e-08}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  80%|########  | 16/20 [05:29<01:24, 21.21s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286527
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.179895 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286513:  85%|########5 | 17/20 [05:48<01:01, 20.47s/it][I 2025-12-27 01:45:36,551] Trial 56 finished with value: 0.2865133954448064 and parameters: {'lambda_l1': 5.772328659072209e-06, 'lambda_l2': 2.3266153032249317e-06}. Best is trial 45 with value: 0.2865126201728413.
regularization_factors, val_score: 0.286513:  85%|########5 | 17/20 [05:48<01:01, 20.47s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286513
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017587 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.28676


regularization_factors, val_score: 0.286510:  90%|######### | 18/20 [06:10<00:41, 20.87s/it][I 2025-12-27 01:45:58,341] Trial 57 finished with value: 0.2865098405194251 and parameters: {'lambda_l1': 5.7934257175117564e-06, 'lambda_l2': 6.434980996929208e-06}. Best is trial 57 with value: 0.2865098405194251.
regularization_factors, val_score: 0.286510:  90%|######### | 18/20 [06:10<00:41, 20.87s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.28651
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130635	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286510:  95%|#########5| 19/20 [06:32<00:21, 21.17s/it][I 2025-12-27 01:46:20,232] Trial 58 finished with value: 0.28651392935049785 and parameters: {'lambda_l1': 3.7615556489660645e-06, 'lambda_l2': 3.817483010622526e-05}. Best is trial 57 with value: 0.2865098405194251.
regularization_factors, val_score: 0.286510:  95%|#########5| 19/20 [06:32<00:21, 21.17s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286514
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021621 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.130636	valid_1's binary_logloss: 0.286765


regularization_factors, val_score: 0.286510: 100%|##########| 20/20 [06:54<00:00, 21.44s/it][I 2025-12-27 01:46:42,300] Trial 59 finished with value: 0.28652649658034224 and parameters: {'lambda_l1': 1.2159699565668808e-05, 'lambda_l2': 0.00010483112517554437}. Best is trial 57 with value: 0.2865098405194251.
regularization_factors, val_score: 0.286510: 100%|##########| 20/20 [06:54<00:00, 20.72s/it]


Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125722	valid_1's binary_logloss: 0.286526


min_child_samples, val_score: 0.286510:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022753 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


min_child_samples, val_score: 0.286510:  20%|##        | 1/5 [00:20<01:20, 20.04s/it][I 2025-12-27 01:47:02,376] Trial 60 finished with value: 0.2884761876268841 and parameters: {'min_child_samples': 5}. Best is trial 60 with value: 0.2884761876268841.
min_child_samples, val_score: 0.286510:  20%|##        | 1/5 [00:20<01:20, 20.04s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.135917	valid_1's binary_logloss: 0.288476
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.182900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.131161	valid_1's binary_logloss: 0.287566


min_child_samples, val_score: 0.286510:  40%|####      | 2/5 [00:38<00:57, 19.17s/it][I 2025-12-27 01:47:20,942] Trial 61 finished with value: 0.2875371236711693 and parameters: {'min_child_samples': 50}. Best is trial 61 with value: 0.2875371236711693.
min_child_samples, val_score: 0.286510:  40%|####      | 2/5 [00:38<00:57, 19.17s/it]

Early stopping, best iteration is:
[108]	valid_0's binary_logloss: 0.128826	valid_1's binary_logloss: 0.287537
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.178893 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


min_child_samples, val_score: 0.286510:  60%|######    | 3/5 [00:56<00:37, 18.65s/it][I 2025-12-27 01:47:38,980] Trial 62 finished with value: 0.2881727911294642 and parameters: {'min_child_samples': 25}. Best is trial 61 with value: 0.2875371236711693.
min_child_samples, val_score: 0.286510:  60%|######    | 3/5 [00:56<00:37, 18.65s/it]

[100]	valid_0's binary_logloss: 0.13119	valid_1's binary_logloss: 0.295463
Early stopping, best iteration is:
[93]	valid_0's binary_logloss: 0.132956	valid_1's binary_logloss: 0.288173
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.181386 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds


min_child_samples, val_score: 0.286510:  80%|########  | 4/5 [01:14<00:18, 18.38s/it][I 2025-12-27 01:47:56,934] Trial 63 finished with value: 0.28836937987525724 and parameters: {'min_child_samples': 10}. Best is trial 61 with value: 0.2875371236711693.
min_child_samples, val_score: 0.286510:  80%|########  | 4/5 [01:14<00:18, 18.38s/it]

Early stopping, best iteration is:
[88]	valid_0's binary_logloss: 0.135335	valid_1's binary_logloss: 0.288369
[LightGBM] [Info] Number of positive: 234829, number of negative: 897581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.180820 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37717
[LightGBM] [Info] Number of data points in the train set: 1132410, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207371 -> initscore=-1.340846
[LightGBM] [Info] Start training from score -1.340846
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.13062	valid_1's binary_logloss: 0.286814


min_child_samples, val_score: 0.286510: 100%|##########| 5/5 [01:33<00:00, 18.42s/it][I 2025-12-27 01:48:15,421] Trial 64 finished with value: 0.2867418184367646 and parameters: {'min_child_samples': 100}. Best is trial 64 with value: 0.2867418184367646.
min_child_samples, val_score: 0.286510: 100%|##########| 5/5 [01:33<00:00, 18.62s/it]

Early stopping, best iteration is:
[105]	valid_0's binary_logloss: 0.129069	valid_1's binary_logloss: 0.286742





[LightGBM] [Info] Number of positive: 335599, number of negative: 1282130
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048668 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37917
[LightGBM] [Info] Number of data points in the train set: 1617729, number of used features: 226
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207451 -> initscore=-1.340361
[LightGBM] [Info] Start training from score -1.340361
AUC: 0.968(train), 0.742(test)


In [30]:
#特徴量の重要度
keiba_ai.feature_importance()

Unnamed: 0,features,importance
182,jockey_plc_rate_10_all,142
184,jockey_plc_rate_50_all,123
119,breeder_id,104
188,age_days,96
9,n_horses,70
12,賞金_5R,57
7,体重,50
2,horse_id,47
84,賞金_allR,44
29,着順_race_type_5R,43


In [31]:
#ハイパーパラメータの確認
keiba_ai.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'feature_pre_filter': False,
 'lambda_l1': 5.7934257175117564e-06,
 'lambda_l2': 6.434980996929208e-06,
 'feature_fraction': 0.4,
 'bagging_fraction': 1.0,
 'bagging_freq': 0}

In [None]:
#チューニングしないで学習
#keiba_ai.train_without_tuning()

In [32]:
#モデル保存。models/(実行した日付)/(version_name).pickleに、モデルとデータセットが保存される。
training.KeibaAIFactory.save(keiba_ai, version_name='basemodel_2020_2025')

In [33]:
#モデルロード
keiba_ai = training.KeibaAIFactory.load('models/20251227/basemodel_2020_2025.pickle')
keiba_ai.set_params(keiba_ai.get_params())

# 5. シミュレーション

In [None]:
# 5章（シミュレーション）だけを実行したいときの軽量セットアップ用セル

# カーネル再起動後に 3章・4章を再実行せず、

# 学習済みモデル（UMABAN を特徴量から除外して再学習したもの）と払戻テーブルだけを準備する。



from modules.constants import LocalPaths

from modules import preprocessing, training



# 払戻テーブルの前処理のみ実行（Simulator が参照）

return_processor = preprocessing.ReturnProcessor(filepath=LocalPaths.RAW_RETURN_TABLES_PATH)

In [None]:
%autoreload

In [None]:
#シミュレーターに馬券をセット
simulator = simulation.Simulator(return_processor)

In [None]:
# スコアテーブルを取得
score_table = keiba_ai.calc_score(keiba_ai.datasets.X_test, policies.StdScorePolicy)

## 5.1. 単一threshold

### 5.1.1 単勝馬券

In [None]:
fi = keiba_ai.feature_importance(num_features=300)
fi_j = fi[fi['features'].str.startswith('jockey_')]
fi_j.head(20)


In [None]:
# セルを新規で作って実行
feat = feature_enginnering.featured_data
[c for c in feat.columns if c.startswith('jockey_')], feat.filter(like='jockey_').head()

In [None]:
# 単勝シミュレーション: T_RANGE と actions サイズ、騎手特徴量の確認用デバッグ

T_RANGE = [0.0, 3.5]
print('T_RANGE =', T_RANGE)

print('score_table columns (head):', score_table.columns[:10].tolist())

print('\nscore_table[jockey関連列] の例:')

j_cols = [c for c in score_table.columns if c.startswith('jockey_')]

print(j_cols[:10])

if j_cols:

    display(score_table[j_cols].describe())



print('\nscore 分布:')

display(score_table['score'].describe())



print('\nthreshold=0.0 での actions 件数:')

from modules.policies import BetPolicyTansho

actions_debug = keiba_ai.decide_action(score_table, BetPolicyTansho, threshold=0.0)

print('len(actions_debug) =', len(actions_debug))

In [None]:
import traceback

T_RANGE = [0.0, 3.5]
N_SAMPLES = 100
returns = {}

# スコアテーブルを一度だけ計算しておく
score_table = keiba_ai.calc_score(keiba_ai.datasets.X_test, policies.StdScorePolicy)

# 「馬の勝ちやすさスコア」の閾値を変化させた時の成績を計算
for i in tqdm(range(N_SAMPLES)):
    # T_RANGE の範囲を N_SAMPLES 個に分割し、0.0〜3.5 を両端含めてスイープ
    if N_SAMPLES > 1:
        threshold = T_RANGE[0] + (T_RANGE[1] - T_RANGE[0]) * i / (N_SAMPLES - 1)
    else:
        threshold = T_RANGE[0]
    try:
        # 賭ける馬券を決定
        actions = keiba_ai.decide_action(
            score_table,              # スコアテーブル
            policies.BetPolicyTansho, # 賭け方の方針
            threshold=threshold       # 「馬の勝ちやすさスコア」の閾値
        )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception:
        traceback.print_exc()
        break

returns_df = pd.DataFrame.from_dict(returns, orient='index').sort_index()
returns_df.index.name = 'threshold'

In [None]:
#シミュレーション結果も、models/に保存しておくとわかりやすい。
returns_df.to_pickle('models/20251226/tansho.pickle')

In [None]:
#回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='tansho')

In [None]:
import matplotlib.pyplot as plt

def plot_single_threshold_compare(old_returns_df, returns_df, N_SAMPLES, label1='old_tansho', label2='new_tansho'):
    plt.figure(dpi=100)
    # old_returns_dfの標準偏差で幅をつけて薄くプロット
    plt.fill_between(
        old_returns_df.index,
        y1=old_returns_df['return_rate']-old_returns_df['std'],
        y2=old_returns_df['return_rate']+old_returns_df['std'],
        alpha=0.3
        )
    # old_returns_dfの回収率を実線でプロット
    plt.plot(old_returns_df.index, old_returns_df['return_rate'], label=label1)

    # returns_dfの標準偏差で幅をつけて薄くプロット
    plt.fill_between(
        returns_df.index,
        y1=returns_df['return_rate']-returns_df['std'],
        y2=returns_df['return_rate']+returns_df['std'],
        alpha=0.3
        )
    # returns_dfの回収率を実線でプロット
    plt.plot(returns_df.index, returns_df['return_rate'], label=label2)

    # labelで設定した凡例を表示させる
    plt.legend()
    # グリッドをつける
    plt.grid(True)
    plt.xlabel('threshold')
    plt.ylabel('return_rate')
    plt.show()

In [None]:
old_returns_df = pd.read_pickle('models/20251223/tansho.pickle')

#old_returns_dfとreturns_dfの結果を重ねてプロットして比較
plot_single_threshold_compare(
    old_returns_df, returns_df, N_SAMPLES,
    label1='old_tansho', label2='new_tansho'
)



In [None]:
# score_table['score'].describe() の出力とreturns_df.index.min()/max() と len(returns_df)を貼るコード
print("score_table['score'] の統計情報:")
display(score_table['score'].describe())
print(f"returns_df index min: {returns_df.index.min()}")
print(f"returns_df index max: {returns_df.index.max()}")
print(f"returns_df length: {len(returns_df)}")

### 5.1.2 複勝馬券

In [None]:
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# 「馬の勝ちやすさスコア」の閾値を変化させた時の成績を計算
for i in tqdm(range(N_SAMPLES)):
    # T_RANGEの範囲を、N_SAMPLES等分して、thresholdをfor分で回す
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # 賭ける馬券を決定
        actions = keiba_ai.decide_action(
                score_table, # スコアテーブル
                policies.BetPolicyFukusho, # 賭け方の方針
                threshold=threshold # 「馬の勝ちやすさスコア」の閾値
                )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        print(e)
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# シミュレーション結果も、models/YYYYMMDD/に保存しておくとわかりやすい。
returns_df.to_pickle('models/20251226/fukusho.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='fukusho')

### 5.1.3 馬連BOX

In [None]:
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# 「馬の勝ちやすさスコア」の閾値を変化させた時の成績を計算
for i in tqdm(range(N_SAMPLES)):
    # T_RANGEの範囲を、N_SAMPLES等分して、thresholdをfor分で回す
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # 賭ける馬券を決定
        actions = keiba_ai.decide_action(
                score_table, # スコアテーブル
                policies.BetPolicyUmarenBox, # 賭け方の方針
                threshold=threshold # 「馬の勝ちやすさスコア」の閾値
                )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        print(e)
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# シミュレーション結果も、models/YYYYMMDD/に保存しておくとわかりやすい。
returns_df.to_pickle('models/20251226/umarenbox.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='umarenbox')

### 5.1.4 馬単BOX

In [None]:
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# 「馬の勝ちやすさスコア」の閾値を変化させた時の成績を計算
for i in tqdm(range(N_SAMPLES)):
    # T_RANGEの範囲を、N_SAMPLES等分して、thresholdをfor分で回す
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # 賭ける馬券を決定
        actions = keiba_ai.decide_action(
                score_table, # スコアテーブル
                policies.BetPolicyUmatanBox, # 賭け方の方針
                threshold=threshold # 「馬の勝ちやすさスコア」の閾値
                )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        print(e)
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# シミュレーション結果も、models/YYYYMMDD/に保存しておくとわかりやすい。
returns_df.to_pickle('models/20251226/umatanbox.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='umatanbox')

### 5.1.5 ワイドBOX

In [None]:
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# 「馬の勝ちやすさスコア」の閾値を変化させた時の成績を計算
for i in tqdm(range(N_SAMPLES)):
    # T_RANGEの範囲を、N_SAMPLES等分して、thresholdをfor分で回す
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # 賭ける馬券を決定
        actions = keiba_ai.decide_action(
                score_table, # スコアテーブル
                policies.BetPolicyWideBox, # 賭け方の方針
                threshold=threshold # 「馬の勝ちやすさスコア」の閾値
                )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        print(e)
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# シミュレーション結果も、models/YYYYMMDD/に保存しておくとわかりやすい。
returns_df.to_pickle('models/20251226/widebox.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='widebox')

### 5.1.6 三連複BOX

In [None]:
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# 「馬の勝ちやすさスコア」の閾値を変化させた時の成績を計算
for i in tqdm(range(N_SAMPLES)):
    # T_RANGEの範囲を、N_SAMPLES等分して、thresholdをfor分で回す
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # 賭ける馬券を決定
        actions = keiba_ai.decide_action(
                score_table, # スコアテーブル
                policies.BetPolicySanrenpukuBox, # 賭け方の方針
                threshold=threshold # 「馬の勝ちやすさスコア」の閾値
                )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        print(e)
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# シミュレーション結果も、models/YYYYMMDD/に保存しておくとわかりやすい。
returns_df.to_pickle('models/20251226/sanrenpukubox.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='sanrenpukubox')

### 5.1.7 三連単BOX

In [None]:
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# 「馬の勝ちやすさスコア」の閾値を変化させた時の成績を計算
for i in tqdm(range(N_SAMPLES)):
    # T_RANGEの範囲を、N_SAMPLES等分して、thresholdをfor分で回す
    threshold = T_RANGE[1] * i / N_SAMPLES + T_RANGE[0] * (1 - (i / N_SAMPLES))
    try:
        # 賭ける馬券を決定
        actions = keiba_ai.decide_action(
                score_table, # スコアテーブル
                policies.BetPolicySanrentanBox, # 賭け方の方針
                threshold=threshold # 「馬の勝ちやすさスコア」の閾値
                )
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        print(e)
        break
returns_df = pd.DataFrame.from_dict(returns, orient='index')
returns_df.index.name = 'threshold'

In [None]:
# シミュレーション結果も、models/YYYYMMDD/に保存しておくとわかりやすい。
returns_df.to_pickle('models/20251226/sanrentanbox.pickle')

In [None]:
# 回収率をプロット
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='sanrentanbox')

## 5.2. 複数馬券
未実装

## 5.3. 複数threshold
未実装だが、以下のようなコードになる予定。

In [None]:
T1_RANGE = [2.5, 3.5]
MIN_T2 = 1
N_SAMPLES = 10

returns = {}
#「馬の勝ちやすさスコア」の閾値を変化させた時の成績を計算
idx = 0
for i in tqdm(range(N_SAMPLES)):
    #T_RANGEの範囲を、N_SAMPLES等分して、thresholdをfor分で回す
    threshold1 = T1_RANGE[1] * i / N_SAMPLES + T1_RANGE[0] * (1-(i/N_SAMPLES))
    for j in range(N_SAMPLES):
        #MIN_T2からthreshold1までをN_SAMPLES等分
        threshold2 = threshold1 * j / N_SAMPLES + MIN_T2 * (1-(j/N_SAMPLES))
        try:
            #print(threshold1, threshold2)
            #賭ける馬券を決定
            actions = keiba_ai.decide_action(
                    score_table, # スコアテーブル
                    policies.BetPolicyTanshoFukusho, # 賭け方の方針(未実装)
                    threshold1=threshold1, #「馬の勝ちやすさスコア」の閾値
                    threshold2=threshold2
                    )
            returns[idx] = simulator.calc_returns(actions)
            idx += 1
        except Exception as e:
            print(e)
            break
returns_df = pd.DataFrame.from_dict(returns, orient='index')

In [None]:
simulation.plot_single_threshold(returns_df.reset_index(), 100, label='tansho_fukusho')

# 6. 当日の予想
例として2022年1月8日のレースを実際に予想する場合を考える。  
https://race.netkeiba.com/top/race_list.html?kaisai_date=20220108

## 6.1. 前日準備

In [3]:
%autoreload

In [4]:
race_id_list = preparing.scrape_race_id_list(['20251228']) #レースidを取得
len(race_id_list)

getting race_id_list


  0%|          | 0/1 [00:00<?, ?it/s]

scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20251228


24

In [5]:
#出走するhorse_idの取得
horse_id_list = preparing.scrape_horse_id_list(race_id_list)
len(horse_id_list)

sraping horse_id_list


  0%|          | 0/24 [00:00<?, ?it/s]

356

In [6]:
#horseページのhtmlをスクレイピング
#直近レースが更新されている可能性があるので、skip=Falseにして上書きする
html_files_horse = preparing.scrape_html_horse_with_master(horse_id_list, skip=False)

scraping


  0%|          | 0/356 [00:00<?, ?it/s]

updating master


In [7]:
#horse_infoテーブルの更新
horse_info_20250920 = preparing.get_rawdata_horse_info(html_files_horse)
preparing.update_rawdata(LocalPaths.RAW_HORSE_INFO_PATH, horse_info_20250920)

preparing raw horse_info table


  0%|          | 0/356 [00:00<?, ?it/s]

更新モード: 既存 20820 + 新規 86 = 合計 20906 レコード
データ更新完了: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\raw\horse_info.pickle


Unnamed: 0,生年月日,調教師,馬主,生産者,産地,セリ取引価格,獲得賞金 (中央),獲得賞金 (地方),通算成績,主な勝鞍,近親馬,trainer_id,owner_id,breeder_id,募集情報
2011106610,2011年5月29日,雑賀正光 (高知),岡田牧雄,増本良孝,新ひだか町,-,"7,051万円",114万円,82戦6勝 [6-3-1-72],15'奥尻特別(500万下),バーミーズ、シルバーゲイル,a0043,851009,233302,
2012100683,2012年3月17日,宮川真衣 (高知),レックス,コスモヴューファーム,新冠町,-,2億109万円,229万円,79戦10勝 [10-9-8-52],17'イルミネーションジャンプS(OP),マイネルパッセ、パルクリール,a0457,309803,214514,
2012103532,2012年2月15日,清水久詞 (栗東),サラブレッドクラブ・ラフィアン,真歌田中牧場,新ひだか町,"1,050万円 (2013年 北海道サマーセール)",2億652万円,0万円,66戦8勝 [8-10-6-42],21'ペガサスジャンプS(OP),トータルソッカー、ガリンシャ,01110,546800,733081,1口:36万円/50口
2012104463,2012年3月17日,今野貞一 (栗東),中村政夫,社台ファーム,千歳市,928万円 (2014年 千葉サラブレッド・セール),"2億3,061万円",0万円,59戦9勝 [9-8-5-37],18'福島テレビオープン(OP),デルマトトコ,01128,471033,393126,
2012104504,2012年4月17日,飯田雄三 (栗東),サラブレッドクラブ・ラフィアン,ノーザンファーム,安平町,-,"1億8,539万円",0万円,58戦8勝 [8-8-6-36],19'総武S(OP),ジオフロント、プロレタリアト,01050,546800,373126,1口:22万円/100口
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023103275,2023年3月2日,宮地貴稔 (栗東),近藤英二,三木田牧場,新ひだか町,-,0万円,0万円,0戦0勝 [0-0-0-0],,テイエムタツマキ、ビーハグ,01211,562031,440306,
2023102845,2023年3月15日,斉藤崇史 (栗東),フジイ興産,田中裕之,新ひだか町,"2,310万円 (2024年 北海道セレクションセール)",0万円,0万円,0戦0勝 [0-0-0-0],,トータルクラリティ、エクサビット,01151,851800,333084,
2023102878,2023年3月7日,坂口智康 (栗東),今村明浩,谷岡牧場,新ひだか町,330万円 (2024年 北海道サマーセール),0万円,0万円,0戦0勝 [0-0-0-0],,ナツハヤテ、サハラブレイブ,01170,466033,733085,
2023105679,2023年2月5日,安田翔伍 (栗東),金子真人ホールディングス,白井牧場,日高町,-,0万円,0万円,0戦0勝 [0-0-0-0],,トプシー、フライングレディの2024,01164,708800,033357,


In [8]:
#horse_resultsテーブルの更新
horse_results_20250920 = preparing.get_rawdata_horse_results(html_files_horse)
preparing.update_rawdata(LocalPaths.RAW_HORSE_RESULTS_PATH, horse_results_20250920)

preparing raw horse_results table


  0%|          | 0/356 [00:00<?, ?it/s]

horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023103020.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023103778.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023104845.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023105979.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023101699.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023105654.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023103806.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023102171.bin
horse_re

Unnamed: 0,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,オッズ,...,着差,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金
2011106610,2024/04/29,高知,雨,8.0,りさママ生誕記念特別(C3),,10.0,6.0,7,45.4,...,4.7,**,2-4-6-7,,41.9,477(-1),,,イモータルスモーク,
2011106610,2024/04/14,高知,晴,8.0,C3ー15,,11.0,7.0,8,23.7,...,2.3,**,2-2-2-3,,42.2,478(-1),,,カイラシ,
2011106610,2024/03/27,高知,晴,3.0,C3ー10,,12.0,3.0,3,18.1,...,1.1,**,2-2-2-2,,41.6,479(+3),,,デルマジゾウ,6.0
2011106610,2024/03/20,高知,小雨,2.0,C3ー7,,11.0,7.0,9,11.0,...,0.7,**,7-8-5-6,,41.9,476(0),,,ヤマニンバシリーサ,6.0
2011106610,2024/03/06,高知,曇,3.0,伊予馬事畜産特別(C3),,11.0,6.0,7,11.0,...,1.4,**,3-3-3-4,,39.7,476(-2),,,ララプロフォン,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023107317,2025/09/21,4中山7,晴,5.0,2歳新馬,,10.0,8.0,9,3.8,...,0.5,**,4-4-4-4,38.2-34.3,34.4,442(0),,,フロレセール,300.0
2023107045,2025/11/30,5東京8,晴,6.0,2歳新馬,,12.0,1.0,1,6.5,...,0.5,**,7-7-6,36.3-33.6,33.3,446(0),,,ゴーラッキー,110.0
2023103090,2025/11/30,5東京8,晴,5.0,2歳新馬,,16.0,3.0,6,7.0,...,2.8,**,15-15,35.2-39.0,39.3,472(0),,,エジプシャンマウ,
2023101928,2025/11/29,5東京7,晴,4.0,2歳新馬,,16.0,8.0,16,19.0,...,2.3,**,12-12,35.8-37.2,38.2,472(0),,,オクトーバーナイン,


In [9]:
#更新後のhorse_infoテーブルの確認
horse_info_processor = preprocessing.HorseInfoProcessor(filepath=LocalPaths.RAW_HORSE_INFO_PATH)
display(horse_info_processor.raw_data.tail())
len(horse_info_processor.raw_data)

Unnamed: 0,生年月日,調教師,馬主,生産者,産地,セリ取引価格,獲得賞金 (中央),獲得賞金 (地方),通算成績,主な勝鞍,近親馬,trainer_id,owner_id,breeder_id,募集情報
2023103275,2023年3月2日,宮地貴稔 (栗東),近藤英二,三木田牧場,新ひだか町,-,0万円,0万円,0戦0勝 [0-0-0-0],,テイエムタツマキ、ビーハグ,1211,562031,440306,
2023102845,2023年3月15日,斉藤崇史 (栗東),フジイ興産,田中裕之,新ひだか町,"2,310万円 (2024年 北海道セレクションセール)",0万円,0万円,0戦0勝 [0-0-0-0],,トータルクラリティ、エクサビット,1151,851800,333084,
2023102878,2023年3月7日,坂口智康 (栗東),今村明浩,谷岡牧場,新ひだか町,330万円 (2024年 北海道サマーセール),0万円,0万円,0戦0勝 [0-0-0-0],,ナツハヤテ、サハラブレイブ,1170,466033,733085,
2023105679,2023年2月5日,安田翔伍 (栗東),金子真人ホールディングス,白井牧場,日高町,-,0万円,0万円,0戦0勝 [0-0-0-0],,トプシー、フライングレディの2024,1164,708800,33357,
2023105810,2023年3月17日,清水久詞 (栗東),中村祐子,ケイアイファーム,新ひだか町,-,0万円,0万円,0戦0勝 [0-0-0-0],,デュアラブル、ハーディネス,1110,373008,253025,


20906

In [10]:
#更新後のhorse_resultsテーブルの確認
horse_results_processor = preprocessing.HorseResultsProcessor(filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
display(horse_results_processor.raw_data.tail())
len(horse_results_processor.raw_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[Cols.PRIZE].fillna(0, inplace=True)


Unnamed: 0,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,オッズ,...,着差,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金
2023107317,2025/09/21,4中山7,晴,5.0,2歳新馬,,10.0,8.0,9,3.8,...,0.5,**,4-4-4-4,38.2-34.3,34.4,442(0),,,フロレセール,300.0
2023107045,2025/11/30,5東京8,晴,6.0,2歳新馬,,12.0,1.0,1,6.5,...,0.5,**,7-7-6,36.3-33.6,33.3,446(0),,,ゴーラッキー,110.0
2023103090,2025/11/30,5東京8,晴,5.0,2歳新馬,,16.0,3.0,6,7.0,...,2.8,**,15-15,35.2-39.0,39.3,472(0),,,エジプシャンマウ,
2023101928,2025/11/29,5東京7,晴,4.0,2歳新馬,,16.0,8.0,16,19.0,...,2.3,**,12-12,35.8-37.2,38.2,472(0),,,オクトーバーナイン,
2023105795,2025/11/29,5東京7,晴,5.0,2歳新馬,,13.0,5.0,6,23.4,...,0.3,**,2-2,36.3-34.0,34.3,436(0),,,ヴァロアーク,190.0


17103796

In [11]:
#pedsテーブルの更新
html_files_peds = preparing.scrape_html_ped(horse_id_list, skip=False)
peds_20250920 = preparing.get_rawdata_peds(html_files_peds)
preparing.update_rawdata(LocalPaths.RAW_PEDS_PATH, peds_20250920)

  0%|          | 0/356 [00:00<?, ?it/s]

preparing raw peds table


  0%|          | 0/356 [00:00<?, ?it/s]

更新モード: 既存 35758 + 新規 69 = 合計 35827 レコード
データ更新完了: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\raw\peds.pickle


Unnamed: 0,peds_0,peds_1,peds_2,peds_3,peds_4,peds_5,peds_6,peds_7,peds_8,peds_9,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
2011106610,1999100226,000a000082,000a0012cb,000a000f2b,000a001042,000a0078a6,000a007c38,000a000f87,000a007d0b,000a00877c,...,000a008e05,000a000ded,000a008e04,000a00ae88,000a000e32,000a000e03,000a007ccf,000a007259,000a000de4,000a007b7d
2012100683,2003104570,000a00033a,000a0012bf,000a000f2b,000a001042,000a0078a6,000a007459,000a0013f0,000a0072a0,000a008c1e,...,000a0082aa,000a001383,000a007cea,1982105011,000a000081,000a000e03,000a007054,1955101622,000a00026d,000a0031be
2012103532,1994108729,000a00033a,000a0012bf,000a000f2b,000a001042,000a0078a6,000a007459,000a0013f0,000a0072a0,000a008c1e,...,000a009232,000a001bd8,000a009231,1990109129,000a001b87,000a0000d3,000a00909c,000a006409,000a000e0e,000a007ca0
2012104463,2003102205,000a000d77,000a00185d,000a000e04,000a000f8c,000a00702e,000a008892,000a001183,000a0081e9,000a009851,...,000a008eb9,000a000dda,000a0216c4,000a00fa9e,000a0016db,000a000e04,000a00836d,000a00fa9f,000a000db7,000a00faa0
2012104504,1999106689,000a00033a,000a0012bf,000a000f2b,000a001042,000a0078a6,000a007459,000a0013f0,000a0072a0,000a008c1e,...,000a009de9,000a0012be,000a0086f3,000a00a2b0,000a00193e,000a0010e2,000a007e1d,000a00a2af,000a000ded,000a00a2ae
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023103275,2011103565,1992109618,000a00033a,000a0012bf,000a000f2b,000a007459,000a008c1e,000a0019b6,000a008c1d,000a00021e,...,000a006409,000a000e0e,000a007ca0,000a000120,000a000e05,000a000e04,000a00703e,000a009713,000a001cfd,000a009010
2023102845,000a01aa3d,000a011701,000a002071,000a001cd0,000a001702,000a00902d,000a009fd9,000a0010ed,000a00945e,000a011724,...,000a00a946,000a00184d,000a00a945,000a00030d,000a0015fc,000a001598,000a008080,000a00b06b,000a001607,000a009dff
2023102878,2013104704,2002100816,000a00033a,000a0012bf,000a000f2b,000a007459,000a008c1e,000a0019b6,000a008c1d,000a0003a2,...,000a008e05,000a000ded,000a008e04,000a010ba1,000a001607,000a000e46,000a007e0c,000a0081ee,000a000e0e,000a007227
2023105679,2010104298,2001103460,000a001d7e,000a001607,000a000e46,000a007e0c,000a009961,000a001676,000a0084c9,000a00680e,...,000a00a946,000a00184d,000a00a945,000a000305,000a0019b4,000a0012cb,000a008c0e,000a008d37,000a000eae,000a008320


In [12]:
#processorの更新
horse_info_processor = preprocessing.HorseInfoProcessor(
    filepath=LocalPaths.RAW_HORSE_INFO_PATH)
horse_results_processor = preprocessing.HorseResultsProcessor(
    filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
peds_processor = preprocessing.PedsProcessor(filepath=LocalPaths.RAW_PEDS_PATH)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[Cols.PRIZE].fillna(0, inplace=True)


In [13]:
#モデルの準備
keiba_ai = training.KeibaAIFactory.load('models/20251227/basemodel_2020_2025.pickle')

## 6.2. 前日全レース予想

In [18]:
# 変更点: %autoreload がWindowsの既定エンコーディング(cp932)で読み込み失敗することがあるため、明示的にreload
import importlib
import modules.policies._score_policy as _score_policy_mod
import modules.policies as _policies_mod

importlib.reload(_score_policy_mod)
importlib.reload(_policies_mod)

from modules import policies as policies
score_policy = policies.StdScorePolicy
print('reloaded policies (explicit)')

reloaded policies (explicit)


In [14]:
# 前日全レース予想用のレースidとレース発走時刻を取得
target_race_id_list, target_race_time_list = preparing.scrape_race_id_race_time_list('20251228')
print(len(target_race_id_list))
print(len(target_race_time_list))
yesterday = True

getting race_id_list
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20251228
24
24


In [16]:
# ターゲットエンコーディング時に「馬の成績」として扱う項目
TARGET_COLS = [
        HorseResultsCols.RANK,
        HorseResultsCols.PRIZE,
        HorseResultsCols.RANK_DIFF, 
        'first_corner',
        'final_corner',
        'first_to_rank',
        'first_to_final',
        'final_to_rank',
        'time_seconds'
        ]
# horse_id列と共に、ターゲットエンコーディングの対象にする列
GROUP_COLS = [
        'course_len',
        'race_type',
        HorseResultsCols.PLACE
        ]

In [21]:
# --- 診断: 特徴量の一致率とスコアの分散を1レースで確認 ---
import numpy as np
import pandas as pd

# 例として最初のレースを対象（必要なら race_id をここで上書き）
race_id_debug = target_race_id_list[10]
race_time_debug = target_race_time_list[10]
filepath = 'data/tmp/shutuba_debug.pickle'
today = '2025/12/27'  # 前日予測の取得日（実運用に合わせる）

preparing.scrape_shutuba_table(race_id_debug, today, filepath)

if yesterday:
    pd2 = pd.read_pickle(filepath)
    pd2[ResultsCols.WEIGHT_AND_DIFF] = '0(0)'
    if 'weather' not in pd2.columns or pd2['weather'].isnull().all():
        pd2['weather'] = '晴'
    if 'ground_state' not in pd2.columns or pd2['ground_state'].isnull().all():
        pd2['ground_state'] = '良'
    pd2.to_pickle(filepath)

shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath)
shutuba_data_merger = preprocessing.ShutubaDataMerger(
    shutuba_table_processor,
    horse_results_processor,
    horse_info_processor,
    peds_processor,
    target_cols=TARGET_COLS,
    group_cols=GROUP_COLS,
 )
shutuba_data_merger.merge()

feature_enginnering_shutuba = preprocessing.FeatureEngineering(shutuba_data_merger) \
    .add_interval()\
    .add_agedays()\
    .dumminize_ground_state()\
    .dumminize_race_type()\
    .dumminize_sex()\
    .dumminize_weather()\
    .encode_horse_id()\
    .encode_jockey_id()\
    .encode_trainer_id()\
    .encode_owner_id()\
    .encode_breeder_id()\
    .dumminize_kaisai()\
    .dumminize_around()\
    .dumminize_race_class()

X_debug = feature_enginnering_shutuba.featured_data.drop(['date'], axis=1)
score_debug = keiba_ai.calc_score(X_debug, score_policy)
print('debug race_id:', race_id_debug, 'time:', race_time_debug)
print('score nunique:', score_debug['score'].nunique())
display(score_debug.sort_values('score', ascending=False).head(16))

スクレイピング完了 - レース202506050811: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'エキサイトバイオ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'エキサイトバイオ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['エキサイトバイオ', '']
  除去レコード 202506050811: 馬番='エキサイトバイオ', 体重='nan'
  除去レコード 202506050811: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

debug race_id: 202506050811 time: 15:40
score nunique: 16


Unnamed: 0,race_id,馬番,score
11,202506050811,12,1.227303
3,202506050811,4,1.144898
4,202506050811,5,0.987872
2,202506050811,3,0.849684
14,202506050811,15,0.771486
1,202506050811,2,0.760109
15,202506050811,16,0.277445
0,202506050811,1,0.241748
12,202506050811,13,0.209264
7,202506050811,8,0.075896


In [22]:
# 一時的に出馬表を保存するパスを指定
filepath = 'data/tmp/shutuba.pickle'
today = '2022/12/27'

for race_id, race_time in zip(target_race_id_list, target_race_time_list):
    # 出馬表の取得
    preparing.scrape_shutuba_table(race_id, today, filepath)

    # 前日予想の場合
    if yesterday:
        # 前日予想の場合、馬体重を0（0）に補正
        pd2 = pd.read_pickle(filepath)
        pd2[ResultsCols.WEIGHT_AND_DIFF] = '0(0)'
        # 前日予想の場合、天候と馬場状態が公開されていない場合はこちらを有効にする
        #pd2['weather'] = '晴'
        #pd2['ground_state'] = '良'
        pd2.to_pickle(filepath)

    # 出馬表の加工
    shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath)

    # テーブルのマージ
    shutuba_data_merger = preprocessing.ShutubaDataMerger(
        shutuba_table_processor,
        horse_results_processor,
        horse_info_processor,
        peds_processor,
        target_cols=TARGET_COLS,
        group_cols=GROUP_COLS
    )
    shutuba_data_merger.merge()

    # 特徴量エンジニアリング
    feature_enginnering_shutuba = preprocessing.FeatureEngineering(shutuba_data_merger) \
        .add_interval()\
        .add_agedays()\
        .dumminize_ground_state()\
        .dumminize_race_type()\
        .dumminize_sex()\
        .dumminize_weather()\
        .encode_horse_id()\
        .encode_jockey_id()\
        .encode_trainer_id()\
        .encode_owner_id()\
        .encode_breeder_id()\
        .dumminize_kaisai()\
        .dumminize_around()\
        .dumminize_race_class()

    # 予測
    X = feature_enginnering_shutuba.featured_data.drop(['date'], axis=1)

    # 当日の出走情報テーブル（前処理前）
    df_tmp = shutuba_table_processor.raw_data[:1]

    i = 0
    for num in list(Master.PLACE_DICT.values()):
        if num == race_id[4:6]:
            print(list(Master.PLACE_DICT)[i] + race_id[10:12] + 'R ' + race_time + '発走 ' + str(df_tmp.iat[0, 12])
                + str(df_tmp.iat[0, 10]) + 'm ' + str(df_tmp.iat[0, 13]) + ' ' + str(df_tmp.iat[0, 15]))
            break
        i += 1

    print(keiba_ai.calc_score(X, policies.StdScorePolicy).sort_values('score', ascending=False))

スクレイピング完了 - レース202506050801: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'スケダチムヨウ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'スケダチムヨウ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['スケダチムヨウ', '']
  除去レコード 202506050801: 馬番='スケダチムヨウ', 体重='nan'
  除去レコード 202506050801: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山01R 09:50発走 ダート未勝利m 右 稍重
         race_id  馬番     score
3   202506050801   4  1.274665
5   202506050801   6  0.745866
13  202506050801  14  0.602869
1   202506050801   2  0.536343
14  202506050801  15  0.363335
10  202506050801  11  0.341452
9   202506050801  10  0.334541
12  202506050801  13  0.298045
4   202506050801   5  0.222207
7   202506050801   8 -0.015879
8   202506050801   9 -0.025974
2   202506050801   3 -0.114913
0   202506050801   1 -0.202890
6   202506050801   7 -0.374073
11  202506050801  12 -0.664007
15  202506050801  16 -3.321587
スクレイピング完了 - レース202506050802: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'マランドロ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'マランドロ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['マランドロ', '']
  除去レコード 202506050802: 馬番='マランドロ', 体重='nan'
  除去レコード 202506050802: 馬番='', 体重='nan'
ShutubaTableProcesso

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山02R 10:20発走 ダート未勝利m 右 稍重
         race_id  馬番     score
1   202506050802   2  0.956712
2   202506050802   3  0.905863
10  202506050802  11  0.672557
11  202506050802  12  0.500465
4   202506050802   5  0.499296
12  202506050802  13  0.427781
14  202506050802  15  0.249921
0   202506050802   1  0.113794
15  202506050802  16  0.084849
8   202506050802   9  0.078240
13  202506050802  14  0.046916
7   202506050802   8 -0.034180
9   202506050802  10 -0.072059
5   202506050802   6 -0.510799
3   202506050802   4 -0.529620
6   202506050802   7 -3.389737
スクレイピング完了 - レース202506050803: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ミスチヴマリアンヌ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ミスチヴマリアンヌ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ミスチヴマリアンヌ', '']
  除去レコード 202506050803: 馬番='ミスチヴマリアンヌ', 体重='nan'
  除去レコード 202506050803: 馬番='', 体重='nan'
Shut

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山03R 10:50発走 芝未勝利m 右 良
         race_id  馬番     score
6   202506050803   7  0.883362
4   202506050803   5  0.859126
0   202506050803   1  0.670868
12  202506050803  13  0.647918
11  202506050803  12  0.597201
3   202506050803   4  0.426883
2   202506050803   3  0.415613
14  202506050803  15  0.284061
1   202506050803   2  0.260182
13  202506050803  14  0.162936
7   202506050803   8 -0.115084
15  202506050803  16 -0.124953
8   202506050803   9 -0.413474
5   202506050803   6 -0.450457
9   202506050803  10 -0.847872
10  202506050803  11 -3.256310
スクレイピング完了 - レース202506050804: 13頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', 'ドングラミ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', 'ドングラミ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ドングラミ', '']
  除去レコード 202506050804: 馬番='ドングラミ', 体重='nan'
  除去レコード 202506050804: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（11件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
sep

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山04R 11:20発走 右1200m 晴 1勝クラス
         race_id  馬番     score
6   202506050804   7  1.235152
4   202506050804   5  0.950275
8   202506050804   9  0.739119
2   202506050804   3  0.288011
10  202506050804  11  0.203785
5   202506050804   6 -0.071976
1   202506050804   2 -0.072235
3   202506050804   4 -0.100521
0   202506050804   1 -0.210799
7   202506050804   8 -0.373052
9   202506050804  10 -2.587759
スクレイピング完了 - レース202506050805: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ノーチェ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ノーチェ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ノーチェ', '']
  除去レコード 202506050805: 馬番='ノーチェ', 体重='nan'
  除去レコード 202506050805: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山05R 11:50発走 ダート新馬m 右 稍重
         race_id  馬番     score
8   202506050805   9  0.853407
3   202506050805   4  0.780102
0   202506050805   1  0.680001
13  202506050805  14  0.626622
2   202506050805   3  0.470611
11  202506050805  12  0.395090
15  202506050805  16  0.361436
9   202506050805  10  0.338389
1   202506050805   2  0.237534
5   202506050805   6  0.087082
6   202506050805   7  0.050246
4   202506050805   5 -0.014670
10  202506050805  11 -0.348352
7   202506050805   8 -0.355563
12  202506050805  13 -0.814082
14  202506050805  15 -3.347853
スクレイピング完了 - レース202506050806: 20頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', 'アルデナイン', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', 'アルデナイン', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['アルデナイン', '']
  除去レコード 202506050806: 馬番='アルデナイン', 体重='nan'
  除去レコード 202506050806: 馬番='', 体重

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山06R 12:40発走 芝新馬m 右 良
         race_id  馬番     score
8   202506050806   9  1.039697
7   202506050806   8  0.987943
0   202506050806   1  0.930876
4   202506050806   5  0.896188
9   202506050806  10  0.872210
12  202506050806  13  0.864365
17  202506050806  18  0.756939
10  202506050806  11  0.732837
13  202506050806  14  0.551051
11  202506050806  12  0.440887
15  202506050806  16  0.309331
5   202506050806   6 -1.080019
16  202506050806  17 -1.151942
3   202506050806   4 -1.187985
2   202506050806   3 -1.224018
6   202506050806   7 -1.232179
14  202506050806  15 -1.249296
1   202506050806   2 -1.256886
スクレイピング完了 - レース202506050807: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'アイファーリーベン', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'アイファーリーベン', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['アイファーリーベン', '']
  除去レコード 202506050807: 馬番='アイファーリ

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山07R 13:10発走 右1200m 晴 1勝クラス
         race_id  馬番     score
11  202506050807  12  1.491520
9   202506050807  10  0.835755
15  202506050807  16  0.712908
6   202506050807   7  0.666615
2   202506050807   3  0.617245
4   202506050807   5  0.596690
12  202506050807  13  0.511179
13  202506050807  14  0.207201
1   202506050807   2  0.081390
7   202506050807   8 -0.158446
10  202506050807  11 -0.193926
5   202506050807   6 -0.295094
8   202506050807   9 -0.356856
0   202506050807   1 -0.692749
3   202506050807   4 -1.228986
14  202506050807  15 -2.794446
スクレイピング完了 - レース202506050808: 15頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', 'ジューンエオス', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', 'ジューンエオス', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ジューンエオス', '']
  除去レコード 202506050808: 馬番='ジューンエオス', 体重='nan'
  除去レコード 202506050808: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（13件のレコード）
S

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山08R 13:40発走 右1800m 晴 2勝クラス
         race_id  馬番     score
9   202506050808  10  1.075597
5   202506050808   6  0.903302
0   202506050808   1  0.664352
4   202506050808   5  0.449411
8   202506050808   9  0.339693
6   202506050808   7  0.261878
2   202506050808   3  0.252026
1   202506050808   2  0.228597
3   202506050808   4  0.126150
11  202506050808  12 -0.127346
12  202506050808  13 -0.301603
10  202506050808  11 -1.080297
7   202506050808   8 -2.791759
スクレイピング完了 - レース202506050809: 16頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', 'ボンドロア', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', 'ボンドロア', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ボンドロア', '']
  除去レコード 202506050809: 馬番='ボンドロア', 体重='nan'
  除去レコード 202506050809: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（14件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山09R 14:15発走 右2500m 晴 2勝クラス
         race_id  馬番     score
2   202506050809   3  1.824690
10  202506050809  11  1.007975
7   202506050809   8  0.500107
6   202506050809   7  0.481744
11  202506050809  12  0.355761
1   202506050809   2  0.300783
4   202506050809   5  0.166403
5   202506050809   6  0.146153
9   202506050809  10 -0.186663
8   202506050809   9 -0.220044
0   202506050809   1 -0.255919
3   202506050809   4 -0.524029
13  202506050809  14 -1.086515
12  202506050809  13 -2.510447
スクレイピング完了 - レース202506050810: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ニシノコニャック', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ニシノコニャック', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ニシノコニャック', '']
  除去レコード 202506050810: 馬番='ニシノコニャック', 体重='nan'
  除去レコード 202506050810: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: 

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山10R 14:50発走 右1200m 晴 3勝クラス
         race_id  馬番     score
0   202506050810   1  1.607767
12  202506050810  13  1.011835
6   202506050810   7  0.559636
4   202506050810   5  0.455578
5   202506050810   6  0.385318
8   202506050810   9  0.213748
11  202506050810  12  0.213007
2   202506050810   3  0.129405
9   202506050810  10  0.077670
10  202506050810  11 -0.094334
7   202506050810   8 -0.107509
15  202506050810  16 -0.117534
3   202506050810   4 -0.174504
14  202506050810  15 -0.411159
13  202506050810  14 -0.580781
1   202506050810   2 -3.168145
スクレイピング完了 - レース202506050811: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'エキサイトバイオ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'エキサイトバイオ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['エキサイトバイオ', '']
  除去レコード 202506050811: 馬番='エキサイトバイオ', 体重='nan'
  除去レコード 202506050811: 馬番='', 体重='nan'
Shutub

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山11R 15:40発走 右2500m 晴 G1
         race_id  馬番     score
14  202506050811  15  1.291923
11  202506050811  12  1.061950
15  202506050811  16  0.839934
7   202506050811   8  0.798612
4   202506050811   5  0.589616
5   202506050811   6  0.561567
2   202506050811   3  0.347552
0   202506050811   1  0.250084
3   202506050811   4  0.184941
1   202506050811   2  0.124165
6   202506050811   7 -0.116294
9   202506050811  10 -0.247378
12  202506050811  13 -0.290759
10  202506050811  11 -1.590740
13  202506050811  14 -1.771095
8   202506050811   9 -2.034078
スクレイピング完了 - レース202506050812: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ペリファーニア', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ペリファーニア', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ペリファーニア', '']
  除去レコード 202506050812: 馬番='ペリファーニア', 体重='nan'
  除去レコード 202506050812: 馬番='', 体重='nan'
ShutubaTableP

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山12R 16:25発走 右1600m 晴 3勝クラス
         race_id  馬番     score
12  202506050812  13  1.815496
7   202506050812   8  0.789258
0   202506050812   1  0.736610
14  202506050812  15  0.529309
15  202506050812  16  0.498774
1   202506050812   2  0.279770
9   202506050812  10  0.230967
2   202506050812   3  0.113652
5   202506050812   6  0.063055
11  202506050812  12  0.041879
13  202506050812  14 -0.068247
8   202506050812   9 -0.287047
6   202506050812   7 -0.564616
10  202506050812  11 -0.576746
3   202506050812   4 -0.653634
4   202506050812   5 -2.948479
スクレイピング完了 - レース202509050801: 15頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', 'キングズテイル', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', 'キングズテイル', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['キングズテイル', '']
  除去レコード 202509050801: 馬番='キングズテイル', 体重='nan'
  除去レコード 202509050801: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（13件のレコード）
S

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神01R 09:35発走 ダート未勝利m 右 稍重
         race_id  馬番     score
10  202509050801  11  0.784030
7   202509050801   8  0.727501
6   202509050801   7  0.664186
9   202509050801  10  0.653756
8   202509050801   9  0.521634
12  202509050801  13  0.499823
1   202509050801   2  0.447248
0   202509050801   1  0.269956
5   202509050801   6 -0.229260
11  202509050801  12 -0.343926
3   202509050801   4 -0.496317
4   202509050801   5 -0.597082
2   202509050801   3 -2.901549
スクレイピング完了 - レース202509050802: 19頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', 'スイートポメロ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', 'スイートポメロ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['スイートポメロ', '']
  除去レコード 202509050802: 馬番='スイートポメロ', 体重='nan'
  除去レコード 202509050802: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（17件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating ho

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神02R 10:05発走 芝未勝利m 右 良
         race_id  馬番     score
14  202509050802  15  1.138575
11  202509050802  12  0.911090
4   202509050802   5  0.821736
10  202509050802  11  0.627679
6   202509050802   7  0.526376
13  202509050802  14  0.420326
12  202509050802  13  0.404287
9   202509050802  10  0.287186
7   202509050802   8  0.163312
1   202509050802   2  0.152382
16  202509050802  17  0.099186
5   202509050802   6  0.050109
3   202509050802   4 -0.207172
15  202509050802  16 -0.253350
0   202509050802   1 -0.271540
2   202509050802   3 -2.401603
8   202509050802   9 -2.468579
スクレイピング完了 - レース202509050803: 20頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', 'ストームゲイル', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', 'ストームゲイル', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ストームゲイル', '']
  除去レコード 202509050803: 馬番='ストームゲイル', 体重='nan'

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神03R 10:35発走 芝未勝利m 右 良
         race_id  馬番     score
7   202509050803   8  1.185983
12  202509050803  13  0.984500
15  202509050803  16  0.768356
8   202509050803   9  0.750448
1   202509050803   2  0.639334
6   202509050803   7  0.593758
4   202509050803   5  0.578330
14  202509050803  15  0.549843
3   202509050803   4  0.306635
9   202509050803  10  0.277981
5   202509050803   6  0.133699
0   202509050803   1  0.123327
13  202509050803  14  0.114336
10  202509050803  11 -0.125850
16  202509050803  17 -1.693045
11  202509050803  12 -1.705090
17  202509050803  18 -1.713997
2   202509050803   3 -1.768547
スクレイピング完了 - レース202509050804: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'シュクルリー', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'シュクルリー', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['シュクルリー', '']
  除去レコード 202509050804: 馬番='シュクルリー', 体重='n

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神04R 11:05発走 ダート新馬m 右 稍重
         race_id  馬番     score
14  202509050804  15  0.974204
0   202509050804   1  0.765544
8   202509050804   9  0.739838
3   202509050804   4  0.675929
10  202509050804  11  0.465955
9   202509050804  10  0.226209
4   202509050804   5  0.191068
1   202509050804   2  0.094999
12  202509050804  13  0.089794
2   202509050804   3  0.040400
11  202509050804  12  0.007374
5   202509050804   6 -0.022382
7   202509050804   8 -0.080849
15  202509050804  16 -0.159589
6   202509050804   7 -0.577615
13  202509050804  14 -3.430877
スクレイピング完了 - レース202509050805: 14頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']
すべての馬番が有効です
ShutubaTableProcessor: 馬番クリーンアップ開始（14件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神05R 11:35発走 2970障害m 晴 障害
         race_id  馬番     score
7   202509050805   8  1.558386
9   202509050805  10  1.119427
13  202509050805  14  0.638212
2   202509050805   3  0.575317
1   202509050805   2  0.525122
0   202509050805   1  0.220387
8   202509050805   9  0.108509
12  202509050805  13  0.091575
5   202509050805   6  0.081611
6   202509050805   7 -0.303117
3   202509050805   4 -0.595482
10  202509050805  11 -0.705578
11  202509050805  12 -0.710606
4   202509050805   5 -2.603764
スクレイピング完了 - レース202509050806: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'カステッロトゥーレ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'カステッロトゥーレ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['カステッロトゥーレ', '']
  除去レコード 202509050806: 馬番='カステッロトゥーレ', 体重='nan'
  除去レコード 202509050806: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神06R 12:25発走 芝新馬m 右 良
         race_id  馬番     score
9   202509050806  10  0.850963
14  202509050806  15  0.810479
10  202509050806  11  0.625763
11  202509050806  12  0.564545
5   202509050806   6  0.563171
3   202509050806   4  0.560526
2   202509050806   3  0.458541
4   202509050806   5  0.264819
15  202509050806  16  0.242360
1   202509050806   2  0.145451
6   202509050806   7  0.051377
8   202509050806   9  0.013754
13  202509050806  14 -0.072555
12  202509050806  13 -0.207856
7   202509050806   8 -2.415627
0   202509050806   1 -2.455710
スクレイピング完了 - レース202509050807: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'メトロポリターナ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'メトロポリターナ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['メトロポリターナ', '']
  除去レコード 202509050807: 馬番='メトロポリターナ', 体重='nan'
  除去レコード 202509050807: 馬番='', 体重='nan'
ShutubaTable

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神07R 12:55発走 右1400m 晴 1勝クラス
         race_id  馬番     score
13  202509050807  14  1.087018
4   202509050807   5  1.008512
5   202509050807   6  0.916567
8   202509050807   9  0.747918
10  202509050807  11  0.449739
3   202509050807   4  0.239768
2   202509050807   3  0.231519
6   202509050807   7  0.084844
11  202509050807  12  0.010942
1   202509050807   2 -0.054838
0   202509050807   1 -0.095879
15  202509050807  16 -0.129399
14  202509050807  15 -0.156427
7   202509050807   8 -0.187953
12  202509050807  13 -0.987249
9   202509050807  10 -3.165082
スクレイピング完了 - レース202509050808: 15頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', 'ジョウショーパワー', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', 'ジョウショーパワー', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ジョウショーパワー', '']
  除去レコード 202509050808: 馬番='ジョウショーパワー', 体重='nan'
  除去レコード 202509050808: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（13件

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神08R 13:25発走 右1800m 晴 2勝クラス
         race_id  馬番     score
7   202509050808   8  1.208400
6   202509050808   7  0.654199
1   202509050808   2  0.494803
3   202509050808   4  0.448231
4   202509050808   5  0.349038
9   202509050808  10  0.314615
10  202509050808  11  0.305892
2   202509050808   3  0.026522
8   202509050808   9  0.013828
5   202509050808   6 -0.064326
0   202509050808   1 -0.327589
11  202509050808  12 -0.412045
12  202509050808  13 -3.011568
スクレイピング完了 - レース202509050809: 14頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'クイックバイオ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'クイックバイオ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['クイックバイオ', '']
  除去レコード 202509050809: 馬番='クイックバイオ', 体重='nan'
  除去レコード 202509050809: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（12件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神09R 14:00発走 右1600m 晴 2勝クラス
         race_id  馬番     score
1   202509050809   2  1.623542
2   202509050809   3  0.619943
5   202509050809   6  0.497482
11  202509050809  12  0.469463
8   202509050809   9  0.458966
9   202509050809  10  0.336695
0   202509050809   1  0.328097
3   202509050809   4  0.145006
6   202509050809   7  0.112501
4   202509050809   5 -1.419749
7   202509050809   8 -1.574931
10  202509050809  11 -1.597016
スクレイピング完了 - レース202509050810: 13頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', 'ピンクジン', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', 'ピンクジン', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ピンクジン', '']
  除去レコード 202509050810: 馬番='ピンクジン', 体重='nan'
  除去レコード 202509050810: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（11件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神10R 14:35発走 右2000m 晴 3勝クラス
         race_id  馬番     score
4   202509050810   5  1.511351
3   202509050810   4  1.290976
9   202509050810  10  0.590681
1   202509050810   2  0.544919
5   202509050810   6  0.238781
0   202509050810   1  0.005812
6   202509050810   7 -0.178405
8   202509050810   9 -0.195849
2   202509050810   3 -0.770734
10  202509050810  11 -1.333300
7   202509050810   8 -1.704231
スクレイピング完了 - レース202509050811: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'コンティノアール', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'コンティノアール', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['コンティノアール', '']
  除去レコード 202509050811: 馬番='コンティノアール', 体重='nan'
  除去レコード 202509050811: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神11R 15:15発走 右1400m 晴 オープン
         race_id  馬番     score
15  202509050811  16  1.376757
0   202509050811   1  1.236019
5   202509050811   6  0.960612
14  202509050811  15  0.944448
11  202509050811  12  0.834875
13  202509050811  14  0.676542
6   202509050811   7  0.387383
9   202509050811  10  0.149063
12  202509050811  13 -0.173132
4   202509050811   5 -0.317355
7   202509050811   8 -0.592722
2   202509050811   3 -0.633342
8   202509050811   9 -0.659598
3   202509050811   4 -0.866129
1   202509050811   2 -1.012065
10  202509050811  11 -2.311357
スクレイピング完了 - レース202509050812: 12頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'ヒーローインチーフ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'ヒーローインチーフ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ヒーローインチーフ', '']
  除去レコード 202509050812: 馬番='ヒーローインチーフ', 体重='nan'
  除去レコード 202509050812: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（10件のレコード）
ShutubaTableProcessor: すべての馬番が

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神12R 16:05発走 右芝m 晴 3勝クラス
        race_id  馬番     score
2  202509050812   3  1.500087
6  202509050812   7  0.603158
0  202509050812   1  0.455724
3  202509050812   4  0.339084
5  202509050812   6  0.255268
7  202509050812   8  0.178277
1  202509050812   2 -0.063610
9  202509050812  10 -0.297264
8  202509050812   9 -0.630575
4  202509050812   5 -2.340149


In [53]:
# ============================================================================
# 全レース前日予測（特徴量整列は keiba_ai.calc_score に委譲）
# ============================================================================

import time
import numpy as np
import pandas as pd

print("=== 全レース前日予測開始 ===")
print(f"対象レース数: {len(target_race_id_list)}")
print(f"前日予想モード: {'ON' if yesterday else 'OFF'}")
print("=" * 50)

# 一時的に出馬表を保存するパスを指定
filepath = 'data/tmp/shutuba.pickle'
today = '2025/12/26'

# 全レースの予測結果を格納
all_predictions = {}
error_count = 0

for idx, (race_id, race_time) in enumerate(zip(target_race_id_list, target_race_time_list), 1):
    try:
        print(f"\n[{idx}/{len(target_race_id_list)}] レース処理中: {race_id}")
        
        # サーバー負荷軽減（必須）
        time.sleep(1)
        
        # 出馬表の取得
        preparing.scrape_shutuba_table(race_id, today, filepath)

        # 前日予想の場合
        if yesterday:
            # 前日予想の場合、馬体重を0（0）に補正
            pd2 = pd.read_pickle(filepath)
            pd2[ResultsCols.WEIGHT_AND_DIFF] = '0(0)'
            # 前日予想の場合、天候と馬場状態が公開されていない場合はデフォルト値を設定
            if 'weather' not in pd2.columns or pd2['weather'].isnull().all():
                pd2['weather'] = '晴'
            if 'ground_state' not in pd2.columns or pd2['ground_state'].isnull().all():
                pd2['ground_state'] = '良'
            pd2.to_pickle(filepath)

        # 出馬表の加工
        shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath)

        # テーブルのマージ
        shutuba_data_merger = preprocessing.ShutubaDataMerger(
            shutuba_table_processor,
            horse_results_processor,
            horse_info_processor,
            peds_processor,
            target_cols=TARGET_COLS,
            group_cols=GROUP_COLS
)
        shutuba_data_merger.merge()

        # 特徴量エンジニアリング
        feature_enginnering_shutuba = (
            preprocessing.FeatureEngineering(shutuba_data_merger)
            .add_interval()
            .add_agedays()
            .dumminize_ground_state()
            .dumminize_race_type()
            .dumminize_sex()
            .dumminize_weather()
            .encode_horse_id()
            .encode_jockey_id()
            .encode_trainer_id()
            .encode_owner_id()
            .encode_breeder_id()
            .dumminize_kaisai()
            .dumminize_around()
            .dumminize_race_class()
        )

        # 予測（整列・型変換は calc_score 側のポリシーで吸収）
        X = feature_enginnering_shutuba.featured_data.drop(['date'], axis=1, errors='ignore')

        # 当日の出走情報テーブル（前処理前）
        df_tmp = shutuba_table_processor.raw_data[:1]

        # レース情報の表示
        race_info = ""
        for place_name, num in Master.PLACE_DICT.items():
            if num == race_id[4:6]:
                race_info = (
                    f"{place_name}{race_id[10:12]}R {race_time}発走 "
                    f"{df_tmp.iat[0, 12]}{df_tmp.iat[0, 10]}m "
                    f"{df_tmp.iat[0, 13]} {df_tmp.iat[0, 15]}"
                )
                print(race_info)
                break

        # 予測実行
        score_result = keiba_ai.calc_score(X, score_policy).sort_values('score', ascending=False)
        print("score nunique:", score_result['score'].nunique())

        # 上位馬のみを表示（簡潔化）
        top_horses = score_result.head(5)
        print("TOP5予想:")
        for rank, (_, row) in enumerate(top_horses.iterrows(), 1):
            print(f"  {rank}位: {row['馬番']}番 (スコア: {row['score']:.3f})")

        # 結果を保存
        all_predictions[race_id] = {
            'race_info': race_info,
            'predictions': score_result,
            'race_time': race_time
        }

        print(f"✅ {race_id} 予測完了")

    except Exception as e:
        error_count += 1
        print(f"❌ {race_id} 予測エラー: {str(e)}")
        # エラーが発生したレースをスキップして続行
        continue

print(f"\n=== 全レース予測完了 ===")
print(f"成功: {len(all_predictions)}/{len(target_race_id_list)} レース")
print(f"エラー: {error_count} レース")

# 最終結果のサマリー表示
if all_predictions:
    print(f"\n=== 本日の予想結果一覧 ===")
    for race_id, result in all_predictions.items():
        print(f"\n{result['race_info']}")
        top3 = result['predictions'].head(3)

        for rank, (_, row) in enumerate(top3.iterrows(), 1):
            print(f"  {rank}位予想: {row['馬番']}番 (スコア: {row['score']:.3f})")

    print(f"\n🎯 {len(all_predictions)}レースの予測が完了しました！")
else:
    print("❌ 予測に成功したレースがありません。")

=== 全レース前日予測開始 ===
対象レース数: 24
前日予想モード: ON

[1/24] レース処理中: 202506050701
スクレイピング完了 - レース202506050701: 17頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', 'リオクリスハーレー', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', 'リオクリスハーレー', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['リオクリスハーレー', '']
  除去レコード 202506050701: 馬番='リオクリスハーレー', 体重='nan'
  除去レコード 202506050701: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（15件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山01R 10:00発走 ダート未勝利m 右 重
score nunique: 15
TOP5予想:
  1位: 10番 (スコア: 2.815)
  2位: 15番 (スコア: 0.654)
  3位: 4番 (スコア: 0.523)
  4位: 13番 (スコア: 0.481)
  5位: 5番 (スコア: 0.460)
✅ 202506050701 予測完了

[2/24] レース処理中: 202506050702


  else:


スクレイピング完了 - レース202506050702: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ゴットタレント', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ゴットタレント', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ゴットタレント', '']
  除去レコード 202506050702: 馬番='ゴットタレント', 体重='nan'
  除去レコード 202506050702: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山02R 10:30発走 ダート未勝利m 右 重
score nunique: 16
TOP5予想:
  1位: 8番 (スコア: 1.599)
  2位: 9番 (スコア: 1.457)
  3位: 7番 (スコア: 1.412)
  4位: 4番 (スコア: 0.463)
  5位: 1番 (スコア: 0.452)
✅ 202506050702 予測完了

[3/24] レース処理中: 202506050703


  else:


スクレイピング完了 - レース202506050703: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'チュラヴェール', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'チュラヴェール', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['チュラヴェール', '']
  除去レコード 202506050703: 馬番='チュラヴェール', 体重='nan'
  除去レコード 202506050703: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山03R 11:00発走 ダート未勝利m 右 重
score nunique: 16
TOP5予想:
  1位: 16番 (スコア: 1.805)
  2位: 1番 (スコア: 1.789)
  3位: 11番 (スコア: 1.079)
  4位: 8番 (スコア: 0.778)
  5位: 4番 (スコア: 0.429)
✅ 202506050703 予測完了

[4/24] レース処理中: 202506050704


  else:


スクレイピング完了 - レース202506050704: 20頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', 'アイスアンドスノー', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', 'アイスアンドスノー', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['アイスアンドスノー', '']
  除去レコード 202506050704: 馬番='アイスアンドスノー', 体重='nan'
  除去レコード 202506050704: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（18件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山04R 11:30発走 芝未勝利m 右 稍重
score nunique: 18
TOP5予想:
  1位: 7番 (スコア: 1.863)
  2位: 5番 (スコア: 1.618)
  3位: 13番 (スコア: 1.612)
  4位: 12番 (スコア: 1.395)
  5位: 4番 (スコア: 0.491)
✅ 202506050704 予測完了

[5/24] レース処理中: 202506050705


  else:


スクレイピング完了 - レース202506050705: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ライトハウス', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ライトハウス', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ライトハウス', '']
  除去レコード 202506050705: 馬番='ライトハウス', 体重='nan'
  除去レコード 202506050705: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山05R 12:20発走 芝新馬m 右 稍重
score nunique: 16
TOP5予想:
  1位: 9番 (スコア: 0.642)
  2位: 5番 (スコア: 0.607)
  3位: 1番 (スコア: 0.554)
  4位: 16番 (スコア: 0.453)
  5位: 7番 (スコア: 0.440)
✅ 202506050705 予測完了

[6/24] レース処理中: 202506050706


  else:


スクレイピング完了 - レース202506050706: 16頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', 'アイアンクラッド', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', 'アイアンクラッド', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['アイアンクラッド', '']
  除去レコード 202506050706: 馬番='アイアンクラッド', 体重='nan'
  除去レコード 202506050706: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（14件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山06R 12:50発走 ダート新馬m 右 重
score nunique: 14
TOP5予想:
  1位: 4番 (スコア: 0.914)
  2位: 7番 (スコア: 0.864)
  3位: 14番 (スコア: 0.452)
  4位: 9番 (スコア: 0.442)
  5位: 6番 (スコア: 0.331)
✅ 202506050706 予測完了

[7/24] レース処理中: 202506050707


  else:


スクレイピング完了 - レース202506050707: 20頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', 'ツインピークス', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', 'ツインピークス', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ツインピークス', '']
  除去レコード 202506050707: 馬番='ツインピークス', 体重='nan'
  除去レコード 202506050707: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（18件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山07R 13:20発走 右2200m 晴 1勝クラス
score nunique: 18
TOP5予想:
  1位: 2番 (スコア: 2.381)
  2位: 9番 (スコア: 1.683)
  3位: 11番 (スコア: 1.228)
  4位: 14番 (スコア: 0.981)
  5位: 16番 (スコア: 0.452)
✅ 202506050707 予測完了

[8/24] レース処理中: 202506050708


  else:


スクレイピング完了 - レース202506050708: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ホークライト', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ホークライト', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ホークライト', '']
  除去レコード 202506050708: 馬番='ホークライト', 体重='nan'
  除去レコード 202506050708: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山08R 13:50発走 右1800m 晴 1勝クラス
score nunique: 16
TOP5予想:
  1位: 2番 (スコア: 2.464)
  2位: 1番 (スコア: 1.780)
  3位: 8番 (スコア: 0.831)
  4位: 14番 (スコア: 0.493)
  5位: 5番 (スコア: 0.166)
✅ 202506050708 予測完了

[9/24] レース処理中: 202506050709


  else:


スクレイピング完了 - レース202506050709: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ゴキゲンサン', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ゴキゲンサン', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ゴキゲンサン', '']
  除去レコード 202506050709: 馬番='ゴキゲンサン', 体重='nan'
  除去レコード 202506050709: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山09R 14:20発走 右1200m 晴 2勝クラス
score nunique: 16
TOP5予想:
  1位: 6番 (スコア: 1.712)
  2位: 13番 (スコア: 1.502)
  3位: 9番 (スコア: 1.472)
  4位: 11番 (スコア: 0.976)
  5位: 7番 (スコア: 0.500)
✅ 202506050709 予測完了

[10/24] レース処理中: 202506050710


  else:


スクレイピング完了 - レース202506050710: 10頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
すべての馬番が有効です
ShutubaTableProcessor: 馬番クリーンアップ開始（10件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山10R 15:00発走 晴障害m 稍重 障害
score nunique: 10
TOP5予想:
  1位: 1番 (スコア: 1.780)
  2位: 6番 (スコア: 1.345)
  3位: 7番 (スコア: 0.335)
  4位: 3番 (スコア: 0.268)
  5位: 8番 (スコア: 0.093)
✅ 202506050710 予測完了

[11/24] レース処理中: 202506050711


  else:


スクレイピング完了 - レース202506050711: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ノチェセラーダ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ノチェセラーダ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ノチェセラーダ', '']
  除去レコード 202506050711: 馬番='ノチェセラーダ', 体重='nan'
  除去レコード 202506050711: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山11R 15:45発走 右2000m 晴 G1
score nunique: 16
TOP5予想:
  1位: 3番 (スコア: 2.047)
  2位: 13番 (スコア: 0.919)
  3位: 16番 (スコア: 0.864)
  4位: 10番 (スコア: 0.404)
  5位: 7番 (スコア: 0.375)
✅ 202506050711 予測完了

[12/24] レース処理中: 202506050712


  else:


スクレイピング完了 - レース202506050712: 16頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', 'スピードリッチ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', 'スピードリッチ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['スピードリッチ', '']
  除去レコード 202506050712: 馬番='スピードリッチ', 体重='nan'
  除去レコード 202506050712: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（14件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山12R 16:25発走 右2500m 晴 3勝クラス
score nunique: 14
TOP5予想:
  1位: 6番 (スコア: 2.166)
  2位: 14番 (スコア: 1.290)
  3位: 1番 (スコア: 1.222)
  4位: 7番 (スコア: 0.171)
  5位: 8番 (スコア: 0.050)
✅ 202506050712 予測完了

[13/24] レース処理中: 202509050701


  else:


スクレイピング完了 - レース202509050701: 17頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', 'ザキノフレンズ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', 'ザキノフレンズ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ザキノフレンズ', '']
  除去レコード 202509050701: 馬番='ザキノフレンズ', 体重='nan'
  除去レコード 202509050701: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（15件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神01R 09:45発走 ダート未勝利m 右 重
score nunique: 15
TOP5予想:
  1位: 6番 (スコア: 2.828)
  2位: 8番 (スコア: 0.806)
  3位: 4番 (スコア: 0.439)
  4位: 13番 (スコア: 0.214)
  5位: 10番 (スコア: 0.202)
✅ 202509050701 予測完了

[14/24] レース処理中: 202509050702


  else:


スクレイピング完了 - レース202509050702: 13頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', 'テーオータウンズ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', 'テーオータウンズ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['テーオータウンズ', '']
  除去レコード 202509050702: 馬番='テーオータウンズ', 体重='nan'
  除去レコード 202509050702: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（11件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神02R 10:15発走 ダート未勝利m 右 重
score nunique: 11
TOP5予想:
  1位: 3番 (スコア: 1.872)
  2位: 8番 (スコア: 1.210)
  3位: 7番 (スコア: 0.663)
  4位: 9番 (スコア: 0.220)
  5位: 2番 (スコア: 0.157)
✅ 202509050702 予測完了

[15/24] レース処理中: 202509050703


  else:


スクレイピング完了 - レース202509050703: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'カフジクロミエ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'カフジクロミエ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['カフジクロミエ', '']
  除去レコード 202509050703: 馬番='カフジクロミエ', 体重='nan'
  除去レコード 202509050703: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神03R 10:45発走 ダート未勝利m 右 重
score nunique: 16
TOP5予想:
  1位: 7番 (スコア: 1.836)
  2位: 6番 (スコア: 1.075)
  3位: 15番 (スコア: 0.905)
  4位: 1番 (スコア: 0.745)
  5位: 10番 (スコア: 0.738)
✅ 202509050703 予測完了

[16/24] レース処理中: 202509050704


  else:


スクレイピング完了 - レース202509050704: 17頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', 'ヴィシュヴァナート', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', 'ヴィシュヴァナート', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ヴィシュヴァナート', '']
  除去レコード 202509050704: 馬番='ヴィシュヴァナート', 体重='nan'
  除去レコード 202509050704: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（15件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神04R 11:15発走 芝未勝利m 右 良
score nunique: 15
TOP5予想:
  1位: 12番 (スコア: 1.523)
  2位: 7番 (スコア: 1.499)
  3位: 4番 (スコア: 1.432)
  4位: 6番 (スコア: 0.795)
  5位: 15番 (スコア: 0.725)
✅ 202509050704 予測完了

[17/24] レース処理中: 202509050705


  else:


スクレイピング完了 - レース202509050705: 16頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', 'ミトノボタン', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', 'ミトノボタン', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ミトノボタン', '']
  除去レコード 202509050705: 馬番='ミトノボタン', 体重='nan'
  除去レコード 202509050705: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（14件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神05R 12:05発走 芝新馬m 右 良
score nunique: 14
TOP5予想:
  1位: 12番 (スコア: 0.857)
  2位: 11番 (スコア: 0.698)
  3位: 2番 (スコア: 0.596)
  4位: 9番 (スコア: 0.521)
  5位: 5番 (スコア: 0.467)
✅ 202509050705 予測完了

[18/24] レース処理中: 202509050706


  else:


スクレイピング完了 - レース202509050706: 13頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', 'ワンダーデリエール', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', 'ワンダーデリエール', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ワンダーデリエール', '']
  除去レコード 202509050706: 馬番='ワンダーデリエール', 体重='nan'
  除去レコード 202509050706: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（11件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神06R 12:35発走 ダート新馬m 右 重
score nunique: 11
TOP5予想:
  1位: 9番 (スコア: 0.650)
  2位: 11番 (スコア: 0.532)
  3位: 4番 (スコア: 0.475)
  4位: 8番 (スコア: 0.375)
  5位: 2番 (スコア: 0.348)
✅ 202509050706 予測完了

[19/24] レース処理中: 202509050707


  else:


スクレイピング完了 - レース202509050707: 17頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', 'メイショウピリカ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', 'メイショウピリカ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['メイショウピリカ', '']
  除去レコード 202509050707: 馬番='メイショウピリカ', 体重='nan'
  除去レコード 202509050707: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（15件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神07R 13:05発走 右1800m 晴 1勝クラス
score nunique: 15
TOP5予想:
  1位: 7番 (スコア: 2.213)
  2位: 6番 (スコア: 1.258)
  3位: 10番 (スコア: 1.102)
  4位: 9番 (スコア: 1.099)
  5位: 15番 (スコア: 0.158)
✅ 202509050707 予測完了

[20/24] レース処理中: 202509050708


  else:


スクレイピング完了 - レース202509050708: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'クリノキングマン', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'クリノキングマン', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['クリノキングマン', '']
  除去レコード 202509050708: 馬番='クリノキングマン', 体重='nan'
  除去レコード 202509050708: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神08R 13:35発走 右1400m 晴 2勝クラス
score nunique: 16
TOP5予想:
  1位: 3番 (スコア: 1.686)
  2位: 6番 (スコア: 1.427)
  3位: 10番 (スコア: 1.265)
  4位: 2番 (スコア: 0.762)
  5位: 8番 (スコア: 0.321)
✅ 202509050708 予測完了

[21/24] レース処理中: 202509050709


  else:


スクレイピング完了 - レース202509050709: 12頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'ゴーゴーリチャード', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'ゴーゴーリチャード', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ゴーゴーリチャード', '']
  除去レコード 202509050709: 馬番='ゴーゴーリチャード', 体重='nan'
  除去レコード 202509050709: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（10件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神09R 14:05発走 右1400m 晴 1勝クラス
score nunique: 10
TOP5予想:
  1位: 9番 (スコア: 1.028)
  2位: 2番 (スコア: 0.802)
  3位: 1番 (スコア: 0.760)
  4位: 8番 (スコア: 0.744)
  5位: 10番 (スコア: 0.594)
✅ 202509050709 予測完了

[22/24] レース処理中: 202509050710


  else:


スクレイピング完了 - レース202509050710: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'メイショウソウタ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'メイショウソウタ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['メイショウソウタ', '']
  除去レコード 202509050710: 馬番='メイショウソウタ', 体重='nan'
  除去レコード 202509050710: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神10R 14:45発走 右1800m 晴 3勝クラス
score nunique: 16
TOP5予想:
  1位: 8番 (スコア: 1.875)
  2位: 2番 (スコア: 1.580)
  3位: 3番 (スコア: 1.238)
  4位: 12番 (スコア: 1.060)
  5位: 13番 (スコア: 0.308)
✅ 202509050710 予測完了

[23/24] レース処理中: 202509050711


  else:


スクレイピング完了 - レース202509050711: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'エイシンフェンサー', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'エイシンフェンサー', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['エイシンフェンサー', '']
  除去レコード 202509050711: 馬番='エイシンフェンサー', 体重='nan'
  除去レコード 202509050711: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神11R 15:25発走 右1400m 晴 G2
score nunique: 16
TOP5予想:
  1位: 7番 (スコア: 1.886)
  2位: 12番 (スコア: 1.337)
  3位: 5番 (スコア: 1.223)
  4位: 10番 (スコア: 0.557)
  5位: 2番 (スコア: 0.495)
✅ 202509050711 予測完了

[24/24] レース処理中: 202509050712


  else:


スクレイピング完了 - レース202509050712: 11頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'ミュージシャン', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'ミュージシャン', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ミュージシャン', '']
  除去レコード 202509050712: 馬番='ミュージシャン', 体重='nan'
  除去レコード 202509050712: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（9件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神12R 16:05発走 右2000m 晴 2勝クラス
score nunique: 9
TOP5予想:
  1位: 1番 (スコア: 1.281)
  2位: 4番 (スコア: 1.163)
  3位: 3番 (スコア: 0.644)
  4位: 6番 (スコア: 0.308)
  5位: 9番 (スコア: 0.035)
✅ 202509050712 予測完了

=== 全レース予測完了 ===
成功: 24/24 レース
エラー: 0 レース

=== 本日の予想結果一覧 ===

中山01R 10:00発走 ダート未勝利m 右 重
  1位予想: 10番 (スコア: 2.815)
  2位予想: 15番 (スコア: 0.654)
  3位予想: 4番 (スコア: 0.523)

中山02R 10:30発走 ダート未勝利m 右 重
  1位予想: 8番 (スコア: 1.599)
  2位予想: 9番 (スコア: 1.457)
  3位予想: 7番 (スコア: 1.412)

中山03R 11:00発走 ダート未勝利m 右 重
  1位予想: 16番 (スコア: 1.805)
  2位予想: 1番 (スコア: 1.789)
  3位予想: 11番 (スコア: 1.079)

中山04R 11:30発走 芝未勝利m 右 稍重
  1位予想: 7番 (スコア: 1.863)
  2位予想: 5番 (スコア: 1.618)
  3位予想: 13番 (スコア: 1.612)

中山05R 12:20発走 芝新馬m 右 稍重
  1位予想: 9番 (スコア: 0.642)
  2位予想: 5番 (スコア: 0.607)
  3位予想: 1番 (スコア: 0.554)

中山06R 12:50発走 ダート新馬m 右 重
  1位予想: 4番 (スコア: 0.914)
  2位予想: 7番 (スコア: 0.864)
  3位予想: 14番 (スコア: 0.452)

中山07R 13:20発走 右2200m 晴 1勝クラス
  1位予想: 2番 (スコア: 2.381)
  2位予想: 9番 (スコア: 1.683)
  3位予想: 11番 (スコア: 1.228)

中山08R 13:50発走 右1800m 晴 1勝クラス
  1位予想: 2番 (スコア: 2.464)
 

  else:


## 6.3. レース直前データ処理（当日レース予想）

In [23]:
%autoreload

In [30]:
# 馬体重の発表されたレースID、レース時刻を取得（レース当日用）
target_race_id_list, target_race_time_list = preparing.create_active_race_id_list()

# レース時刻順にソート
race_data = list(zip(target_race_id_list, target_race_time_list))
race_data_sorted = sorted(race_data, key=lambda x: x[1])  # 時刻でソート
target_race_id_list = [race_id for race_id, race_time in race_data_sorted]
target_race_time_list = [race_time for race_id, race_time in race_data_sorted]

print("ソート後のレースID:", target_race_id_list)
print("ソート後のレース時刻:", target_race_time_list)

20251228 11:08
getting race_id_list
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20251228
ソート後のレースID: ['202506050804', '202509050805', '202506050805']
ソート後のレース時刻: ['11:20', '11:35', '11:50']


In [31]:
# 一時的に出馬表を保存するパスを指定
filepath = 'data/tmp/shutuba.pickle'
#today = '2022/10/01'
today = datetime.datetime.now().date().strftime('%Y/%m/%d')

for race_id, race_time in zip(target_race_id_list, target_race_time_list):
    # 出馬表の取得
    preparing.scrape_shutuba_table(race_id, today, filepath)

    # 出馬表の加工
    shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath)
    # 馬番クリーンアップを含む前処理を実行
    # shutuba_table_processor.process()

    # テーブルのマージ
    shutuba_data_merger = preprocessing.ShutubaDataMerger(
        shutuba_table_processor,
        horse_results_processor,
        horse_info_processor,
        peds_processor,
        target_cols=TARGET_COLS,
        group_cols=GROUP_COLS
    )
    shutuba_data_merger.merge()

    # 特徴量エンジニアリング
    feature_enginnering_shutuba = preprocessing.FeatureEngineering(shutuba_data_merger) \
        .add_interval()\
        .add_agedays()\
        .dumminize_ground_state()\
        .dumminize_race_type()\
        .dumminize_sex()\
        .dumminize_weather()\
        .encode_horse_id()\
        .encode_jockey_id()\
        .encode_trainer_id()\
        .encode_owner_id()\
        .encode_breeder_id()\
        .dumminize_kaisai()\
        .dumminize_around()\
        .dumminize_race_class()

    # 予測
    X = feature_enginnering_shutuba.featured_data.drop(['date'], axis=1)

    # 当日の出走情報テーブル（前処理前）
    df_tmp = shutuba_table_processor.raw_data[:1]
    #df_tmp['race_type'] tmp.iat[0, 12]
    #df_tmp['around'] 13
    #df_tmp['weather'] 14
    #df_tmp['ground_state'] 15
    #df_tmp['race_class']16

    i = 0
    for num in list(Master.PLACE_DICT.values()):
        if num == race_id[4:6]:
            print(list(Master.PLACE_DICT)[i] + race_id[10:12] + 'R ' + race_time + '発走 ' + str(df_tmp.iat[0, 12])
                + str(df_tmp.iat[0, 10]) + 'm ' + str(df_tmp.iat[0, 13]) + ' ' + str(df_tmp.iat[0, 15]))
            break
        i += 1

    print(keiba_ai.calc_score(X, policies.StdScorePolicy).sort_values('score', ascending=False))

スクレイピング完了 - レース202506050804: 13頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', 'ドングラミ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', 'ドングラミ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ドングラミ', '']
  除去レコード 202506050804: 馬番='ドングラミ', 体重='nan'
  除去レコード 202506050804: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（11件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山04R 11:20発走 右1200m 晴 1勝クラス
         race_id  馬番     score
3   202506050804   4  1.265241
2   202506050804   3  1.230514
1   202506050804   2  0.838660
4   202506050804   5  0.598365
10  202506050804  11  0.243672
8   202506050804   9  0.028602
6   202506050804   7 -0.059290
7   202506050804   8 -0.367150
5   202506050804   6 -0.516006
0   202506050804   1 -1.496144
9   202506050804  10 -1.766463
スクレイピング完了 - レース202509050805: 14頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']
すべての馬番が有効です
ShutubaTableProcessor: 馬番クリーンアップ開始（14件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

阪神05R 11:35発走 2970障害m 晴 障害
         race_id  馬番     score
9   202509050805  10  2.647266
3   202509050805   4  0.933475
1   202509050805   2  0.630852
7   202509050805   8  0.551808
11  202509050805  12  0.058586
8   202509050805   9 -0.058086
5   202509050805   6 -0.071929
13  202509050805  14 -0.135519
6   202509050805   7 -0.186946
12  202509050805  13 -0.338266
0   202509050805   1 -0.776775
2   202509050805   3 -0.867793
10  202509050805  11 -1.063194
4   202509050805   5 -1.323479
スクレイピング完了 - レース202506050805: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ノーチェ', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ノーチェ', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ノーチェ', '']
  除去レコード 202506050805: 馬番='ノーチェ', 体重='nan'
  除去レコード 202506050805: 馬番='', 体重='nan'
ShutubaTableProcessor: 馬番クリーンアップ開始（16件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separa

  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

中山05R 11:50発走 ダート新馬m 右 良
         race_id  馬番     score
8   202506050805   9  0.893227
11  202506050805  12  0.827784
13  202506050805  14  0.719314
1   202506050805   2  0.592404
6   202506050805   7  0.468350
0   202506050805   1  0.351470
15  202506050805  16  0.344119
3   202506050805   4  0.255851
7   202506050805   8  0.140749
4   202506050805   5  0.117439
5   202506050805   6  0.075398
9   202506050805  10  0.074279
2   202506050805   3 -0.103034
10  202506050805  11 -0.587688
12  202506050805  13 -0.854390
14  202506050805  15 -3.315271


## 6.4. レース直前データ処理（旧方式）

In [None]:
filepath = 'data/tmp/shutuba.pickle' #一時的に出馬表を保存するパスを指定
preparing.scrape_shutuba_table(race_id_list[0], '2025/9/21', filepath) #馬体重が発表されたら、出馬表を取得
shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath) #出馬表の加工

In [None]:
#テーブルのマージ
shutuba_data_merger = preprocessing.ShutubaDataMerger(
    shutuba_table_processor,
    horse_results_processor,
    horse_info_processor,
    peds_processor,
    target_cols=TARGET_COLS,
    group_cols=GROUP_COLS
)

shutuba_data_merger.merge()

In [None]:
#特徴量エンジニアリング
feature_enginnering_shutuba = preprocessing.FeatureEngineering(shutuba_data_merger)\
    .add_interval()\
    .add_agedays()\
    .dumminize_ground_state()\
    .dumminize_race_type()\
    .dumminize_sex()\
    .dumminize_weather()\
    .encode_horse_id()\
    .encode_jockey_id()\
    .encode_trainer_id()\
    .encode_owner_id()\
    .encode_breeder_id()\
    .dumminize_kaisai()\
    .dumminize_around()\
    .dumminize_race_class()

In [None]:
# 予測（学習列に完全整列＋NaN防止）
from modules.constants import ResultsCols
import numpy as np

# 1) 学習で実際に使った列を取得
train_cols = keiba_ai.datasets.X_train.columns

# 2) 予測用特徴量（date/rankは除外）
X_feat = feature_enginnering_shutuba.featured_data.drop(['date', 'rank'], axis=1, errors='ignore')

# 3) 学習列に揃える（不足は0、余剰は落とす）
X_feat = X_feat.reindex(columns=train_cols, fill_value=0)

# 4) 数値化とNaN/inf対策
for c in X_feat.columns:
    if getattr(X_feat[c].dtype, 'name', '') == 'category':
        X_feat[c] = X_feat[c].cat.codes
X_feat = X_feat.astype(float).replace([np.inf, -np.inf], 0).fillna(0)

# 5) 表示用に馬番を付与（ポリシー側で自動除外）
X_for_policy = X_feat.copy()
if ResultsCols.UMABAN in feature_enginnering_shutuba.featured_data.columns:
    X_for_policy[ResultsCols.UMABAN] = feature_enginnering_shutuba.featured_data[ResultsCols.UMABAN].values

# 6) 予測
score_result = keiba_ai.calc_score(X_for_policy, policies.StdScorePolicy).sort_values('score', ascending=False)
score_result.head()

## 付録
騎手勝率無し VS 有りの比較

In [None]:
old_returns_df = pd.read_pickle('models/20251226/tansho_no_jockey_std_0_3p5.pickle')



# old_returns_df と returns_df の結果を重ねてプロットして比較

plot_single_threshold_compare(

    old_returns_df, returns_df, N_SAMPLES,

    label1='no_jockey(std,0-3.5)', label2='with_jockey(std,0-3.5)'

)




In [None]:
# n_bets / n_races がどの閾値から崩れるか確認（特徴量なし vs あり）

import matplotlib.pyplot as plt



def _plot_counts(df, label, ax_bets, ax_races):

    ax_bets.plot(df.index, df['n_bets'], label=label)

    ax_races.plot(df.index, df['n_races'], label=label)



fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 6), dpi=100, sharex=True)



_plot_counts(old_returns_df, 'no_jockey', ax1, ax2)

_plot_counts(returns_df, 'with_jockey', ax1, ax2)



ax1.set_ylabel('n_bets')

ax1.grid(True)

ax1.legend()



ax2.set_ylabel('n_races')

ax2.set_xlabel('threshold')

ax2.grid(True)

ax2.legend()



plt.show()



print('--- tail(10): no_jockey ---')

display(old_returns_df.tail(10)[['n_bets','n_races','return_rate']])

print('--- tail(10): with_jockey ---')

display(returns_df.tail(10)[['n_bets','n_races','return_rate']])


In [None]:
# n_races >= 100 に限定した return_rate の最大値（特徴量なし vs あり）

import numpy as np



MIN_RACES = 100



def best_under_constraint(df, min_races: int):

    d = df[df['n_races'] >= min_races].copy()

    if len(d) == 0:

        return None, None, d

    best_thr = float(d['return_rate'].idxmax())

    best_rr = float(d.loc[best_thr, 'return_rate'])

    return best_thr, best_rr, d



thr0, rr0, d0 = best_under_constraint(old_returns_df, MIN_RACES)

thr1, rr1, d1 = best_under_constraint(returns_df, MIN_RACES)



print(f'MIN_RACES = {MIN_RACES}')

print('--- no_jockey ---')

if thr0 is None:

    print('条件を満たすthresholdがありません')

else:

    print('best threshold:', thr0)

    print('best return_rate:', rr0)

    display(old_returns_df.loc[[thr0], ['n_bets','n_races','return_rate','std']])



print('--- with_jockey ---')

if thr1 is None:

    print('条件を満たすthresholdがありません')

else:

    print('best threshold:', thr1)

    print('best return_rate:', rr1)

    display(returns_df.loc[[thr1], ['n_bets','n_races','return_rate','std']])



# 参考: 上位5件も表示

if len(d0) > 0:

    print('top5(no_jockey)')

    display(d0.sort_values('return_rate', ascending=False).head(5)[['n_bets','n_races','return_rate','std']])

if len(d1) > 0:

    print('top5(with_jockey)')

    display(d1.sort_values('return_rate', ascending=False).head(5)[['n_bets','n_races','return_rate','std']])


In [None]:
# 上位帯の安定性チェック: MIN_RACES を変えてベストを比較

import pandas as pd



MIN_RACES_LIST = [100, 200, 500]



def best_row(df: pd.DataFrame, min_races: int, label: str) -> dict:

    d = df[df['n_races'] >= min_races]

    if len(d) == 0:

        return {

            'model': label,

            'min_races': min_races,

            'best_threshold': None,

            'best_return_rate': None,

            'n_races': 0,

            'n_bets': 0,

            'std': None,

        }

    thr = float(d['return_rate'].idxmax())

    row = df.loc[thr]

    return {

        'model': label,

        'min_races': min_races,

        'best_threshold': thr,

        'best_return_rate': float(row['return_rate']),

        'n_races': int(row['n_races']),

        'n_bets': int(row['n_bets']),

        'std': float(row['std']),

    }



rows = []

for m in MIN_RACES_LIST:

    rows.append(best_row(old_returns_df, m, 'no_jockey'))

    rows.append(best_row(returns_df, m, 'with_jockey'))



stability_df = pd.DataFrame(rows).sort_values(['min_races', 'model']).reset_index(drop=True)

display(stability_df)


In [None]:
# 運用向けチェック: min_races=500 の範囲で return_rate > 1.0 は存在するか

import pandas as pd



MIN_RACES_OPS = 500

RR_TARGET = 1.0



def points_over_1(df: pd.DataFrame, label: str, min_races: int, rr_target: float):

    d = df[df['n_races'] >= min_races].copy()

    over = d[d['return_rate'] > rr_target].copy()

    print(f'[{label}] min_races>={min_races} の点数: {len(d)}')

    print(f'[{label}] return_rate>{rr_target} の点数: {len(over)}')

    if len(over) == 0:

        return

    print(f'[{label}] threshold 範囲: {float(over.index.min())} 〜 {float(over.index.max())}')

    display(over.sort_values('return_rate', ascending=False).head(10)[['n_bets','n_races','return_rate','std']])



points_over_1(old_returns_df, 'no_jockey', MIN_RACES_OPS, RR_TARGET)

points_over_1(returns_df, 'with_jockey', MIN_RACES_OPS, RR_TARGET)


In [None]:
# === 旧モデル(特徴量なし)を新仕様(StdScorePolicy)で再計算して保存 ===

import os

import traceback

import pandas as pd

from tqdm import tqdm



from modules import training, policies, preprocessing, simulation

from modules.constants import LocalPaths



# 旧モデル（特徴量なし）をロード

keiba_ai_no_jockey = training.KeibaAIFactory.load('models/20251223/basemodel_2020_2025.pickle')

keiba_ai_no_jockey.set_params(keiba_ai_no_jockey.get_params())



# Simulator / ReturnProcessor が未準備なら用意

try:

    simulator

except NameError:

    return_processor = preprocessing.ReturnProcessor(filepath=LocalPaths.RAW_RETURN_TABLES_PATH)

    simulator = simulation.Simulator(return_processor)



# 0.0〜3.5 を両端含めてスイープ

T_RANGE_OLD = [0.0, 3.5]

N_SAMPLES_OLD = 100



score_table_old = keiba_ai_no_jockey.calc_score(keiba_ai_no_jockey.datasets.X_test, policies.StdScorePolicy)



returns_old = {}

for i in tqdm(range(N_SAMPLES_OLD)):

    if N_SAMPLES_OLD > 1:

        threshold = T_RANGE_OLD[0] + (T_RANGE_OLD[1] - T_RANGE_OLD[0]) * i / (N_SAMPLES_OLD - 1)

    else:

        threshold = T_RANGE_OLD[0]

    try:

        actions_old = keiba_ai_no_jockey.decide_action(

            score_table_old,

            policies.BetPolicyTansho,

            threshold=threshold,

        )

        returns_old[threshold] = simulator.calc_returns(actions_old)

    except Exception:

        traceback.print_exc()

        break



returns_old_df = pd.DataFrame.from_dict(returns_old, orient='index').sort_index()

returns_old_df.index.name = 'threshold'



os.makedirs('models/20251226', exist_ok=True)

returns_old_path = 'models/20251226/tansho_no_jockey_std_0_3p5.pickle'

returns_old_df.to_pickle(returns_old_path)



print('saved:', returns_old_path)

print('index min/max/len:', float(returns_old_df.index.min()), float(returns_old_df.index.max()), len(returns_old_df))

returns_old_df.head()


## 6.5. 過去日（2025/12/21）の当日予想→券種別回収率シミュレーション

このセクションは 6.3 の「当日予想」セルと同じ処理（出馬表→結合→特徴量→スコア）を、**過去日**の指定 race_id に対して実行し、指定ルールで馬券を買ったと仮定した回収率を計算します。

注意:
- `DataMerger` 側の `date < 対象日` フィルタ（馬の過去成績集計）に依存してリークを避けます。
- ただし **使用モデルが対象日のデータを学習に含んでいる場合**、評価は楽観的になり得ます。
- 払戻テーブル（return_tables）に race_id が無いと、そのレースは集計から除外されます。

In [7]:
import os
import time
import math
import numpy as np
import pandas as pd

from modules import preparing, preprocessing, policies, training, simulation
from modules.constants import LocalPaths, ResultsCols

# --- 対象レース（2025/12/21 全12R x 3開催 = 36レース） ---
SIM_DATE_STR = '2025/12/21'  # scrape_shutuba_table の date 引数（yyyy/mm/dd）
BASE_RACE_IDS = [
    '202506050601',
    '202509050601',
    '202507050601',
]

race_id_list = []
for base in BASE_RACE_IDS:
    prefix = base[:-2]  # 末尾"01"を除いた部分
    race_id_list.extend([prefix + f'{i:02d}' for i in range(1, 13)])
race_id_list = sorted(set(race_id_list))
print('race_id_list size:', len(race_id_list))

# --- 使用モデル ---
MODEL_PATH = 'models/20251226/basemodel_2020_2025.pickle'
if not os.path.exists(MODEL_PATH):
    # 実在する basemodel を自動選択（ファイル名が変わっても動くようにする）
    candidates = [
        os.path.join('models', '20251226', f)
        for f in os.listdir(os.path.join('models', '20251226'))
        if f.startswith('basemodel_') and f.endswith('.pickle')
    ]
    if len(candidates) == 0:
        raise FileNotFoundError('basemodel_*.pickle が models/20251226 に見つかりません')
    MODEL_PATH = sorted(candidates)[-1]
    print('[WARN] 指定モデルが無いため自動選択:', MODEL_PATH)

keiba_ai = training.KeibaAIFactory.load(MODEL_PATH)
score_policy = policies.StdScorePolicy

# --- 前処理済みテーブル（最新 raw を使用） ---
horse_results_processor = preprocessing.HorseResultsProcessor(filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
horse_info_processor   = preprocessing.HorseInfoProcessor(filepath=LocalPaths.RAW_HORSE_INFO_PATH)
peds_processor         = preprocessing.PedsProcessor(filepath=LocalPaths.RAW_PEDS_PATH)
return_processor       = preprocessing.ReturnProcessor(filepath=LocalPaths.RAW_RETURN_TABLES_PATH)
simulator              = simulation.Simulator(return_processor)

# 6章の既存変数が無い場合は最低限の空で進める（特徴量が減るだけ）
if 'TARGET_COLS' not in globals():
    TARGET_COLS = []
    print('[WARN] TARGET_COLS が未定義なので空で進めます（特徴量が減ります）。')
if 'GROUP_COLS' not in globals():
    GROUP_COLS = []
    print('[WARN] GROUP_COLS が未定義なので空で進めます（特徴量が減ります）。')

race_id_list size: 36


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[Cols.PRIZE].fillna(0, inplace=True)


In [8]:
# --- 1) 出馬表スクレイピング（36レース） ---
out_dir = os.path.join(LocalPaths.TMP_DIR, 'shutuba_20251221')
os.makedirs(out_dir, exist_ok=True)

created = 0
skipped = 0
for rid in race_id_list:
    out_path = os.path.join(out_dir, f'{rid}.pickle')
    if os.path.exists(out_path):
        skipped += 1
        continue
    time.sleep(1)  # サーバー負荷軽減
    preparing.scrape_shutuba_table(rid, SIM_DATE_STR, out_path)
    created += 1

print('scrape done. created=', created, 'skipped(existing)=', skipped, 'dir=', out_dir)

scrape done. created= 0 skipped(existing)= 36 dir= c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\tmp\shutuba_20251221


In [9]:
# --- 2) 出馬表pickleを結合 → 前処理 → マージ → 特徴量 ---
paths = [os.path.join(out_dir, f'{rid}.pickle') for rid in race_id_list if os.path.exists(os.path.join(out_dir, f'{rid}.pickle'))]
print('available shutuba pickles:', len(paths), '/', len(race_id_list))

if len(paths) == 0:
    raise RuntimeError('出馬表pickleが1件もありません。先にスクレイピングセルを実行してください。')

raw_list = [pd.read_pickle(p) for p in paths]
shutuba_raw = pd.concat(raw_list, axis=0, ignore_index=False)
shutuba_all_path = os.path.join(LocalPaths.TMP_DIR, 'shutuba_20251221_all.pickle')
shutuba_raw.to_pickle(shutuba_all_path)
print('saved:', shutuba_all_path, 'rows=', len(shutuba_raw))

# 出馬表の加工（race_idメタ列を保持するようにProcessor側は修正済み）
shutuba_table_processor = preprocessing.ShutubaTableProcessor(shutuba_all_path)

# テーブルのマージ
shutuba_data_merger = preprocessing.ShutubaDataMerger(
    shutuba_table_processor,
    horse_results_processor,
    horse_info_processor,
    peds_processor,
    target_cols=TARGET_COLS,
    group_cols=GROUP_COLS,
 )
shutuba_data_merger.merge()

# 特徴量エンジニアリング
feature_enginnering_shutuba = preprocessing.FeatureEngineering(shutuba_data_merger)\
    .add_interval()\
    .add_agedays()\
    .dumminize_ground_state()\
    .dumminize_race_type()\
    .dumminize_sex()\
    .dumminize_weather()\
    .encode_horse_id()\
    .encode_jockey_id()\
    .encode_trainer_id()\
    .encode_owner_id()\
    .encode_breeder_id()\
    .dumminize_kaisai()\
    .dumminize_around()\
    .dumminize_race_class()

X_shutuba = feature_enginnering_shutuba.featured_data
print('X_shutuba shape:', X_shutuba.shape)

available shutuba pickles: 36 / 36
saved: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\tmp\shutuba_20251221_all.pickle rows= 544
ShutubaTableProcessor: 馬番クリーンアップ開始（544件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

X_shutuba shape: (544, 280)


In [14]:
# --- 2.5) コード修正を反映（モジュールreload） ---
import importlib
import modules.policies._score_policy as _score_policy_mod
import modules.policies as _policies_mod

importlib.reload(_score_policy_mod)
importlib.reload(_policies_mod)

from modules import policies as policies  # 再import
score_policy = policies.StdScorePolicy
print('reloaded policies._score_policy')

reloaded policies._score_policy


In [15]:
# --- 3) スコア算出（レース内標準化） ---
score_table_20251221 = keiba_ai.calc_score(X_shutuba, score_policy)
print('score_table shape:', score_table_20251221.shape)
display(score_table_20251221.head())

# 便利カラム（型）を整える
if ResultsCols.UMABAN in score_table_20251221.columns:
    score_table_20251221[ResultsCols.UMABAN] = pd.to_numeric(score_table_20251221[ResultsCols.UMABAN], errors='coerce').astype('Int64')
score_table_20251221['race_id'] = score_table_20251221['race_id'].astype(str)

score_table shape: (544, 3)


Unnamed: 0,race_id,馬番,score
0,202506050601,1,0.017558
1,202506050601,2,0.017558
2,202506050601,3,0.017558
3,202506050601,4,0.017558
4,202506050601,5,0.017558


In [16]:
# --- 4) 券種別ルールで actions を作る ---
def _top_umaban(df_1race: pd.DataFrame, n: int) -> list[int]:
    df = df_1race.sort_values('score', ascending=False)
    uma = df[ResultsCols.UMABAN].dropna().astype(int).tolist()
    return uma[:n]

def build_actions_by_ticket(score_table: pd.DataFrame) -> dict[str, dict]:
    actions_by_ticket: dict[str, dict] = {
        'tansho': {},
        'fukusho': {},
        'umaren': {},
        'umatan': {},
        'wide': {},
        'sanrenpuku': {},
        'sanrentan': {},
    }
    for rid, df_r in score_table.groupby('race_id'):
        n_horses = len(df_r)
        # ルールに従う（買い目が成立しない場合は空にする）
        top2 = _top_umaban(df_r, 2) if n_horses >= 1 else []
        top3 = _top_umaban(df_r, 3) if n_horses >= 1 else []
        top4 = _top_umaban(df_r, 4) if n_horses >= 1 else []
        if n_horses <= 13:
            topN_tri = max(3, (n_horses + 1) // 2)  # 上位半分（切り上げ）
        else:
            topN_tri = 7
        top_tri = _top_umaban(df_r, topN_tri) if n_horses >= 1 else []

        actions_by_ticket['tansho'][rid] = {'tansho': top2}
        actions_by_ticket['fukusho'][rid] = {'fukusho': top3}
        actions_by_ticket['umaren'][rid] = {'umaren': top4 if len(top4) >= 2 else []}
        actions_by_ticket['umatan'][rid] = {'umatan': top4 if len(top4) >= 2 else []}
        actions_by_ticket['wide'][rid] = {'wide': top4 if len(top4) >= 2 else []}
        actions_by_ticket['sanrenpuku'][rid] = {'sanrenpuku': top_tri if len(top_tri) >= 3 else []}
        actions_by_ticket['sanrentan'][rid] = {'sanrentan': top_tri if len(top_tri) >= 3 else []}
    return actions_by_ticket

actions_by_ticket = build_actions_by_ticket(score_table_20251221)
print('tickets:', list(actions_by_ticket.keys()))
print('races in actions:', len(actions_by_ticket['tansho']))

tickets: ['tansho', 'fukusho', 'umaren', 'umatan', 'wide', 'sanrenpuku', 'sanrentan']
races in actions: 36


In [17]:
# --- 5) 券種別に回収率を集計（払戻テーブルに無いrace_idは自動スキップ） ---
rows = []
detail_by_ticket = {}
for ticket, actions_ticket in actions_by_ticket.items():
    returns_per_race = simulator.calc_returns_per_race(actions_ticket)
    returns = simulator.calc_returns(actions_ticket)
    detail_by_ticket[ticket] = returns_per_race.sort_index()
    skipped_races = len(actions_ticket) - returns_per_race.index.nunique()
    rows.append({
        'ticket': ticket,
        'n_races_target': len(actions_ticket),
        'n_races_in_return_tables': returns_per_race.index.nunique(),
        'n_races_skipped': skipped_races,
        **returns,
    })

summary_20251221 = pd.DataFrame(rows).sort_values('ticket').reset_index(drop=True)
display(summary_20251221)

# 例: 単勝のレース別明細
display(detail_by_ticket['tansho'].head())

Unnamed: 0,ticket,n_races_target,n_races_in_return_tables,n_races_skipped,n_bets,n_races,n_hits,total_bet_amount,return_rate,std
0,fukusho,36,36,0,108,36,23,108,0.802778,0.146652
1,sanrenpuku,36,36,0,1160,36,2,1160,0.066466,0.046758
2,sanrentan,36,36,0,6960,36,2,6960,0.069943,0.055993
3,tansho,36,36,0,72,36,8,72,0.626389,0.238018
4,umaren,36,36,0,216,36,6,216,0.714815,0.360902
5,umatan,36,36,0,432,36,6,432,0.741435,0.427189
6,wide,36,36,0,216,36,8,216,0.415278,0.165527


Unnamed: 0,n_bets,bet_amount,return_amount,hit_or_not
202506050601,2,2,0.0,0
202506050602,2,2,0.0,0
202506050603,2,2,0.0,0
202506050604,2,2,0.0,0
202506050605,2,2,0.0,0


In [18]:
# --- 6.5) 払戻（return_tables）欠損の補完（欠損race_idのみ取得） ---
import pandas as pd

from modules.constants import LocalPaths
from modules.preparing._scrape_html import scrape_html_race
from modules.preparing._get_rawdata import get_rawdata_return, update_rawdata
from modules.preprocessing._return_processor import ReturnProcessor
from modules.simulation._simulator import Simulator

# return_tables に存在する race_id を抽出（MultiIndexにも対応）
raw_return_tables = return_processor.raw_data
if getattr(raw_return_tables.index, 'nlevels', 1) > 1:
    existing_race_ids = set(raw_return_tables.index.get_level_values(0).astype(str))
else:
    existing_race_ids = set(raw_return_tables.index.astype(str))

missing_race_ids = sorted(set(map(str, race_id_list)) - existing_race_ids)
print(f'missing race_id in return_tables: {len(missing_race_ids)}')
if len(missing_race_ids) > 0:
    display(pd.Series(missing_race_ids, name='missing_race_id').head(20))

    # 1) race html 取得（欠損分のみ）
    updated_html_paths = scrape_html_race(missing_race_ids, skip=False)

    # 2) raw return_tables 作成→既存pickleへ追記
    new_return_df = get_rawdata_return(updated_html_paths)
    _ = update_rawdata(LocalPaths.RAW_RETURN_TABLES_PATH, new_return_df, mode='update')

    # 3) ReturnProcessor/Simulator を作り直し
    return_processor = ReturnProcessor(LocalPaths.RAW_RETURN_TABLES_PATH)
    simulator = Simulator(return_processor)

    print('return_tables updated. 再集計したい場合は、直前の回収率集計セル（6.5）を再実行してください。')
else:
    print('欠損はありません（このまま回収率集計結果を採用できます）。')


missing race_id in return_tables: 0
欠損はありません（このまま回収率集計結果を採用できます）。


## 6.6. 過去日（2025/12/20）の当日予想→券種別回収率シミュレーション

- 対象: 2025/12/20（土）
- race_id: 202506050501~12 / 202509050501~12 / 202507050501~12（合計36R）
- 6.5 と同じルールで actions を生成して回収率を集計

In [19]:
# --- 0) 対象レース設定（2025/12/20 全12R x 3開催 = 36レース） ---
import os
import time
import numpy as np
import pandas as pd

from modules import preparing, preprocessing, policies, training, simulation
from modules.constants import LocalPaths, ResultsCols

SIM_DATE_STR_20251220 = '2025/12/20'  # scrape_shutuba_table の date 引数（yyyy/mm/dd）
BASE_RACE_IDS_20251220 = [
    '202506050501',
    '202509050501',
    '202507050501',
]

race_id_list_20251220: list[str] = []
for base in BASE_RACE_IDS_20251220:
    prefix = base[:-2]
    race_id_list_20251220.extend([prefix + f'{i:02d}' for i in range(1, 13)])
race_id_list_20251220 = sorted(set(race_id_list_20251220))
print('race_id_list_20251220 size:', len(race_id_list_20251220))

# 6.5を実行していない環境でも動くように最低限を初期化
if 'keiba_ai' not in globals():
    MODEL_PATH = 'models/20251226/basemodel_2020_2025.pickle'
    if not os.path.exists(MODEL_PATH):
        candidates = [
            os.path.join('models', '20251226', f)
            for f in os.listdir(os.path.join('models', '20251226'))
            if f.startswith('basemodel_') and f.endswith('.pickle')
        ]
        if len(candidates) == 0:
            raise FileNotFoundError('basemodel_*.pickle が models/20251226 に見つかりません')
        MODEL_PATH = sorted(candidates)[-1]
        print('[WARN] 指定モデルが無いため自動選択:', MODEL_PATH)

    keiba_ai = training.KeibaAIFactory.load(MODEL_PATH)

if 'score_policy' not in globals():
    score_policy = policies.StdScorePolicy

# raw processors / simulator
horse_results_processor = preprocessing.HorseResultsProcessor(filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
horse_info_processor   = preprocessing.HorseInfoProcessor(filepath=LocalPaths.RAW_HORSE_INFO_PATH)
peds_processor         = preprocessing.PedsProcessor(filepath=LocalPaths.RAW_PEDS_PATH)
return_processor       = preprocessing.ReturnProcessor(filepath=LocalPaths.RAW_RETURN_TABLES_PATH)
simulator              = simulation.Simulator(return_processor)

if 'TARGET_COLS' not in globals():
    TARGET_COLS = []
    print('[WARN] TARGET_COLS が未定義なので空で進めます（特徴量が減ります）。')
if 'GROUP_COLS' not in globals():
    GROUP_COLS = []
    print('[WARN] GROUP_COLS が未定義なので空で進めます（特徴量が減ります）。')

out_dir_20251220 = os.path.join(LocalPaths.TMP_DIR, 'shutuba_20251220')
os.makedirs(out_dir_20251220, exist_ok=True)
print('out_dir_20251220:', out_dir_20251220)


race_id_list_20251220 size: 36


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[Cols.PRIZE].fillna(0, inplace=True)


out_dir_20251220: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\tmp\shutuba_20251220


In [20]:
# --- 1) 出馬表スクレイピング（36レース） ---
created = 0
skipped = 0
for rid in race_id_list_20251220:
    out_path = os.path.join(out_dir_20251220, f'{rid}.pickle')
    if os.path.exists(out_path):
        skipped += 1
        continue
    time.sleep(1)  # サーバー負荷軽減
    preparing.scrape_shutuba_table(rid, SIM_DATE_STR_20251220, out_path)
    created += 1

print('scrape done. created=', created, 'skipped(existing)=', skipped, 'dir=', out_dir_20251220)


スクレイピング完了 - レース202506050501: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'リスペクトライト', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'リスペクトライト', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['リスペクトライト', '']
  除去レコード 202506050501: 馬番='リスペクトライト', 体重='nan'
  除去レコード 202506050501: 馬番='', 体重='nan'
スクレイピング完了 - レース202506050502: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ブルーサーマル', '']
クリーンアップ前の馬番: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 'ブルーサーマル', '']
scrape_shutuba_table: 2件の不正な馬番レコードを除去しました
除去された馬番: ['ブルーサーマル', '']
  除去レコード 202506050502: 馬番='ブルーサーマル', 体重='nan'
  除去レコード 202506050502: 馬番='', 体重='nan'
スクレイピング完了 - レース202506050503: 18頭立て
生データの列数: 18
馬番列（index=1）の値: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '

In [21]:
# --- 2) 出馬表pickleを結合 → 前処理 → マージ → 特徴量 ---
paths = [
    os.path.join(out_dir_20251220, f'{rid}.pickle')
    for rid in race_id_list_20251220
    if os.path.exists(os.path.join(out_dir_20251220, f'{rid}.pickle'))
]
print('available shutuba pickles:', len(paths), '/', len(race_id_list_20251220))

if len(paths) == 0:
    raise RuntimeError('出馬表pickleが1件もありません。先にスクレイピングセルを実行してください。')

raw_list = [pd.read_pickle(p) for p in paths]
shutuba_raw_20251220 = pd.concat(raw_list, axis=0, ignore_index=False)
shutuba_all_path_20251220 = os.path.join(LocalPaths.TMP_DIR, 'shutuba_20251220_all.pickle')
shutuba_raw_20251220.to_pickle(shutuba_all_path_20251220)
print('saved:', shutuba_all_path_20251220, 'rows=', len(shutuba_raw_20251220))

shutuba_table_processor_20251220 = preprocessing.ShutubaTableProcessor(shutuba_all_path_20251220)

shutuba_data_merger_20251220 = preprocessing.ShutubaDataMerger(
    shutuba_table_processor_20251220,
    horse_results_processor,
    horse_info_processor,
    peds_processor,
    target_cols=TARGET_COLS,
    group_cols=GROUP_COLS,
)
shutuba_data_merger_20251220.merge()

feature_enginnering_shutuba_20251220 = preprocessing.FeatureEngineering(shutuba_data_merger_20251220)\
    .add_interval()\
    .add_agedays()\
    .dumminize_ground_state()\
    .dumminize_race_type()\
    .dumminize_sex()\
    .dumminize_weather()\
    .encode_horse_id()\
    .encode_jockey_id()\
    .encode_trainer_id()\
    .encode_owner_id()\
    .encode_breeder_id()\
    .dumminize_kaisai()\
    .dumminize_around()\
    .dumminize_race_class()

X_shutuba_20251220 = feature_enginnering_shutuba_20251220.featured_data
print('X_shutuba_20251220 shape:', X_shutuba_20251220.shape)


available shutuba pickles: 36 / 36
saved: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\tmp\shutuba_20251220_all.pickle rows= 545
ShutubaTableProcessor: 馬番クリーンアップ開始（545件のレコード）
ShutubaTableProcessor: すべての馬番が有効です
separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

X_shutuba_20251220 shape: (545, 280)


In [22]:
# --- 3) スコア算出（レース内標準化） ---
score_table_20251220 = keiba_ai.calc_score(X_shutuba_20251220, score_policy)
print('score_table_20251220 shape:', score_table_20251220.shape)
display(score_table_20251220.head())

if ResultsCols.UMABAN in score_table_20251220.columns:
    score_table_20251220[ResultsCols.UMABAN] = pd.to_numeric(score_table_20251220[ResultsCols.UMABAN], errors='coerce').astype('Int64')
score_table_20251220['race_id'] = score_table_20251220['race_id'].astype(str)


score_table_20251220 shape: (545, 3)


Unnamed: 0,race_id,馬番,score
0,202506050501,1,0.017558
1,202506050501,2,0.017558
2,202506050501,3,0.017558
3,202506050501,4,0.017558
4,202506050501,5,0.017558


In [23]:
# --- 4) 券種別ルールで actions を作る → 回収率を集計 ---
# 6.5で定義済みなら再利用、無ければここで定義
if 'build_actions_by_ticket' not in globals():
    def _top_umaban(df_1race: pd.DataFrame, n: int) -> list[int]:
        df = df_1race.sort_values('score', ascending=False)
        uma = df[ResultsCols.UMABAN].dropna().astype(int).tolist()
        return uma[:n]

    def build_actions_by_ticket(score_table: pd.DataFrame) -> dict[str, dict]:
        actions_by_ticket: dict[str, dict] = {
            'tansho': {},
            'fukusho': {},
            'umaren': {},
            'umatan': {},
            'wide': {},
            'sanrenpuku': {},
            'sanrentan': {},
        }
        for rid, df_r in score_table.groupby('race_id'):
            n_horses = len(df_r)
            top2 = _top_umaban(df_r, 2) if n_horses >= 1 else []
            top3 = _top_umaban(df_r, 3) if n_horses >= 1 else []
            top4 = _top_umaban(df_r, 4) if n_horses >= 1 else []
            if n_horses <= 13:
                topN_tri = max(3, (n_horses + 1) // 2)
            else:
                topN_tri = 7
            top_tri = _top_umaban(df_r, topN_tri) if n_horses >= 1 else []

            actions_by_ticket['tansho'][rid] = {'tansho': top2}
            actions_by_ticket['fukusho'][rid] = {'fukusho': top3}
            actions_by_ticket['umaren'][rid] = {'umaren': top4 if len(top4) >= 2 else []}
            actions_by_ticket['umatan'][rid] = {'umatan': top4 if len(top4) >= 2 else []}
            actions_by_ticket['wide'][rid] = {'wide': top4 if len(top4) >= 2 else []}
            actions_by_ticket['sanrenpuku'][rid] = {'sanrenpuku': top_tri if len(top_tri) >= 3 else []}
            actions_by_ticket['sanrentan'][rid] = {'sanrentan': top_tri if len(top_tri) >= 3 else []}
        return actions_by_ticket

actions_by_ticket_20251220 = build_actions_by_ticket(score_table_20251220)
print('tickets:', list(actions_by_ticket_20251220.keys()))
print('races in actions:', len(actions_by_ticket_20251220['tansho']))

rows = []
detail_by_ticket_20251220 = {}
for ticket, actions_ticket in actions_by_ticket_20251220.items():
    returns_per_race = simulator.calc_returns_per_race(actions_ticket)
    returns = simulator.calc_returns(actions_ticket)
    detail_by_ticket_20251220[ticket] = returns_per_race.sort_index()
    skipped_races = len(actions_ticket) - returns_per_race.index.nunique()
    rows.append({
        'ticket': ticket,
        'n_races_target': len(actions_ticket),
        'n_races_in_return_tables': returns_per_race.index.nunique(),
        'n_races_skipped': skipped_races,
        **returns,
    })

summary_20251220 = pd.DataFrame(rows).sort_values('ticket').reset_index(drop=True)
display(summary_20251220)


tickets: ['tansho', 'fukusho', 'umaren', 'umatan', 'wide', 'sanrenpuku', 'sanrentan']
races in actions: 36


Unnamed: 0,ticket,n_races_target,n_races_in_return_tables,n_races_skipped,n_bets,n_races,n_hits,total_bet_amount,return_rate,std
0,fukusho,36,36,0,108,36,20,108,0.705556,0.194109
1,sanrenpuku,36,36,0,1149,36,4,1149,0.399826,0.285104
2,sanrentan,36,36,0,6894,36,4,6894,0.428706,0.345839
3,tansho,36,36,0,72,36,6,72,0.506944,0.264291
4,umaren,36,36,0,216,36,4,216,1.369444,1.304301
5,umatan,36,36,0,432,36,4,432,1.302315,1.218587
6,wide,36,36,0,216,36,6,216,0.467593,0.319858


In [24]:
# --- 5) 払戻（return_tables）欠損の確認＆必要なら補完 ---
import pandas as pd

from modules.constants import LocalPaths
from modules.preparing._scrape_html import scrape_html_race
from modules.preparing._get_rawdata import get_rawdata_return, update_rawdata
from modules.preprocessing._return_processor import ReturnProcessor
from modules.simulation._simulator import Simulator

raw_return_tables = return_processor.raw_data
if getattr(raw_return_tables.index, 'nlevels', 1) > 1:
    existing_race_ids = set(raw_return_tables.index.get_level_values(0).astype(str))
else:
    existing_race_ids = set(raw_return_tables.index.astype(str))

missing_race_ids_20251220 = sorted(set(map(str, race_id_list_20251220)) - existing_race_ids)
print(f'missing race_id in return_tables (20251220): {len(missing_race_ids_20251220)}')
if len(missing_race_ids_20251220) > 0:
    display(pd.Series(missing_race_ids_20251220, name='missing_race_id').head(20))

    updated_html_paths = scrape_html_race(missing_race_ids_20251220, skip=False)
    new_return_df = get_rawdata_return(updated_html_paths)
    _ = update_rawdata(LocalPaths.RAW_RETURN_TABLES_PATH, new_return_df, mode='update')

    return_processor = ReturnProcessor(LocalPaths.RAW_RETURN_TABLES_PATH)
    simulator = Simulator(return_processor)

    print('return_tables updated. 必要なら、上の回収率集計セルを再実行してください。')
else:
    print('欠損はありません（このまま回収率集計結果を採用できます）。')


missing race_id in return_tables (20251220): 0
欠損はありません（このまま回収率集計結果を採用できます）。
