# 1. モジュールインポート

In [2]:
import pandas as pd
import glob
import os
import datetime
from tqdm.auto import tqdm
from modules.constants import Master
from modules.constants import LocalPaths
from modules.constants import HorseResultsCols
from modules.constants import ResultsCols
from modules import preparing
from modules import preprocessing
from modules import training
from modules import simulation
from modules import policies
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


標準的な土日競馬開催時の運用スケジュールは、以下の表の通り。

|曜日|時刻|内容|実行する main.ipynb の項番|備考|
|:-:|:--|:--|:--|:--|
|月|||||
|火|||||
|水|16:30過ぎ|先週土日の馬の過去成績ページ確定<BR>（netkeiba.comﾌﾟﾚﾐｱｻｰﾋﾞｽのﾀｲﾑ指数・ﾚｰｽ分析・注目馬 ﾚｰｽ後の短評情報確定）|2. データ取得 ～ 5. シミュレーション|3日間開催の場合も、水曜日|
|木|||||
|金|10:05過ぎ<BR>19:25過ぎ|土曜の出馬表確定<BR>土曜の天候・馬場状態更新|6.1. 前日準備 ～ 6.2. 前日全レース予想（天候・馬場状態は手動設定）<BR>6.1. 前日準備 ～ 6.2. 前日全レース予想||
|土|09:00～17:00<BR>10:05過ぎ<BR>19:25過ぎ| レース時刻<BR>日曜の出馬表確定<BR>日曜の天候・馬場状態更新|6.3. レース直前データ処理（当日レース予想）<BR>6.1. 前日準備 ～ 6.2. 前日全レース予想（天候・馬場状態は手動設定）<BR>6.1. 前日準備 ～ 6.2. 前日全レース予想||
|日|09:00～17:00|レース時刻|6.3. レース直前データ処理（当日レース予想）||

# 2. データ取得

## 2.1. レースID取得
例として、2023年1月～2025年9月のレースデータを取得する場合を考える（`to_` に指定した月は取得対象に含まれない）。

In [3]:
%autoreload

In [4]:
# Fetch the race days. Note that the month given in to_ is itself not included.
kaisai_date_2025 = preparing.scrape_kaisai_date(from_="2023-01-01", to_="2025-10-01")
len(kaisai_date_2025)

getting race date from 2023-01-01 to 2025-10-01


  date_range = pd.date_range(start=from_, end=to_, freq="M")


  0%|          | 0/33 [00:00<?, ?it/s]

293

In [5]:
# Get the race IDs for the race days held in kaisai_date_2025.
race_id_list = preparing.scrape_race_id_list(kaisai_date_2025)
len(race_id_list)

getting race_id_list


  0%|          | 0/293 [00:00<?, ?it/s]

scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230105
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230107
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230108
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230109
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230114
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230115
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230121
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230122
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230128
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230129
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230204
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230205
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230211
scraping: ht

9478

## 2.2. /race/ディレクトリのデータ取得

In [6]:
# Scrape the https://db.netkeiba.com/race/ pages and save them as HTML (.bin files).
# NOTE(review): skip=True presumably skips race_ids that are already saved — confirm in preparing.
html_files_race = preparing.scrape_html_race(race_id_list, skip=True)
html_files_race[:5]

  0%|          | 0/9478 [00:00<?, ?it/s]

race_id 202306010101 skipped
race_id 202306010102 skipped
race_id 202306010103 skipped
race_id 202306010104 skipped
race_id 202306010105 skipped
race_id 202306010106 skipped
race_id 202306010107 skipped
race_id 202306010108 skipped
race_id 202306010109 skipped
race_id 202306010110 skipped
race_id 202306010111 skipped
race_id 202306010112 skipped
race_id 202307010101 skipped
race_id 202307010102 skipped
race_id 202307010103 skipped
race_id 202307010104 skipped
race_id 202307010105 skipped
race_id 202307010106 skipped
race_id 202307010107 skipped
race_id 202307010108 skipped
race_id 202307010109 skipped
race_id 202307010110 skipped
race_id 202307010111 skipped
race_id 202307010112 skipped
race_id 202306010201 skipped
race_id 202306010202 skipped
race_id 202306010203 skipped
race_id 202306010204 skipped
race_id 202306010205 skipped
race_id 202306010206 skipped
race_id 202306010207 skipped
race_id 202306010208 skipped
race_id 202306010209 skipped
race_id 202306010210 skipped
race_id 202306

[]

In [7]:
# Build the list of race-page HTML snapshots (.bin files) saved under data/html/race/.
import glob
import os

# Directory that holds the scraped race pages, taken from LocalPaths.
race_html_dir = LocalPaths.HTML_RACE_DIR
print(f"レースHTMLディレクトリ: {race_html_dir}")

# glob returns matches in arbitrary order; sort them so that [0] / [-1] and the
# preview below are deterministic across platforms and runs.
html_files_race = sorted(glob.glob(os.path.join(race_html_dir, "*.bin")))
print(f"見つかったHTMLファイル数: {len(html_files_race)}")

# Show the first five paths.
html_files_race[:5]

レースHTMLディレクトリ: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race
見つかったHTMLファイル数: 9478


['c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\race\\202301010101.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\race\\202301010102.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\race\\202301010103.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\race\\202301010104.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\race\\202301010105.bin']

In [8]:
# Sanity check of html_files_race before the raw tables are built from it.
print(f"html_files_race変数の状態:")
print(f"  タイプ: {type(html_files_race)}")
print(f"  サイズ: {len(html_files_race)}")
if html_files_race:
    # os.path.basename works with the platform's own separator, and keeping the
    # backslash out of the f-string expression keeps the cell valid before Python 3.12.
    first_name = os.path.basename(html_files_race[0])
    last_name = os.path.basename(html_files_race[-1])
    print(f"  範囲: {first_name} ～ {last_name}")
else:
    # An empty list has no first/last element to report.
    print("  範囲: (ファイルなし)")

# With this in place, get_rawdata_results can be run in the next cell.

html_files_race変数の状態:
  タイプ: <class 'list'>
  サイズ: 9478
  範囲: 202301010101.bin ～ 202510020812.bin


In [9]:
# Build the three raw tables from the saved race-page HTML files.
results_new = preparing.get_rawdata_results(html_files_race) # build the race results table
race_info_new = preparing.get_rawdata_info(html_files_race) # build the race info table
return_tables_new = preparing.get_rawdata_return(html_files_race) # build the payout (return) table

preparing raw results table


  0%|          | 0/9478 [00:00<?, ?it/s]

error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202403020501.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202403020504.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202403020504.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404010604.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404010605.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404010604.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404010605.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404030601.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404030601.bin
list index out of range
e

  0%|          | 0/9478 [00:00<?, ?it/s]

preparing raw return table


  0%|          | 0/9478 [00:00<?, ?it/s]

error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202403020501.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202403020504.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404010604.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404010605.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404010604.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404010605.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404030601.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404030701.bin
list index out of range
error at c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\race\202404030704.bin
list index out of range
e

In [10]:
# Update the stored tables. If a table does not exist yet, it is newly created.
preparing.update_rawdata(filepath=LocalPaths.RAW_RESULTS_PATH, new_df=results_new)
preparing.update_rawdata(filepath=LocalPaths.RAW_RACE_INFO_PATH, new_df=race_info_new)
preparing.update_rawdata(filepath=LocalPaths.RAW_RETURN_TABLES_PATH, new_df=return_tables_new)

## 2.x. 生成済み raw テーブル確認
`data/raw` に保存された各pickleの基本情報を表示します。存在しない場合はスキップします。

In [None]:
# Overview of the pickle tables in the raw-data directory, plus null-ratio statistics.
import os
import pathlib
import pandas as pd
import datetime as dt
from modules.constants import LocalPaths

# Take the raw-data directory from LocalPaths (the NaN-check cell reads
# horse_info.pickle from the same LocalPaths.RAW_DIR) so the result does not depend
# on the kernel's current working directory.
RAW_DIR = pathlib.Path(LocalPaths.RAW_DIR)

# Column order of the summary table.
SUMMARY_COLUMNS = [
    'file', 'rows', 'cols', 'size_MB', 'memory_MB', 'overall_null_pct',
    'max_col_null_pct', 'mean_col_null_pct', 'median_col_null_pct',
    'sample_cols', 'date_range',
]
# Fields reported as 'ERR' when a pickle cannot be loaded or summarised.
ERR_FIELDS = [
    'rows', 'cols', 'memory_MB', 'overall_null_pct',
    'max_col_null_pct', 'mean_col_null_pct', 'median_col_null_pct',
]
# LocalPaths attributes whose target files are checked at the end.
MAIN_PATH_NAMES = [
    'RAW_RESULTS_PATH', 'RAW_RACE_INFO_PATH', 'RAW_RETURN_TABLES_PATH',
    'RAW_HORSE_INFO_PATH', 'RAW_HORSE_RESULTS_PATH', 'RAW_PEDS_PATH',
]


def _path_key(path):
    """Return a normalised absolute path used to match a LocalPaths entry to a scanned file."""
    return os.path.normcase(os.path.abspath(path))


def _first_date_range(df):
    """Return 'col:min→max' for the first column whose name contains 'date' and parses, else ''."""
    for col in df.columns:
        # str() so that non-string column labels do not abort the whole summary.
        if 'date' not in str(col).lower():
            continue
        try:
            parsed = pd.to_datetime(df[col], errors='coerce')
            if parsed.notna().any():
                return f"{col}:{parsed.min().date()}→{parsed.max().date()}"
        except Exception:
            # Deliberate best effort: a column that cannot be parsed is just skipped.
            continue
    return ''


def _summarize_pickle(path):
    """Summarise one pickle file.

    Returns (info, null_rows). `info` is one row of the summary table; `null_rows`
    lists up to ten columns with the highest null ratio. If loading or summarising
    fails, the fields in ERR_FIELDS are set to 'ERR' and `null_rows` is empty.
    """
    info = {'file': path.name, 'size_MB': round(path.stat().st_size / 1_000_000, 3)}
    try:
        # NOTE: unpickling can execute arbitrary code; only use on files this project wrote.
        df = pd.read_pickle(path)
        is_null = df.isna()
        # Per-column null ratio in percent, highest first.
        col_null_pct = (is_null.mean() * 100).sort_values(ascending=False)
        has_cols = not col_null_pct.empty
        info.update({
            'rows': len(df),
            'cols': df.shape[1],
            'memory_MB': round(df.memory_usage(deep=True).sum() / 1_000_000, 3),
            # Up to five column names as a sample.
            'sample_cols': ', '.join(str(c) for c in df.columns[:5]),
            'date_range': _first_date_range(df),
            # Null ratio over all cells; 0.0 for a table without cells.
            'overall_null_pct': round((is_null.sum().sum() / df.size) * 100, 2) if df.size else 0.0,
            'max_col_null_pct': round(col_null_pct.iloc[0], 2) if has_cols else 0.0,
            'mean_col_null_pct': round(col_null_pct.mean(), 2) if has_cols else 0.0,
            'median_col_null_pct': round(col_null_pct.median(), 2) if has_cols else 0.0,
        })
        null_rows = [
            {'file': path.name, 'column': col, 'null_pct': round(pct, 2)}
            for col, pct in col_null_pct.head(10).items()
        ]
    except Exception as e:
        info.update({field: 'ERR' for field in ERR_FIELDS})
        info['sample_cols'] = f'load error: {e.__class__.__name__}'
        info['date_range'] = ''
        null_rows = []
    return info, null_rows


if not RAW_DIR.exists():
    print(f'ディレクトリが存在しません: {RAW_DIR.resolve()}')
else:
    pickle_files = sorted(RAW_DIR.glob('*.pickle'))
    if not pickle_files:
        print('pickleファイルが見つかりません。先に取得処理を実行してください。')
    else:
        summaries = []
        null_detail_rows = []  # per-column null ratio details
        row_counts = {}        # row counts of files already loaded, keyed by normalised path
        for p in pickle_files:
            info, null_rows = _summarize_pickle(p)
            summaries.append(info)
            null_detail_rows.extend(null_rows)
            if isinstance(info['rows'], int):
                row_counts[_path_key(p)] = info['rows']

        summary_df = pd.DataFrame(summaries)[SUMMARY_COLUMNS]
        display(summary_df)

        if null_detail_rows:
            null_detail_df = pd.DataFrame(null_detail_rows)
            display(null_detail_df)

        # Existence and row count of the files the main LocalPaths entries point to
        # (entries that are missing are listed as well).
        path_rows = []
        for name in MAIN_PATH_NAMES:
            path = getattr(LocalPaths, name, None)
            if path is None:
                path_rows.append({'name': name, 'path': None, 'exists': False, 'rows': None})
                continue
            exists = os.path.isfile(path)
            rows = None
            if exists:
                key = _path_key(path)
                if key in row_counts:
                    # Already loaded during the scan above; do not unpickle it again.
                    rows = row_counts[key]
                else:
                    try:
                        rows = len(pd.read_pickle(path))
                    except Exception:
                        rows = 'ERR'
            path_rows.append({'name': name, 'path': path, 'exists': exists, 'rows': rows})
        display(pd.DataFrame(path_rows))

In [27]:
# Load the already-saved results table and take a list of horse_ids for testing.
# NOTE(review): this rebinds results_new, which the raw-table cell earlier assigned from
# get_rawdata_results — confirm that replacing it with the saved table is intended.
results_new = pd.read_pickle(LocalPaths.RAW_RESULTS_PATH)
print(f"results_new loaded: {results_new.shape}")

# Build the test list from the first 10 unique horse_ids.
horse_id_list = results_new['horse_id'].unique()
horse_id_test_list = horse_id_list[:10]
print(f"テスト用horse_id: {horse_id_test_list}")

results_new loaded: (129472, 17)
テスト用horse_id: ['2021101429' '2021105872' '2021106854' '2021105553' '2021100648'
 '2021100159' '2021100265' '2021103762' '2020106547' '2020100879']


## 2.3. /horse/ディレクトリのデータ取得

In [11]:
%autoreload

In [16]:
# Find the horses that need re-scraping because a master file contains NaN values.
import pandas as pd
import os
from modules.constants import LocalPaths

# Master CSV checked for each ID type.
master_files = {
    'horse_id': 'horse_id.csv',
    'jockey_id': 'jockey_id.csv',
    'trainer_id': 'trainer_id.csv',
    'owner_id': 'owner_id.csv',
    'breeder_id': 'breeder_id.csv'
}

# horse_info column inspected for each non-horse ID type.
col_mapping = {
    'jockey_id': '騎手',
    'trainer_id': '調教師',
    'owner_id': '馬主',
    'breeder_id': '生産者'
}

nan_horse_ids = set()
print("=== マスターファイルのNaN値チェック ===")

# Load horse_info.pickle; without it the NaN rows can only be counted, not traced to horses.
try:
    horse_info = pd.read_pickle(os.path.join(LocalPaths.RAW_DIR, 'horse_info.pickle'))
    print(f"horse_info.pickle読み込み完了: {len(horse_info)}頭の馬データ")
except Exception as e:
    print(f"horse_info.pickleの読み込みエラー: {e}")
    horse_info = None

# Check every master file.
for master_type, filename in master_files.items():
    filepath = os.path.join(LocalPaths.MASTER_DIR, filename)
    if not os.path.exists(filepath):
        print(f"{master_type}: ファイルが存在しません")
        continue

    df = pd.read_csv(filepath)
    id_col = df.columns[0]  # the first column is treated as the ID column

    # Rows whose ID is NaN.
    nan_rows = df[df[id_col].isna()]
    if len(nan_rows) == 0:
        continue
    print(f"\n{master_type}: {len(nan_rows)}個のNaN値を発見")

    if horse_info is None:
        continue

    if master_type == 'horse_id':
        # The horse_id itself is NaN (normally this does not happen).
        print(f"  -> horse_idが直接NaNになっている行: {len(nan_rows)}")
        continue

    target_col = col_mapping.get(master_type)
    if target_col is None or target_col not in horse_info.columns:
        continue

    # Horses whose value in that column is NaN, empty, or the literal string 'nan'.
    values = horse_info[target_col]
    nan_horses = horse_info[values.isna() | (values == '') | (values == 'nan')]
    nan_horse_ids.update(nan_horses.index)
    print(f"  -> {target_col}がNaN/空の馬: {len(nan_horses)}頭")
    if len(nan_horses) > 0:
        print(f"      例: {list(nan_horses.index)[:5]}")

# Always (re)define both names so later cells never see a stale list from an earlier
# run and never hit a NameError when nothing was found.
nan_horse_ids_list = sorted(nan_horse_ids)
re_scrape_needed = len(nan_horse_ids_list) > 0

print(f"\n=== 再スクレイピング対象の特定結果 ===")
print(f"再スクレイピングが必要な馬ID数: {len(nan_horse_ids)}")

if re_scrape_needed:
    print(f"対象馬ID例: {nan_horse_ids_list[:10]}{'...' if len(nan_horse_ids_list) > 10 else ''}")

    print(f"\n=== 再スクレイピング実行オプション ===")
    print("以下の変数を設定して次のセルで実行してください：")
    print("re_scrape_horses = True  # この行のコメントアウトを外して実行")
    print("target_horse_ids = nan_horse_ids_list  # 対象馬IDリスト")
else:
    print("再スクレイピングが必要な馬は見つかりませんでした。")

=== マスターファイルのNaN値チェック ===
horse_info.pickle読み込み完了: 19856頭の馬データ

breeder_id: 1個のNaN値を発見
  -> 生産者がNaN/空の馬: 0頭

=== 再スクレイピング対象の特定結果 ===
再スクレイピングが必要な馬ID数: 0
再スクレイピングが必要な馬は見つかりませんでした。


In [None]:
# Build a list of the first 10 horses in horse_id_list and run a scraping test on them.
horse_id_list = results_new['horse_id'].unique()
horse_id_test_list = horse_id_list[:10]  # test with the first 10 horses

print(f"全体の馬数: {len(horse_id_list)}")
print(f"テスト対象の馬数: {len(horse_id_test_list)}")
print(f"テスト対象馬ID: {horse_id_test_list}")

# Scrape the HTML.
# Set skip=True to skip horses that have already been scraped.
# Set skip=False to refresh already-scraped horses with their newly run races.
html_files_horse = preparing.scrape_html_horse_with_master(
    horse_id_test_list, skip=False
    )

In [25]:
# Number of pages newly scraped in addition (length of the list the scrape call returned).
len(html_files_horse)

19856

In [26]:
### Run this cell to get the paths of already-saved HTML without running the scrape function ###
import bisect

target_date = '2025-09-20'  # the date on which the scraping was done
# Load the update master.
update_master = pd.read_csv(
    LocalPaths.MASTER_RAW_HORSE_RESULTS_PATH,
    dtype=object
    )
# Keep only the horse_ids scraped on target_date. (The mask is not called `filter`,
# which would shadow the builtin for the rest of the notebook.)
scraped_on_target = (
    pd.to_datetime(update_master['updated_at']).dt.strftime('%Y-%m-%d') == target_date
)
horse_id_list = update_master.loc[scraped_on_target, 'horse_id'].dropna()

# Scan the directory once instead of running one glob per horse (each glob re-lists
# the whole directory). Sorted names let bisect find the first file name that starts
# with a given horse_id, which is the same prefix match as the pattern horse_id+'*.bin'.
saved_names = sorted(
    name for name in os.listdir(LocalPaths.HTML_HORSE_DIR) if name.endswith('.bin')
)

# Collect the .bin paths; horses without a saved file are reported instead of
# aborting the cell with an IndexError.
html_files_horse = []
missing_horse_ids = []
for horse_id in tqdm(horse_id_list):
    pos = bisect.bisect_left(saved_names, horse_id)
    if pos < len(saved_names) and saved_names[pos].startswith(horse_id):
        html_files_horse.append(os.path.join(LocalPaths.HTML_HORSE_DIR, saved_names[pos]))
    else:
        missing_horse_ids.append(horse_id)

if missing_horse_ids:
    print(f"HTMLが見つからないhorse_id: {len(missing_horse_ids)}件 例: {missing_horse_ids[:10]}")
html_files_horse[:5]

  0%|          | 0/5502 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [13]:
# マスターファイルにNaN値が含まれているものだけ再スクレイピングする


In [17]:
# Re-scrape only the horses flagged by the master-file NaN check.
# To run it, uncomment the line below.
# re_scrape_horses = True
import time
from tqdm.auto import tqdm  # same tqdm as the import cell, so it is not shadowed for later cells

# At cell top level the notebook namespace is globals(); .get() treats a flag that has
# not been defined yet like one that is not set.
rescrape_requested = bool(globals().get('re_scrape_horses', False))
rescrape_needed = globals().get('re_scrape_needed')  # stays None until the check cell has run

if rescrape_requested and rescrape_needed:
    print("=== NaN値を持つ馬の再スクレイピング開始 ===")
    print(f"対象馬数: {len(nan_horse_ids_list)}")

    success_count = 0
    failed_ids = []

    for i, horse_id in enumerate(tqdm(nan_horse_ids_list, desc="再スクレイピング")):
        try:
            # Scrape this horse's page. The previously called scrape_horse_html is not
            # defined or imported anywhere in this notebook, so every horse ended up in
            # failed_ids with a NameError.
            # NOTE(review): uses the scraper from the test-scrape cell (list of horse_ids,
            # skip=False to force a refresh) — confirm this is the intended function.
            preparing.scrape_html_horse_with_master([horse_id], skip=False)
            success_count += 1

            # Pause briefly every 10 horses to reduce the load on the server.
            if i % 10 == 0 and i > 0:
                time.sleep(1)

        except Exception as e:
            # Keep going with the remaining horses and report the failures at the end.
            print(f"エラー - 馬ID {horse_id}: {e}")
            failed_ids.append(horse_id)
            continue

    print(f"\n=== 再スクレイピング完了 ===")
    print(f"成功: {success_count}頭")
    print(f"失敗: {len(failed_ids)}頭")

    if failed_ids:
        print(f"失敗した馬ID: {failed_ids[:10]}{'...' if len(failed_ids) > 10 else ''}")

    print("\n再スクレイピング完了後は、該当セクションのデータ処理を再実行してください。")

elif rescrape_needed is not None and not rescrape_needed:
    print("再スクレイピングが必要な馬は見つかりませんでした。")

else:
    print("再スクレイピングを実行するには以下を設定してください：")
    print("1. 上のセルを実行してNaN馬を特定")
    print("2. 're_scrape_horses = True' のコメントアウトを外す")
    print("3. このセルを再実行")

再スクレイピングが必要な馬は見つかりませんでした。


In [18]:
# Re-process the data after re-scraping.
# Run this only after the re-scrape cell has finished.
# reprocess_data = True  # uncomment to run

if 'reprocess_data' in locals() and reprocess_data:
    print("=== 再スクレイピング後のデータ再処理開始 ===")

    horse_info_path = os.path.join(LocalPaths.RAW_DIR, 'horse_info.pickle')

    # 1. Re-extract the horse info from the newly scraped HTML.
    print("1. 馬情報の再抽出...")
    from modules.preprocessing._horse_info_processor import HorseInfoProcessor

    # Only the pages of the re-scraped horses are processed. If the NaN-check cell has
    # not defined nan_horse_ids_list, treat it as "no targets" instead of raising NameError.
    target_horse_id_list = globals().get('nan_horse_ids_list', [])
    candidate_files = (
        os.path.join(LocalPaths.HTML_HORSE_DIR, f"{target_id}.bin")
        for target_id in target_horse_id_list
    )
    html_files_horse_new = [path for path in candidate_files if os.path.exists(path)]

    print(f"再処理対象HTMLファイル数: {len(html_files_horse_new)}")

    if len(html_files_horse_new) > 0:
        # Re-process the horse info.
        horse_info_processor = HorseInfoProcessor(html_files_horse_new)
        horse_info_new = horse_info_processor.scrape_horse_info()

        # Merge the new rows into the existing horse info.
        try:
            horse_info_existing = pd.read_pickle(horse_info_path)
            # Overwrite (or add) one row per re-scraped horse.
            horse_info_updated = horse_info_existing.copy()
            for horse_id in horse_info_new.index:
                horse_info_updated.loc[horse_id] = horse_info_new.loc[horse_id]

            # Write a backup of the previous table before replacing it.
            horse_info_existing.to_pickle(os.path.join(LocalPaths.RAW_DIR, 'horse_info.pickle.bak'))

            # Save the updated table.
            horse_info_updated.to_pickle(horse_info_path)

            print(f"馬情報更新完了: {len(horse_info_new)}頭の情報を更新")

        except Exception as e:
            print(f"馬情報の更新エラー: {e}")

    # 2. Regenerate the master files.
    print("\n2. マスターファイルの再生成...")

    # Extract each kind of ID from horse_info.pickle and extend the master files.
    try:
        horse_info_updated = pd.read_pickle(horse_info_path)

        # master file -> (source column in horse_info, name of the ID column)
        id_columns = {
            'horse_id.csv': ('index', 'horse_id'),  # the index is the horse_id
            'jockey_id.csv': ('騎手', 'jockey_id'),
            'trainer_id.csv': ('調教師', 'trainer_id'),
            'owner_id.csv': ('馬主', 'owner_id'),
            'breeder_id.csv': ('生産者', 'breeder_id')
        }

        for master_file, (col_name, id_type) in id_columns.items():
            master_path = os.path.join(LocalPaths.MASTER_DIR, master_file)

            if col_name == 'index':
                # horse_id: taken from the index.
                unique_ids = horse_info_updated.index.dropna().unique()
            elif col_name in horse_info_updated.columns:
                # Other IDs: taken from the column, dropping NaN-like strings.
                unique_ids = horse_info_updated[col_name].dropna().unique()
                unique_ids = [str(x) for x in unique_ids if str(x) not in ['nan', 'NaN', '']]
            else:
                continue

            # Load the existing master file. Everything is read as text: letting pandas
            # infer numbers would strip leading zeros from IDs, so known IDs would no
            # longer match and would be appended again as "new".
            if os.path.exists(master_path):
                existing_master = pd.read_csv(master_path, dtype=str)
                existing_ids = set(existing_master.iloc[:, 0].dropna())
            else:
                existing_ids = set()
                existing_master = pd.DataFrame(columns=[id_type, 'encoded_id'])

            # IDs that are not in the master yet.
            new_ids = [candidate for candidate in unique_ids if str(candidate) not in existing_ids]

            if new_ids:
                # Continue numbering after the largest valid encoded_id. Blank or
                # non-numeric entries are ignored, and int() keeps range() working even
                # when the column would otherwise come back as float.
                encoded = pd.to_numeric(existing_master['encoded_id'], errors='coerce').dropna()
                next_encoded = int(encoded.max()) + 1 if not encoded.empty else 0
                new_encoded = list(range(next_encoded, next_encoded + len(new_ids)))

                # Rows for the new IDs.
                new_entries = pd.DataFrame({
                    id_type: new_ids,
                    'encoded_id': new_encoded
                })

                # Append them and write the master file back.
                updated_master = pd.concat([existing_master, new_entries], ignore_index=True)
                updated_master.to_csv(master_path, index=False)

                print(f"{master_file}: {len(new_ids)}個の新しいIDを追加")

        print("\nマスターファイル再生成完了！")
        print("これで特徴量エンジニアリングを再実行できます。")

    except Exception as e:
        print(f"マスターファイル再生成エラー: {e}")

else:
    print("データ再処理を実行するには 'reprocess_data = True' を設定してください")

データ再処理を実行するには 'reprocess_data = True' を設定してください


## 再スクレイピング機能の使い方

### 手順:
1. **NaN値検出**: 「マスターファイルにNaN値が含まれているものだけ再スクレイピングする」セルを実行してマスターファイル内のNaN値を特定
2. **再スクレイピング実行**: 「再スクレイピング実行」セルで `re_scrape_horses = True` のコメントアウトを外して実行
3. **データ再処理**: 「再スクレイピング後のデータ再処理」セルで `reprocess_data = True` のコメントアウトを外して実行

### 注意事項:
- 再スクレイピングには時間がかかる場合があります
- サーバー負荷軽減のため、適度な間隔で実行されます
- バックアップファイルが自動生成されます

In [19]:
# Build the list of horse-page HTML snapshots (.bin files) saved under data/html/horse/.

import glob
import os

horse_html_dir = LocalPaths.HTML_HORSE_DIR
print(f"馬HTMLディレクトリ: {horse_html_dir}")

# glob returns matches in arbitrary order; sort them so that html_files_horse[0]
# (used by the debugging cell below) is the same file on every run and platform.
html_files_horse = sorted(glob.glob(os.path.join(horse_html_dir, "*.bin")))
print(f"見つかったHTMLファイル数: {len(html_files_horse)}")

# Show the first five paths.
html_files_horse[:5]


馬HTMLディレクトリ: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse
見つかったHTMLファイル数: 19856


['c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\horse\\2011106610.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\horse\\2012100683.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\horse\\2012103532.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\horse\\2012104463.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\horse\\2012104504.bin']

In [20]:
# Build the horse basic-info table (using the corrected function).
# %autoreload is run first so that the latest edits to the project modules are picked up.
%autoreload
horse_info_new = preparing.get_rawdata_horse_info(html_files_horse)

preparing raw horse_info table


  0%|          | 0/19856 [00:00<?, ?it/s]

In [None]:
# Debugging get_rawdata_horse_info: inspect a single saved horse page in detail.
import pandas as pd
import re
from io import StringIO
from bs4 import BeautifulSoup
from numpy import nan as NaN

# Pick one HTML file for the test.
test_html_file = html_files_horse[0]
print(f"テスト対象ファイル: {test_html_file}")

# Read the raw bytes.
with open(test_html_file, 'rb') as f:
    html = f.read()

# The horse_id is the run of digits in the file name.
horse_id = re.findall(r'horse\W(\d+)\.bin', test_html_file)[0]
print(f"horse_id: {horse_id}")

# File size.
print(f"HTMLファイルサイズ: {len(html)} bytes")

# Try the candidate encodings one after another and keep the first that decodes
# cleanly. Previously the loop gave up after the first failure and never reached the
# fallbacks. UTF-8 goes first because strict UTF-8 decoding rejects non-UTF-8 bytes,
# while the legacy Japanese codecs can accept UTF-8 bytes and produce mojibake.
text = None
for enc in ('utf-8', 'euc-jp', 'cp932'):
    try:
        text = html.decode(enc)
    except UnicodeDecodeError as e:
        print(f"デコード失敗 {enc}: {e}")
        continue
    print(f"デコード成功: {enc}")
    break

if text is None:
    # Last resort once every candidate has failed: drop the undecodable bytes.
    enc = 'utf-8'
    text = html.decode(enc, errors='ignore')
    print(f"エラー無視でデコード: {enc}")

# Parse with BeautifulSoup.
soup = BeautifulSoup(text, 'lxml')
print(f"BeautifulSoup解析完了")

# Look for the profile table.
prof_table = (
    soup.select_one('table.db_prof_table[summary*="プロフィール"]')
    or soup.find('table', attrs={'summary': re.compile('プロフィール')})
)

if prof_table:
    print("プロフィールテーブル発見!")
    print(f"テーブルHTML（最初の500文字）: {str(prof_table)[:500]}")

    # Read the table. The markup is wrapped in StringIO because passing literal HTML
    # to pd.read_html is deprecated.
    try:
        df = pd.read_html(StringIO(str(prof_table)), flavor='lxml')[0]
        print(f"DataFrame形状: {df.shape}")
        print(f"DataFrame列名: {list(df.columns)}")
        print("DataFrame内容:")
        display(df)

        # Expect a two-column (item, value) layout; transpose it into a single row.
        if df.shape[1] >= 2:
            df = df.iloc[:, :2]
            df.columns = ['項目', '値']
            df_info = df.set_index('項目').T
            print("転置後:")
            display(df_info)
        else:
            print(f"プロフィール表の列数が想定外: {df.shape[1]}")

    except Exception as e:
        print(f"pd.read_htmlエラー: {e}")
else:
    print("プロフィールテーブルが見つかりません")
    # Fallback: list every table on the page.
    print("すべてのテーブルを確認:")
    tables = soup.find_all('table')
    for i, table in enumerate(tables):
        attrs = table.attrs
        print(f"テーブル {i+1}: {attrs}")
        if i < 3:  # look into the content of the first three tables only
            try:
                df_temp = pd.read_html(StringIO(str(table)))[0]
                print(f"  形状: {df_temp.shape}, 列名: {list(df_temp.columns[:3])}")
                if len(df_temp) > 0:
                    print(f"  最初の行: {df_temp.iloc[0].values[:3]}")
            except Exception as e:
                print(f"  読み込みエラー: {e}")

# Check the ID extraction.
def extract_id(selector, pattern):
    """Return group 1 of `pattern` matched against the href of the first element
    selected by `selector` in the module-level `soup`, or NaN when there is no such
    element, no href, or no match."""
    a = soup.select_one(selector)
    if a and a.has_attr('href'):
        m = re.search(pattern, a['href'])
        if m:
            return m.group(1)
    return NaN

trainer_id = extract_id('a[href^="/trainer/"]', r'/trainer/([^/]+)/')
owner_id   = extract_id('a[href^="/owner/"]',   r'/owner/([^/]+)/')
breeder_id = extract_id('a[href^="/breeder/"]', r'/breeder/([^/]+)/')

print(f"trainer_id: {trainer_id}")
print(f"owner_id: {owner_id}")
print(f"breeder_id: {breeder_id}")

# Check the related links.
trainer_links = soup.select('a[href^="/trainer/"]')
owner_links = soup.select('a[href^="/owner/"]')
breeder_links = soup.select('a[href^="/breeder/"]')

print(f"調教師リンク数: {len(trainer_links)}")
print(f"馬主リンク数: {len(owner_links)}")
print(f"生産者リンク数: {len(breeder_links)}")

if trainer_links:
    print(f"調教師リンク例: {trainer_links[0].get('href')}")
if owner_links:
    print(f"馬主リンク例: {owner_links[0].get('href')}")
if breeder_links:
    print(f"生産者リンク例: {breeder_links[0].get('href')}")

In [None]:
# Detailed investigation of the encoding problem (without chardet):
# try each candidate encoding by hand on the raw bytes in `html`, which the
# single-file debugging cell read from disk.
import re
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

encodings_to_try = ['utf-8', 'euc-jp', 'cp932', 'shift_jis', 'iso-2022-jp']

successful_encoding = None
for encoding in encodings_to_try:
    try:
        text_decoded = html.decode(encoding)
        print(f"デコード成功: {encoding}")

        # Parse with BeautifulSoup.
        soup_test = BeautifulSoup(text_decoded, 'lxml')

        # Look for the profile table, by class first and then by its summary attribute.
        prof_table_test = (
            soup_test.find('table', class_='db_prof_table') or
            soup_test.find('table', attrs={'summary': re.compile('プロフィール')}) or
            soup_test.select_one('table[summary*="プロフィール"]')
        )

        if prof_table_test:
            print(f"  → プロフィールテーブル発見！エンコーディング: {encoding}")
            print(f"  → テーブル属性: {prof_table_test.attrs}")

            # Check the table content. The markup is wrapped in StringIO because
            # passing literal HTML to pd.read_html is deprecated.
            try:
                df_test = pd.read_html(StringIO(str(prof_table_test)))[0]
                print(f"  → DataFrame形状: {df_test.shape}")
                print("  → DataFrame内容:")
                display(df_test)

                # Try converting to the two-column (item, value) form and transposing.
                if df_test.shape[1] >= 2:
                    df_test = df_test.iloc[:, :2]
                    df_test.columns = ['項目', '値']
                    df_info_test = df_test.set_index('項目').T
                    print("  → 転置後:")
                    display(df_info_test)

                # The first encoding that yields a readable profile table wins.
                successful_encoding = encoding
                break

            except Exception as e:
                print(f"  → pd.read_htmlエラー: {e}")
        else:
            print(f"  → プロフィールテーブル見つからず")

    except UnicodeDecodeError as e:
        print(f"デコード失敗: {encoding} - {str(e)[:100]}")
        continue
    except Exception as e:
        print(f"その他のエラー {encoding}: {e}")
        continue

if successful_encoding:
    print(f"\n成功したエンコーディング: {successful_encoding}")

    # Final check with the encoding that worked.
    text_final = html.decode(successful_encoding)
    soup_final = BeautifulSoup(text_final, 'lxml')

    # List every table on the page.
    tables_final = soup_final.find_all('table')
    print(f"\n全テーブル数: {len(tables_final)}")
    for i, table in enumerate(tables_final):
        attrs = table.attrs
        summary = attrs.get('summary', '')
        class_names = attrs.get('class', [])
        print(f"テーブル {i+1}: class={class_names}, summary='{summary}'")

        # Look more closely at tables that look like the profile table.
        if 'db_prof_table' in class_names or 'プロフィール' in summary:
            try:
                df_detail = pd.read_html(StringIO(str(table)))[0]
                print(f"  → 詳細形状: {df_detail.shape}")
                print(f"  → 列名: {list(df_detail.columns)}")
                if len(df_detail) > 0:
                    print(f"  → 最初の行: {df_detail.iloc[0].values}")
            except Exception as e:
                print(f"  → 読み込みエラー: {e}")

else:
    print("\nどのエンコーディングでもプロフィールテーブルが見つかりませんでした")

    # Show the beginning of the raw HTML.
    print(f"\nHTML先頭500文字（バイナリ）:")
    print(html[:500])

    # Force-decode as UTF-8, ignoring undecodable bytes.
    try:
        text_force = html.decode('utf-8', errors='ignore')
        print(f"\nUTF-8強制デコード後の先頭1000文字:")
        print(text_force[:1000])
    except Exception as e:
        print(f"UTF-8強制デコードエラー: {e}")

In [None]:
# get_rawdata_horse_info関数の修正版を作成・テスト
def get_rawdata_horse_info_fixed(html_path_list: list):
    """
    Build the raw horse-profile table from saved horse pages (fixed version).

    For every file the bytes are decoded (UTF-8 first, then EUC-JP, then CP932),
    the profile table is located and transposed into a single row, and the
    trainer / owner / breeder IDs are read from the first matching link on the
    page.

    Parameters
    ----------
    html_path_list : list
        Paths of saved horse pages. The horse_id is taken from the path, which
        must match ``horse\\W(\\d+)\\.bin``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed horse, indexed by horse_id, with the
        profile items as columns plus ``trainer_id``, ``owner_id`` and
        ``breeder_id`` (NaN when the page has no such link). An empty
        DataFrame is returned when no file could be parsed.

    Notes
    -----
    Files that cannot be read or parsed are reported with ``print`` and
    skipped; nothing is raised to the caller. ``BeautifulSoup`` and ``tqdm``
    must already be imported in the notebook.
    """
    # Imported here so the function does not depend on a later cell having run.
    import re
    from io import StringIO

    # The original returned the undefined name `NaN`, which raised NameError
    # (swallowed by the broad except below) and silently dropped every horse
    # whose page lacks one of the three links.
    missing_id = float('nan')

    def extract_id(soup, selector, pattern):
        """Return the ID captured by `pattern` in the first link matching `selector`, else NaN."""
        a = soup.select_one(selector)
        if a and a.has_attr('href'):
            m = re.search(pattern, a['href'])
            if m:
                return m.group(1)
        return missing_id

    print('preparing raw horse_info table (fixed version)')
    out_rows = []

    for html_path in tqdm(html_path_list):
        try:
            with open(html_path, 'rb') as f:
                raw = f.read()

            # 1) Encoding priority: UTF-8 -> EUC-JP -> CP932
            text = None
            for encoding in ['utf-8', 'euc-jp', 'cp932']:
                try:
                    text = raw.decode(encoding)
                    break
                except UnicodeDecodeError:
                    continue

            if text is None:
                print(f'エンコーディング失敗: {html_path}')
                continue

            soup = BeautifulSoup(text, 'lxml')

            # 2) Locate the profile table by class first, then by its summary text
            prof_table = (
                soup.find('table', class_='db_prof_table') or
                soup.find('table', attrs={'summary': re.compile('プロフィール')}) or
                soup.select_one('table[summary*="プロフィール"]')
            )

            if prof_table is None:
                print(f'プロフィールテーブル見つからず: {html_path}')
                continue

            # 3) Parse the table; StringIO avoids the literal-HTML deprecation warning
            df = pd.read_html(StringIO(str(prof_table)))[0]

            # Left column = item name, right column = value; transpose into one row
            if df.shape[1] >= 2:
                df = df.iloc[:, :2]
                df.columns = ['項目', '値']
                df_info = df.set_index('項目').T
            else:
                print(f'プロフィールテーブルの列数が想定外: {html_path}')
                continue

            # 4) Pull the related IDs out of the page links
            df_info['trainer_id'] = extract_id(soup, 'a[href^="/trainer/"]', r'/trainer/([^/]+)/')
            df_info['owner_id'] = extract_id(soup, 'a[href^="/owner/"]', r'/owner/([^/]+)/')
            df_info['breeder_id'] = extract_id(soup, 'a[href^="/breeder/"]', r'/breeder/([^/]+)/')

            # 5) Index the row by the horse_id taken from the file path
            horse_id_m = re.search(r'horse\W(\d+)\.bin', html_path)
            if horse_id_m:
                horse_id = horse_id_m.group(1)
                df_info.index = [horse_id]
                out_rows.append(df_info)
            else:
                print(f'horse_id抽出失敗: {html_path}')

        except Exception as e:
            # Best-effort batch job: report the file and keep going
            print(f'処理エラー {html_path}: {e}')
            continue

    if not out_rows:
        print('処理できたhorse_infoデータがありません')
        return pd.DataFrame()

    horse_info_df = pd.concat(out_rows, axis=0)
    print(f'horse_info処理完了: {horse_info_df.shape}')
    return horse_info_df

# Run the fixed parser over every saved horse page and preview the result
horse_info_new_fixed = get_rawdata_horse_info_fixed(html_files_horse)
print(f"\n修正版の結果: {horse_info_new_fixed.shape}")
if len(horse_info_new_fixed):
    # Only the first five column names, to keep the output short
    print(f"列名: {list(horse_info_new_fixed.columns[:5])}")
    print("最初の数行:")
    display(horse_info_new_fixed.head())

In [21]:
# Update the horse basic-info table
# NOTE(review): this passes horse_info_new, not the horse_info_new_fixed built by the
# fixed parser in the cell above — confirm which one is meant to be persisted.
preparing.update_rawdata(LocalPaths.RAW_HORSE_INFO_PATH, horse_info_new)

In [22]:
# Build the horse past-results table from the saved horse pages
horse_results_new = preparing.get_rawdata_horse_results(html_files_horse)

preparing raw horse_results table


  0%|          | 0/19856 [00:00<?, ?it/s]

horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023100091.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023100276.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023100280.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023100287.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023100387.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023100433.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023100387.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023100433.bin
horse_re

In [None]:
# Check what the past-results parser produced
print(f"horse_results_new shape: {horse_results_new.shape}")
print(f"horse_results_new type: {type(horse_results_new)}")
if len(horse_results_new):
    # Only the first five column names, to keep the output short
    print(f"列名: {list(horse_results_new.columns[:5])}")
    print("最初の数行:")
    display(horse_results_new.head())

In [None]:
# Diagnosis: check how pd.read_html behaves on a single saved horse page
import pandas as pd
import re
from io import BytesIO
from bs4 import BeautifulSoup

# Pick one saved page as the test subject
test_html_file = html_files_horse[0]
print(f"テスト対象ファイル: {test_html_file}")

# Read the raw bytes
with open(test_html_file, 'rb') as f:
    html = f.read()

# Extract the horse_id from the file path
horse_id = re.findall(r'horse\W(\d+)\.bin', test_html_file)[0]
print(f"horse_id: {horse_id}")

# Parse with BeautifulSoup
soup = BeautifulSoup(html, "lxml")
print(f"HTMLファイルサイズ: {len(html)} bytes")

# List the <table> elements and their classes
tables = soup.find_all('table')
print(f"テーブル数: {len(tables)}")

for i, table in enumerate(tables):
    class_names = table.get('class', [])
    print(f"テーブル {i+1}: class={class_names}")

# Try reading the tables through pd.read_html
try:
    # Wrap the bytes in BytesIO: passing literal HTML to read_html is
    # deprecated since pandas 2.1 and emits a FutureWarning.
    dfs = pd.read_html(BytesIO(html))
    print(f"pd.read_htmlで読み込めたテーブル数: {len(dfs)}")

    for i, df in enumerate(dfs):
        print(f"DataFrame {i+1}: 形状={df.shape}, 列名={list(df.columns[:3])}")
        if len(df) > 0:
            print(f"  最初の行: {df.iloc[0].values[:3]}")
        print()

except Exception as e:
    print(f"pd.read_htmlエラー: {e}")

# Look for the past-results table directly by its class
race_results_table = soup.find('table', class_='db_h_race_results')
if race_results_table:
    print("過去成績テーブルが見つかりました!")
    rows = race_results_table.find_all('tr')
    print(f"行数: {len(rows)}")
    if len(rows) > 1:  # skip the header row
        first_data_row = rows[1]
        cells = first_data_row.find_all(['td', 'th'])
        print(f"最初のデータ行のセル数: {len(cells)}")
        print(f"最初のセルの内容: {cells[0].get_text(strip=True) if cells else 'なし'}")
else:
    print("過去成績テーブルが見つかりませんでした")

In [None]:
# get_rawdata_horse_resultsの修正版をテスト
import pandas as pd
import re
from tqdm.auto import tqdm

def get_rawdata_horse_results_fixed(html_path_list: list):
    """
    Build the raw past-results table from saved horse pages (fixed version).

    The past-results table is taken as the second table (index 1) that
    ``pd.read_html`` finds in each page. Pages with fewer than two tables, or
    whose second table has ``0`` as its first column label, are skipped.

    Parameters
    ----------
    html_path_list : list
        Paths of saved horse pages. The horse_id is taken from the path, which
        must match ``horse\\W(\\d+)\\.bin``.

    Returns
    -------
    pandas.DataFrame
        All past-results rows concatenated, each indexed by its horse_id, with
        half-width spaces removed from the column names. An empty DataFrame is
        returned when no page could be parsed.

    Notes
    -----
    Parse failures are reported with ``print`` and skipped. A path that cannot
    be opened still raises, because ``open`` runs outside the ``try``.
    ``tqdm`` must already be imported in the notebook.
    """
    from io import BytesIO

    print('preparing raw horse_results table (fixed version)')
    horse_results = {}
    for html_path in tqdm(html_path_list):
        with open(html_path, 'rb') as f:
            try:
                # Read the saved .bin file
                html = f.read()

                # Wrap the bytes in BytesIO: passing literal HTML to read_html
                # is deprecated since pandas 2.1.
                dfs = pd.read_html(BytesIO(html))

                # The past-results table is the second one, so at least two are needed
                if len(dfs) < 2:
                    print(f'horse_results insufficient tables: {len(dfs)} tables in {html_path}')
                    continue

                df = dfs[1]

                # When the page carries a horse review instead, the columns are
                # labelled 0, 1, ...; skip to the next file.
                if df.columns[0] == 0:
                    print('horse_results empty case1 {}'.format(html_path))
                    continue

                horse_id = re.findall(r'horse\W(\d+)\.bin', html_path)[0]

                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
                print(f'Successfully processed {horse_id}: {df.shape}')

            # No race data (or no horse_id in the path): skip
            except IndexError:
                print('horse_results empty case2 {}'.format(html_path))
                continue
            except Exception as e:
                print(f'horse_results error in {html_path}: {e}')
                continue

    if not horse_results:
        print("警告: 処理できた過去成績データがありません")
        return pd.DataFrame()

    # Combine the per-horse frames into one table
    horse_results_df = pd.concat(list(horse_results.values()))

    # Strip half-width spaces from column names; non-string labels are left
    # untouched instead of raising AttributeError.
    horse_results_df = horse_results_df.rename(
        columns=lambda x: x.replace(' ', '') if isinstance(x, str) else x
    )

    return horse_results_df

# Run the fixed past-results parser over every saved horse page and preview it
horse_results_new_fixed = get_rawdata_horse_results_fixed(html_files_horse)
print(f"\n修正版の結果: {horse_results_new_fixed.shape}")
if len(horse_results_new_fixed):
    # Only the first five column names, to keep the output short
    print(f"列名: {list(horse_results_new_fixed.columns[:5])}")
    print("最初の数行:")
    display(horse_results_new_fixed.head())

In [23]:
# Update the horse past-results table
# NOTE(review): this passes horse_results_new, not horse_results_new_fixed — confirm
# which one is meant to be persisted.
preparing.update_rawdata(LocalPaths.RAW_HORSE_RESULTS_PATH, horse_results_new)

In [None]:
# Show the horse basic-info table
display(horse_info_new)

# Report how many rows the horse basic-info table has
print(f"馬の基本情報テーブルの行数: {len(horse_info_new)}")


In [None]:
display(horse_results_new)

## 2.4. /ped/ディレクトリのデータ取得

In [28]:
# Scrape the ped HTML pages.
# NOTE(review): skip=True presumably skips IDs whose page is already saved (the
# captured output logs "skipped" per horse_id) — confirm against scrape_html_ped.
html_files_peds = preparing.scrape_html_ped(horse_id_list, skip=True) # scrape the html

  0%|          | 0/19762 [00:00<?, ?it/s]

horse_id 2021101429 skipped
horse_id 2021105872 skipped
horse_id 2021106854 skipped
horse_id 2021105553 skipped
horse_id 2021100648 skipped
horse_id 2021100159 skipped
horse_id 2021100265 skipped
horse_id 2021103762 skipped
horse_id 2020106547 skipped
horse_id 2020100879 skipped
horse_id 2020100391 skipped
horse_id 2020104346 skipped
horse_id 2020102908 skipped
horse_id 2020100551 skipped
horse_id 2020105380 skipped
horse_id 2020104250 skipped
horse_id 2020104087 skipped
horse_id 2020106395 skipped
horse_id 2020107102 skipped
horse_id 2020101946 skipped
horse_id 2020104215 skipped
horse_id 2020103939 skipped
horse_id 2020102347 skipped
horse_id 2020105632 skipped
horse_id 2020103126 skipped
horse_id 2020100863 skipped
horse_id 2020105061 skipped
horse_id 2020101252 skipped
horse_id 2020101681 skipped
horse_id 2020102875 skipped
horse_id 2020105472 skipped
horse_id 2020106970 skipped
horse_id 2020100036 skipped
horse_id 2020101849 skipped
horse_id 2020105354 skipped
horse_id 2020103018 

In [None]:
# === Full pedigree scrape: download every ped page that is still missing ===
import os
from modules.preparing import scrape_html_ped

# 1. One horse ID per saved horse page (file name without its extension)
complete_horse_id_list = [
    os.path.splitext(os.path.basename(horse_path))[0]
    for horse_path in html_files_horse
]
print(f"完全な馬IDリスト作成完了: {len(complete_horse_id_list)}件")

# 2. IDs that already have a saved ped page
existing_peds_ids = {
    os.path.splitext(os.path.basename(ped_path))[0]
    for ped_path in html_files_peds
}
print(f"既存の血統データ: {len(existing_peds_ids)}件")

# 3. IDs that have a horse page but no ped page
missing_ped_ids = set(complete_horse_id_list).difference(existing_peds_ids)
missing_ped_ids_list = sorted(missing_ped_ids)
print(f"不足している血統データ: {len(missing_ped_ids_list)}件")

# 4. Count the missing IDs per year (first four characters of the ID)
missing_by_year = {}
for horse_id in missing_ped_ids_list:
    year = horse_id[:4]
    if not year.isdigit():
        year = "unknown"
    missing_by_year[year] = 1 + missing_by_year.get(year, 0)

print("\n年代別の不足血統データ:")
for year in sorted(missing_by_year):
    print(f"  {year}年: {missing_by_year[year]}件")

print("\n=== スクレイピング実行 ===")
print(f"対象馬数: {len(missing_ped_ids_list)}件")
# The estimate assumes roughly two seconds per horse
print(f"推定時間: {len(missing_ped_ids_list) * 2 / 60:.1f}分")

# 5. Scrape the missing ped pages.
# Warning: this is slow; running it in batches is recommended.
print("\n血統データスクレイピングを開始...")
new_ped_files = scrape_html_ped(missing_ped_ids_list, skip=False)
print(f"スクレイピング完了: {len(new_ped_files)}件の新しい血統データを取得")

# 6. Compare the file counts after the scrape
import glob
updated_html_files_peds = glob.glob(os.path.join(LocalPaths.HTML_PED_DIR, "*.bin"))
print(f"更新後の血統データファイル数: {len(updated_html_files_peds)}")
print(f"馬情報ファイル数: {len(html_files_horse)}")
print(f"差分: {len(html_files_horse) - len(updated_html_files_peds)}")

In [None]:
# First try a small batch (the first 100 missing IDs)
test_missing_ids = missing_ped_ids_list[:100]
print(f"テスト対象: {len(test_missing_ids)}件")
test_ped_files = scrape_html_ped(test_missing_ids, skip=False)
print(f"テスト完了: {len(test_ped_files)}件")

# Then run the rest.
# NOTE(review): nothing here checks that the test batch succeeded; the remaining IDs
# are scraped unconditionally when the cell runs.
remaining_ids = missing_ped_ids_list[100:]
remaining_ped_files = scrape_html_ped(remaining_ids, skip=False)

In [None]:
# Pedigree-scraping test: run on a handful of horses only.
# Uses the first five IDs of horse_id_test_list.
# NOTE(review): horse_id_test_list is not defined in the visible part of the notebook —
# confirm it is created by an earlier cell.
test_horse_ids = horse_id_test_list[:5]  # first five horses only
print(f"テスト対象馬ID: {test_horse_ids}")

%autoreload

In [None]:
# Scrape the ped HTML pages for the five test horses only
print("血統HTMLファイルのスクレイピングを開始...")
html_files_peds_test = preparing.scrape_html_ped(test_horse_ids, skip=False)
print(f"スクレイピング完了: {len(html_files_peds_test)}件のHTMLファイル")

# Show the returned file paths
if html_files_peds_test:
    print("取得されたファイル:")
    for i, file_path in enumerate(html_files_peds_test[:3]):  # show the first three
        print(f"  {i+1}: {file_path}")
    if len(html_files_peds_test) > 3:
        print(f"  ... 他{len(html_files_peds_test)-3}件")
else:
    print("取得されたHTMLファイルがありません（既存ファイルがスキップされた可能性）")

In [None]:
# Test building the pedigree table from the test pages
print("血統テーブルの作成を開始...")
peds_test = preparing.get_rawdata_peds(html_files_peds_test)

print(f"血統テーブル作成完了: {peds_test.shape}")
if len(peds_test) > 0:
    print(f"列名: {list(peds_test.columns)}")
    print("血統データサンプル:")
    display(peds_test.head())

    # Count the null values in each column
    print("\n各列のnull値の数:")
    print(peds_test.isnull().sum())
else:
    print("血統テーブルが空です。HTMLファイルの構造を確認します。")

In [None]:
# Structural analysis of one saved ped HTML page
import re
from io import StringIO
from bs4 import BeautifulSoup

# Inspect a single ped page in detail
test_ped_file = html_files_peds_test[0]
print(f"解析対象ファイル: {test_ped_file}")

# Read the raw bytes
with open(test_ped_file, 'rb') as f:
    ped_html = f.read()

print(f"HTMLファイルサイズ: {len(ped_html)} bytes")

# Try the candidate encodings in order and keep the first that decodes
encodings = ['utf-8', 'euc-jp', 'cp932']
ped_text = None
successful_encoding = None

for encoding in encodings:
    try:
        ped_text = ped_html.decode(encoding)
        successful_encoding = encoding
        print(f"デコード成功: {encoding}")
        break
    except UnicodeDecodeError:
        print(f"デコード失敗: {encoding}")
        continue

if ped_text:
    # Parse with BeautifulSoup
    ped_soup = BeautifulSoup(ped_text, 'lxml')

    # Basic page information
    title = ped_soup.find('title')
    print(f"ページタイトル: {title.text if title else 'なし'}")

    # List the <table> elements
    tables = ped_soup.find_all('table')
    print(f"テーブル数: {len(tables)}")

    for i, table in enumerate(tables[:5]):  # only the first five tables
        attrs = table.attrs
        class_names = attrs.get('class', [])
        summary = attrs.get('summary', '')
        print(f"テーブル {i+1}: class={class_names}, summary='{summary}'")

        # Quick look at the table content
        try:
            # Wrap the markup in StringIO: passing literal HTML to read_html
            # is deprecated since pandas 2.1 and emits a FutureWarning.
            df_ped = pd.read_html(StringIO(str(table)))[0]
            print(f"  形状: {df_ped.shape}")
            if len(df_ped) > 0 and df_ped.shape[1] > 0:
                print(f"  列名: {list(df_ped.columns[:3])}")
                print(f"  最初の行: {df_ped.iloc[0].values[:3]}")
        except Exception as e:
            print(f"  読み込みエラー: {e}")

        # Flag tables whose text contains pedigree-looking keywords
        table_text = table.get_text()
        if any(keyword in table_text for keyword in ['父', '母', '祖父', '血統']):
            print(f"  → 血統関連テーブルの可能性あり")

    # Look for td/div elements whose class name hints at a pedigree tree
    print("\n血統ツリー関連の要素:")
    bloodline_elements = ped_soup.find_all(['td', 'div'], class_=re.compile(r'(blood|ped|pedigree)', re.I))
    print(f"血統関連要素数: {len(bloodline_elements)}")

    if bloodline_elements:
        for elem in bloodline_elements[:3]:
            print(f"  要素: {elem.name}, class: {elem.get('class')}, text: {elem.get_text()[:50]}...")

else:
    print("すべてのエンコーディングでデコードに失敗しました")

In [None]:
# get_rawdata_peds関数の修正版を作成・テスト
def get_rawdata_peds_fixed(html_path_list: list):
    """
    Build the raw pedigree table from saved ped pages (fixed version).

    Each file is decoded (EUC-JP first, then UTF-8, then CP932), the pedigree
    table is located, and the horse_id of every ancestor link inside it is
    collected in document order.

    Parameters
    ----------
    html_path_list : list
        Paths of saved ped pages. The horse_id is taken from the path, which
        must match ``ped\\W(\\d+)\\.bin``.

    Returns
    -------
    pandas.DataFrame
        One row per horse, indexed by horse_id, with columns ``peds_0``,
        ``peds_1``, ... holding the ancestor horse_ids. An empty DataFrame is
        returned when no file could be parsed.

    Notes
    -----
    Files that cannot be read or parsed are reported with ``print`` and
    skipped. ``BeautifulSoup`` and ``tqdm`` must already be imported in the
    notebook. A later cell of this notebook defines another function with the
    same name but a different signature (it takes horse IDs); whichever cell
    ran last wins.
    """
    import re

    # Ancestor links appear as absolute URLs (https://db.netkeiba.com/horse/<id>/)
    # in the saved pages, which the original relative-only pattern
    # (^/horse/\w{10}) never matched; accept both forms.
    horse_link_re = re.compile(r'^(?:https?://db\.netkeiba\.com)?/horse/(\w{10})')

    print('preparing raw peds table (fixed version)')
    peds = {}

    for html_path in tqdm(html_path_list):
        try:
            with open(html_path, 'rb') as f:
                raw = f.read()

            # 1) Encoding priority: EUC-JP -> UTF-8 -> CP932
            text = None
            for encoding in ['euc-jp', 'utf-8', 'cp932']:
                try:
                    text = raw.decode(encoding)
                    break
                except UnicodeDecodeError:
                    continue

            if text is None:
                print(f'エンコーディング失敗: {html_path}')
                continue

            # horse_id of the page owner, from the file path
            horse_id = re.findall(r'ped\W(\d+)\.bin', html_path)[0]

            soup = BeautifulSoup(text, "lxml")

            # 2) Locate the pedigree table by its summary, falling back to its class
            blood_table = soup.find("table", attrs={"summary": "5代血統表"})
            if blood_table is None:
                blood_table = soup.find("table", class_="blood_table")

            if blood_table is None:
                print(f'血統テーブル見つからず: {html_path}')
                continue

            # Collect the ancestor horse_ids from the matching links
            peds_id_list = [
                horse_link_re.match(a["href"]).group(1)
                for a in blood_table.find_all("a", attrs={"href": horse_link_re})
            ]

            peds[horse_id] = peds_id_list
            print(f'血統ID取得成功 {horse_id}: {len(peds_id_list)}個')

        except Exception as e:
            # Best-effort batch job: report the file and keep going
            print(f'処理エラー {html_path}: {e}')
            continue

    if not peds:
        print('処理できた血統データがありません')
        return pd.DataFrame()

    # One row per horse; columns are named peds_0, peds_1, ...
    peds_df = pd.DataFrame.from_dict(peds, orient='index').add_prefix('peds_')
    print(f'血統データ処理完了: {peds_df.shape}')
    return peds_df

# Run the fixed pedigree parser on the test pages and preview the result
peds_test_fixed = get_rawdata_peds_fixed(html_files_peds_test)
print(f"\n修正版の結果: {peds_test_fixed.shape}")
if len(peds_test_fixed):
    # Only the first ten column names, to keep the output short
    print(f"列名サンプル: {list(peds_test_fixed.columns[:10])}")
    print("血統データサンプル:")
    display(peds_test_fixed.head())

In [None]:
# Inspect the link structure inside the pedigree table in detail
test_ped_file = html_files_peds_test[0]

with open(test_ped_file, 'rb') as f:
    raw = f.read()

# Decode as EUC-JP (no fallback here: a page in another encoding raises UnicodeDecodeError)
text = raw.decode('euc-jp')
soup = BeautifulSoup(text, "lxml")

# Fetch the pedigree table by its summary attribute
blood_table = soup.find("table", attrs={"summary": "5代血統表"})
if blood_table:
    print("血統テーブル発見!")

    # Collect every link inside the table
    all_links = blood_table.find_all("a")
    print(f"血統テーブル内の全リンク数: {len(all_links)}")

    # Print the first ten links (href and text)
    for i, link in enumerate(all_links[:10]):
        href = link.get('href', '')
        text_content = link.get_text()
        print(f"リンク {i+1}: href='{href}', text='{text_content}'")

    # Compare how many links each horse-related href pattern matches
    horse_patterns = [
        r"^/horse/\w{10}",  # the pattern currently in use
        r"/horse/",         # a broader pattern
        r"horse",           # the broadest pattern
    ]

    for pattern in horse_patterns:
        matches = blood_table.find_all("a", attrs={"href": re.compile(pattern)})
        print(f"パターン '{pattern}' にマッチするリンク数: {len(matches)}")
        if matches:
            for j, match in enumerate(matches[:3]):
                print(f"  例{j+1}: {match.get('href')}")

    # Try extracting actual horse_ids from the first five links
    print("\n実際のhorse_id抽出テスト:")
    for link in all_links[:5]:
        href = link.get('href', '')
        if '/horse/' in href:
            print(f"リンク: {href}")
            # Try several extraction patterns on the same href
            patterns = [
                r'/horse/(\w{10})',
                r'/horse/(\w+)',
                r'horse/(\w+)',
            ]
            for p in patterns:
                matches = re.findall(p, href)
                if matches:
                    print(f"  パターン '{p}': {matches}")
else:
    print("血統テーブルが見つかりません")

In [None]:
# Build and test the corrected horse_id extraction pattern
import re

# Pattern under test: an absolute horse-detail URL ending right after the 10-character ID
correct_pattern = r'https://db\.netkeiba\.com/horse/(\w{10})/$'

test_ped_file = html_files_peds_test[0]
with open(test_ped_file, 'rb') as f:
    raw = f.read()

# Decode as EUC-JP (no fallback here)
text = raw.decode('euc-jp')
soup = BeautifulSoup(text, "lxml")
blood_table = soup.find("table", attrs={"summary": "5代血統表"})

if blood_table:
    # Keep only the links that point at a horse detail page
    horse_links = blood_table.find_all("a", attrs={"href": re.compile(correct_pattern)})
    print(f"馬の詳細ページリンク数: {len(horse_links)}")

    # Extract the horse_id from each link and print it with the link text
    horse_ids = []
    for link in horse_links:
        href = link.get('href')
        match = re.search(correct_pattern, href)
        if match:
            horse_id = match.group(1)
            horse_ids.append(horse_id)
            horse_name = link.get_text().strip()
            print(f"  {horse_id}: {horse_name}")

    print(f"\n抽出されたhorse_id数: {len(horse_ids)}")
    print(f"ユニークなhorse_id数: {len(set(horse_ids))}")
else:
    print("血統テーブルが見つかりません")

In [None]:
# 修正版get_rawdata_peds関数を作成・テスト
def get_rawdata_peds_fixed(horse_id_list, html_dir="data/html/ped"):
    """
    Collect ancestor horse_ids for the given horses from their saved ped pages.

    Parameters
    ----------
    horse_id_list : iterable of str
        Horse IDs; the page for each one is read from ``<html_dir>/<horse_id>.bin``.
    html_dir : str, optional
        Directory holding the saved ped pages. Defaults to the relative path
        ``data/html/ped`` that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Long-format table with one row per (horse, ancestor) pair and the
        columns ``horse_id`` and ``peds_horse_id``. An empty DataFrame is
        returned when nothing could be extracted.

    Notes
    -----
    Missing files, undecodable pages and pages without a pedigree table are
    reported with ``print`` and skipped. ``BeautifulSoup`` and ``tqdm`` must
    already be imported in the notebook. This definition replaces the
    same-named function from an earlier cell, which takes file paths and
    returns a wide table instead.
    """
    import re

    # Only absolute horse-detail URLs that end right after the 10-character ID
    horse_link_re = re.compile(r'https://db\.netkeiba\.com/horse/(\w{10})/$')

    # Per-horse frames are gathered here and concatenated once at the end,
    # rather than growing a DataFrame inside the loop (quadratic copying).
    frames = []

    for horse_id in tqdm(horse_id_list):
        try:
            # Build the file path
            html_path = f"{html_dir}/{horse_id}.bin"

            if not os.path.exists(html_path):
                print(f"ファイルが見つかりません: {html_path}")
                continue

            with open(html_path, 'rb') as f:
                raw = f.read()

            # Try the encodings in order (UTF-8 -> EUC-JP -> CP932)
            for encoding in ['utf-8', 'euc-jp', 'cp932']:
                try:
                    text = raw.decode(encoding)
                    break
                except UnicodeDecodeError:
                    continue
            else:
                print(f"デコードに失敗しました: {horse_id}")
                continue

            soup = BeautifulSoup(text, "lxml")

            # Locate the pedigree table by its summary attribute
            blood_table = soup.find("table", attrs={"summary": "5代血統表"})

            if blood_table is None:
                print(f"血統テーブルが見つかりません: {horse_id}")
                continue

            # Ancestor IDs in document order
            peds_horse_ids = []
            for link in blood_table.find_all("a", attrs={"href": horse_link_re}):
                match = horse_link_re.search(link.get('href'))
                if match:
                    peds_horse_ids.append(match.group(1))

            if peds_horse_ids:
                frames.append(pd.DataFrame({
                    'horse_id': [horse_id] * len(peds_horse_ids),
                    'peds_horse_id': peds_horse_ids
                }))
                print(f"血統ID取得成功 {horse_id}: {len(peds_horse_ids)}個")
            else:
                print(f"血統IDが取得できませんでした: {horse_id}")

        except Exception as e:
            # Best-effort batch job: report the horse and keep going
            print(f"エラーが発生しました {horse_id}: {e}")
            continue

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Derive the horse IDs from the test ped file names (file name minus ".bin")
test_horse_ids = [
    os.path.basename(ped_path).replace('.bin', '')
    for ped_path in html_files_peds_test
]

print(f"テスト用馬ID: {test_horse_ids}")

# Run the fixed function on those IDs and show the first ten rows
print("修正版get_rawdata_peds関数をテスト中...")
peds_new_fixed = get_rawdata_peds_fixed(test_horse_ids)
print(f"\n修正版関数の結果: {peds_new_fixed.shape}")
print(peds_new_fixed.head(10))

In [None]:
# Re-import the module so that the edited version on disk is the one under test
# NOTE(review): importlib is imported but never used in this cell.
import importlib
import sys

# Drop the cached module so the import below executes the file again
if 'modules.preparing._get_rawdata' in sys.modules:
    del sys.modules['modules.preparing._get_rawdata']

from modules.preparing._get_rawdata import get_rawdata_peds

# Test the freshly imported function on the test pages
print("元ファイルの修正版get_rawdata_peds関数をテスト中...")
peds_from_module = get_rawdata_peds(html_files_peds_test)
print(f"\nモジュール関数の結果: {peds_from_module.shape}")
print(peds_from_module.head())

In [None]:
# Summarize the result of the pedigree scraping test
print("=== 血統データスクレイピングテスト結果 ===")
print(f"テスト馬数: {len(test_horse_ids)}頭")
print(f"取得血統データ: {peds_from_module.shape[0]}頭 × {peds_from_module.shape[1]}血統ID")
print(f"総血統ID数: {peds_from_module.shape[0] * peds_from_module.shape[1]}個")

# Count the non-null pedigree IDs per test horse (horses absent from the index are skipped)
print(f"\n各馬の血統ID数:")
for horse_id in test_horse_ids:
    if horse_id in peds_from_module.index:
        non_null_count = peds_from_module.loc[horse_id].notna().sum()
        print(f"  {horse_id}: {non_null_count}個")

# Show up to ten non-null pedigree IDs of the first test horse
# NOTE(review): unlike the loop above, this .loc lookup is not guarded by an index check.
print(f"\n血統データサンプル（馬ID: {test_horse_ids[0]}）:")
sample_row = peds_from_module.loc[test_horse_ids[0]]
sample_peds = sample_row.dropna().head(10)
for i, (col, horse_id) in enumerate(sample_peds.items()):
    print(f"  {col}: {horse_id}")

# NOTE(review): the success message is printed unconditionally; nothing above asserts it.
print("\n血統データテーブル作成テスト: ✅ 成功")

In [29]:
import glob
import os

# Take the ped HTML directory from LocalPaths
ped_html_dir = LocalPaths.HTML_PED_DIR
print(f"血統HTMLディレクトリ: {ped_html_dir}")

# Collect every .bin file in that directory (this rebinds html_files_peds)
html_files_peds = glob.glob(os.path.join(ped_html_dir, "*.bin"))
print(f"見つかったHTMLファイル数: {len(html_files_peds)}")

# Show the first five paths
html_files_peds[:5]

血統HTMLディレクトリ: c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\ped
見つかったHTMLファイル数: 19856


['c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\ped\\2011106610.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\ped\\2012100683.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\ped\\2012103532.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\ped\\2012104463.bin',
 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\data\\html\\ped\\2012104504.bin']

In [None]:
# Detailed count analysis: differences between the horse pages and the ped pages
import os

# Set of IDs that have a horse page (file name without extension)
# NOTE(review): an earlier cell binds horse_ids to a list of ancestor IDs; this rebinds it to a set.
horse_ids = set([os.path.splitext(os.path.basename(f))[0] for f in html_files_horse])
print(f"馬の基本情報のユニークID数: {len(horse_ids)}")

# Set of IDs that have a ped page
peds_ids = set([os.path.splitext(os.path.basename(f))[0] for f in html_files_peds])
print(f"血統データのユニークID数: {len(peds_ids)}")

print(f"\n数の違い: {len(horse_ids) - len(peds_ids)} 件")

# IDs with a horse page but no ped page
missing_peds = horse_ids - peds_ids
print(f"\n血統データがない馬IDの数: {len(missing_peds)}")

if len(missing_peds) > 0:
    print("\n血統データがない馬IDの例（最初の10件）:")
    missing_list = sorted(list(missing_peds))
    for horse_id in missing_list[:10]:
        print(f"  {horse_id}")

    # Break the missing IDs down by year
    print("\n年代別の分析（血統データ不足）:")
    year_analysis = {}
    for horse_id in missing_peds:
        year = horse_id[:4]  # the first four characters of the horse ID are the year
        year_analysis[year] = year_analysis.get(year, 0) + 1

    for year in sorted(year_analysis.keys()):
        print(f"  {year}年: {year_analysis[year]}件")

# Conversely, check for ped pages that have no horse page
missing_horse_info = peds_ids - horse_ids
print(f"\n馬の基本情報がない血統データの数: {len(missing_horse_info)}")
if len(missing_horse_info) > 0:
    print("馬の基本情報がない血統IDの例（最初の5件）:")
    for ped_id in list(missing_horse_info)[:5]:
        print(f"  {ped_id}")

In [None]:
# Investigate differences between the two scraping runs
print("=== スクレイピングプロセス差異分析 ===")

# 1. Compare the year distribution of the collected files
print("\n1. 収集データの時期分析:")
horse_years = {}
peds_years = {}

# Count horse pages per year prefix ("unknown" when the first four characters are not digits)
for f in html_files_horse:
    horse_id = os.path.splitext(os.path.basename(f))[0]
    year = horse_id[:4] if horse_id[:4].isdigit() else "unknown"
    horse_years[year] = horse_years.get(year, 0) + 1

# Same count for the ped pages
for f in html_files_peds:
    ped_id = os.path.splitext(os.path.basename(f))[0]
    year = ped_id[:4] if ped_id[:4].isdigit() else "unknown"
    peds_years[year] = peds_years.get(year, 0) + 1

print("年別収集数の比較:")
all_years = sorted(set(horse_years.keys()) | set(peds_years.keys()))
for year in all_years:
    horse_count = horse_years.get(year, 0)
    peds_count = peds_years.get(year, 0)
    diff = horse_count - peds_count
    print(f"  {year}年: 馬情報{horse_count:5d}件, 血統{peds_count:5d}件, 差分{diff:5d}件")

# 2. File-size analysis on a sample (a rough proxy for scraping success)
print("\n2. ファイルサイズ分析（サンプル）:")
import random

# Up to 50 files of each kind; no seed is set, so the sample differs between runs
sample_horse_files = random.sample(html_files_horse, min(50, len(html_files_horse)))
sample_peds_files = random.sample(html_files_peds, min(50, len(html_files_peds)))

horse_sizes = [os.path.getsize(f) for f in sample_horse_files if os.path.exists(f)]
peds_sizes = [os.path.getsize(f) for f in sample_peds_files if os.path.exists(f)]

if horse_sizes and peds_sizes:
    print(f"馬情報ファイル平均サイズ: {sum(horse_sizes)/len(horse_sizes):.0f} bytes")
    print(f"血統ファイル平均サイズ: {sum(peds_sizes)/len(peds_sizes):.0f} bytes")
    print(f"小さすぎるファイル（<1000 bytes）の割合:")
    # Percentage of sampled files smaller than 1000 bytes
    small_horse = sum(1 for s in horse_sizes if s < 1000) / len(horse_sizes) * 100
    small_peds = sum(1 for s in peds_sizes if s < 1000) / len(peds_sizes) * 100
    print(f"  馬情報: {small_horse:.1f}%")
    print(f"  血統: {small_peds:.1f}%")

In [30]:
peds_new = preparing.get_rawdata_peds(html_files_peds) # build the pedigree table
preparing.update_rawdata(LocalPaths.RAW_PEDS_PATH, peds_new) # update the table

preparing raw peds table


  0%|          | 0/19856 [00:00<?, ?it/s]

In [None]:
display(peds_new)

# 3. データ加工

In [31]:
#モジュールを更新した際、notebookに反映させるために使用。
#すでにインポートしてあるモジュールの更新が反映される。
%autoreload

In [32]:
# Preprocessing: create one processor per raw table, each pointed at its raw-data path
results_processor = preprocessing.ResultsProcessor(filepath=LocalPaths.RAW_RESULTS_PATH)
race_info_processor = preprocessing.RaceInfoProcessor(filepath=LocalPaths.RAW_RACE_INFO_PATH)
return_processor = preprocessing.ReturnProcessor(filepath=LocalPaths.RAW_RETURN_TABLES_PATH)
horse_info_processor = preprocessing.HorseInfoProcessor(
    filepath=LocalPaths.RAW_HORSE_INFO_PATH)
horse_results_processor = preprocessing.HorseResultsProcessor(
    filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
peds_processor = preprocessing.PedsProcessor(filepath=LocalPaths.RAW_PEDS_PATH)

              着順  枠番  馬番         馬名  性齢    斤量    騎手     タイム     着差     単勝  \
202301010101   1   5   5    サトミノキラリ  牡2  55.0  横山武史  1:09.5    NaN    1.2   
202301010101   2   8   8     ベアゴーゴー  牝2  55.0   浜中俊  1:09.5     クビ    4.1   
202301010101   3   6   6  ハピアーザンエバー  牡2  55.0  藤岡佑介  1:10.0  2.1/2   59.9   
202301010101   4   4   4  デビルシズカチャン  牝2  55.0  ルメール  1:10.2  1.1/2   16.6   
202301010101   5   1   1   ウィスピースノー  牝2  55.0  吉田隼人  1:10.3    1/2   23.9   
...           ..  ..  ..        ...  ..   ...   ...     ...    ...    ...   
202510020812  14   5   9   テーオースパロー  牡6  58.0  吉村誠之  1:08.8     ハナ   71.3   
202510020812  15   3   5  タガノスペルノヴァ  牡7  58.0  川須栄彦  1:08.9    1/2  120.4   
202510020812  16   2   3   サトノプリエール  牝6  56.0  団野大成  1:09.1      1   31.8   
202510020812  17   4   7      インテンソ  セ5  58.0  泉谷楓真  1:09.3  1.1/2  167.9   
202510020812  18   3   6       ガリレイ  セ8  58.0  角田大和  1:09.4    1/2   77.2   

                人気       馬体重       調教師    horse_id jockey_id trainer_id  \


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[Cols.PRIZE].fillna(0, inplace=True)


馬の過去成績を集計しつつ、前処理済みの全てのテーブルをマージする処理

In [33]:
# Columns treated as "horse performance" during target encoding
TARGET_COLS = [
        HorseResultsCols.RANK,
        HorseResultsCols.PRIZE,
        HorseResultsCols.RANK_DIFF, 
        'first_corner',
        'final_corner',
        'first_to_rank',
        'first_to_final',
        'final_to_rank',
        'time_seconds'
        ]
# Columns that, together with horse_id, are used as target-encoding groups
GROUP_COLS = [
        'course_len',
        'race_type',
        HorseResultsCols.PLACE
        ]

# Hand the processors and the column lists to the merger
data_merger = preprocessing.DataMerger(
        results_processor,
        race_info_processor,
        horse_results_processor,
        horse_info_processor,
        peds_processor,
        target_cols=TARGET_COLS,
        group_cols=GROUP_COLS
)
# Run the merge
data_merger.merge()

separating horse results by date


  0%|          | 0/290 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/290 [00:00<?, ?it/s]

In [34]:
# Categorical-variable handling on top of the merged data.
# NOTE(review): judging only by the method names, the chain adds interval/age
# features, dummy-encodes categorical columns and encodes the id columns --
# confirm against modules.preprocessing.FeatureEngineering.
# The variable keeps its existing spelling because later cells reference it.
feature_enginnering = (
    preprocessing.FeatureEngineering(data_merger)
    .add_interval()
    .add_agedays()
    .dumminize_ground_state()
    .dumminize_race_type()
    .dumminize_sex()
    .dumminize_weather()
    .encode_horse_id()
    .encode_jockey_id()
    .encode_trainer_id()
    .encode_owner_id()
    .encode_breeder_id()
    .dumminize_kaisai()
    .dumminize_around()
    .dumminize_race_class()
)

  new_target_master = pd.concat([target_master, new_target]).set_index(target_col)['encoded_id']
  new_target_master = pd.concat([target_master, new_target]).set_index(target_col)['encoded_id']
  new_target_master = pd.concat([target_master, new_target]).set_index(target_col)['encoded_id']
  new_target_master = pd.concat([target_master, new_target]).set_index(target_col)['encoded_id']
  new_target_master = pd.concat([target_master, new_target]).set_index(target_col)['encoded_id']


In [35]:
# Save the engineered feature table.
# data/tmp is the directory used for temporary saves. to_pickle does not
# create missing parent directories, so make sure it exists first; this is a
# no-op when the directory is already there.
os.makedirs('data/tmp', exist_ok=True)
feature_enginnering.featured_data.to_pickle('data/tmp/featured_data_20250920.pickle')

# 4. 学習

In [36]:
# Build the model object from the engineered feature table, then train it.
# NOTE(review): the recorded output of this cell shows an Optuna study stepping
# through LightGBM parameters (feature_fraction, num_leaves, ...) with early
# stopping on binary_logloss -- confirm the details in modules.training.
keiba_ai = training.KeibaAIFactory.create(feature_enginnering.featured_data) # create the model
keiba_ai.train_with_tuning() # train with parameter tuning

[I 2025-09-20 20:59:02,965] A new study created in memory with name: no-name-d9fc76e6-29bd-4a29-a6cb-7c4805e62cae
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds


feature_fraction, val_score: 0.457763:  14%|#4        | 1/7 [00:01<00:08,  1.46s/it][I 2025-09-20 20:59:04,437] Trial 0 finished with value: 0.4577627902391479 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  14%|#4        | 1/7 [00:01<00:08,  1.46s/it][I 2025-09-20 20:59:04,437] Trial 0 finished with value: 0.4577627902391479 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  14%|#4        | 1/7 [00:01<00:08,  1.46s/it]

[100]	valid_0's binary_logloss: 0.402595	valid_1's binary_logloss: 0.457947
Early stopping, best iteration is:
[98]	valid_0's binary_logloss: 0.403389	valid_1's binary_logloss: 0.457763
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM]

feature_fraction, val_score: 0.457763:  29%|##8       | 2/7 [00:02<00:07,  1.44s/it][I 2025-09-20 20:59:05,866] Trial 1 finished with value: 0.45816846424226965 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  29%|##8       | 2/7 [00:02<00:07,  1.44s/it][I 2025-09-20 20:59:05,866] Trial 1 finished with value: 0.45816846424226965 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  29%|##8       | 2/7 [00:02<00:07,  1.44s/it]

Early stopping, best iteration is:
[81]	valid_0's binary_logloss: 0.410603	valid_1's binary_logloss: 0.458168
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

feature_fraction, val_score: 0.457763:  43%|####2     | 3/7 [00:04<00:05,  1.48s/it][I 2025-09-20 20:59:07,398] Trial 2 finished with value: 0.45914686185804815 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  43%|####2     | 3/7 [00:04<00:05,  1.48s/it][I 2025-09-20 20:59:07,398] Trial 2 finished with value: 0.45914686185804815 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  43%|####2     | 3/7 [00:04<00:05,  1.48s/it]

Early stopping, best iteration is:
[81]	valid_0's binary_logloss: 0.409393	valid_1's binary_logloss: 0.459147
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

feature_fraction, val_score: 0.457763:  57%|#####7    | 4/7 [00:05<00:04,  1.48s/it][I 2025-09-20 20:59:08,886] Trial 3 finished with value: 0.4595343005138396 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  57%|#####7    | 4/7 [00:05<00:04,  1.48s/it][I 2025-09-20 20:59:08,886] Trial 3 finished with value: 0.4595343005138396 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  57%|#####7    | 4/7 [00:05<00:04,  1.48s/it]

Early stopping, best iteration is:
[82]	valid_0's binary_logloss: 0.408866	valid_1's binary_logloss: 0.459534
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

feature_fraction, val_score: 0.457763:  71%|#######1  | 5/7 [00:07<00:02,  1.47s/it][I 2025-09-20 20:59:10,317] Trial 4 finished with value: 0.45874250229036373 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  71%|#######1  | 5/7 [00:07<00:02,  1.47s/it][I 2025-09-20 20:59:10,317] Trial 4 finished with value: 0.45874250229036373 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  71%|#######1  | 5/7 [00:07<00:02,  1.47s/it]

Early stopping, best iteration is:
[72]	valid_0's binary_logloss: 0.415166	valid_1's binary_logloss: 0.458743
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016595 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016595 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

feature_fraction, val_score: 0.457763:  86%|########5 | 6/7 [00:08<00:01,  1.47s/it][I 2025-09-20 20:59:11,797] Trial 5 finished with value: 0.4590923642877547 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  86%|########5 | 6/7 [00:08<00:01,  1.47s/it][I 2025-09-20 20:59:11,797] Trial 5 finished with value: 0.4590923642877547 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 0.4577627902391479.
feature_fraction, val_score: 0.457763:  86%|########5 | 6/7 [00:08<00:01,  1.47s/it]

Early stopping, best iteration is:
[69]	valid_0's binary_logloss: 0.415576	valid_1's binary_logloss: 0.459092
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011864 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011864 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

feature_fraction, val_score: 0.457726: 100%|##########| 7/7 [00:10<00:00,  1.48s/it][I 2025-09-20 20:59:13,303] Trial 6 finished with value: 0.4577263650406801 and parameters: {'feature_fraction': 0.4}. Best is trial 6 with value: 0.4577263650406801.
feature_fraction, val_score: 0.457726: 100%|##########| 7/7 [00:10<00:00,  1.48s/it]
feature_fraction, val_score: 0.457726: 100%|##########| 7/7 [00:10<00:00,  1.48s/it][I 2025-09-20 20:59:13,303] Trial 6 finished with value: 0.4577263650406801 and parameters: {'feature_fraction': 0.4}. Best is trial 6 with value: 0.4577263650406801.
feature_fraction, val_score: 0.457726: 100%|##########| 7/7 [00:10<00:00,  1.48s/it]


[100]	valid_0's binary_logloss: 0.404415	valid_1's binary_logloss: 0.458068
Early stopping, best iteration is:
[135]	valid_0's binary_logloss: 0.389759	valid_1's binary_logloss: 0.457726


num_leaves, val_score: 0.457726:   0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011031 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457726:   5%|5         | 1/20 [00:01<00:31,  1.64s/it][I 2025-09-20 20:59:14,953] Trial 7 finished with value: 0.46332783899647206 and parameters: {'num_leaves': 140}. Best is trial 7 with value: 0.46332783899647206.
num_leaves, val_score: 0.457726:   5%|5         | 1/20 [00:01<00:31,  1.64s/it][I 2025-09-20 20:59:14,953] Trial 7 finished with value: 0.46332783899647206 and parameters: {'num_leaves': 140}. Best is trial 7 with value: 0.46332783899647206.
num_leaves, val_score: 0.457726:   5%|5         | 1/20 [00:01<00:31,  1.64s/it]

Early stopping, best iteration is:
[55]	valid_0's binary_logloss: 0.342289	valid_1's binary_logloss: 0.463328
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011445 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011445 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726:  10%|#         | 2/20 [00:03<00:28,  1.57s/it][I 2025-09-20 20:59:16,467] Trial 8 finished with value: 0.4603517885161204 and parameters: {'num_leaves': 72}. Best is trial 8 with value: 0.4603517885161204.
num_leaves, val_score: 0.457726:  10%|#         | 2/20 [00:03<00:28,  1.57s/it][I 2025-09-20 20:59:16,467] Trial 8 finished with value: 0.4603517885161204 and parameters: {'num_leaves': 72}. Best is trial 8 with value: 0.4603517885161204.
num_leaves, val_score: 0.457726:  10%|#         | 2/20 [00:03<00:28,  1.57s/it]

Early stopping, best iteration is:
[73]	valid_0's binary_logloss: 0.372514	valid_1's binary_logloss: 0.460352
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726:  15%|#5        | 3/20 [00:04<00:26,  1.56s/it][I 2025-09-20 20:59:18,024] Trial 9 finished with value: 0.4618164334494164 and parameters: {'num_leaves': 110}. Best is trial 8 with value: 0.4603517885161204.
[I 2025-09-20 20:59:18,024] Trial 9 finished with value: 0.4618164334494164 and parameters: {'num_leaves': 110}. Best is trial 8 with value: 0.4603517885161204.
num_leaves, val_score: 0.457726:  15%|#5        | 3/20 [00:04<00:26,  1.56s/it]

Early stopping, best iteration is:
[57]	valid_0's binary_logloss: 0.358504	valid_1's binary_logloss: 0.461816
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726:  20%|##        | 4/20 [00:06<00:26,  1.68s/it][I 2025-09-20 20:59:19,866] Trial 10 finished with value: 0.46546063977024243 and parameters: {'num_leaves': 251}. Best is trial 8 with value: 0.4603517885161204.
num_leaves, val_score: 0.457726:  20%|##        | 4/20 [00:06<00:26,  1.68s/it][I 2025-09-20 20:59:19,866] Trial 10 finished with value: 0.46546063977024243 and parameters: {'num_leaves': 251}. Best is trial 8 with value: 0.4603517885161204.
num_leaves, val_score: 0.457726:  20%|##        | 4/20 [00:06<00:26,  1.68s/it]

Early stopping, best iteration is:
[43]	valid_0's binary_logloss: 0.311052	valid_1's binary_logloss: 0.465461
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726:  25%|##5       | 5/20 [00:08<00:26,  1.74s/it][I 2025-09-20 20:59:21,738] Trial 11 finished with value: 0.4651800473667685 and parameters: {'num_leaves': 254}. Best is trial 8 with value: 0.4603517885161204.
num_leaves, val_score: 0.457726:  25%|##5       | 5/20 [00:08<00:26,  1.74s/it][I 2025-09-20 20:59:21,738] Trial 11 finished with value: 0.4651800473667685 and parameters: {'num_leaves': 254}. Best is trial 8 with value: 0.4603517885161204.
num_leaves, val_score: 0.457726:  25%|##5       | 5/20 [00:08<00:26,  1.74s/it]

Early stopping, best iteration is:
[44]	valid_0's binary_logloss: 0.307122	valid_1's binary_logloss: 0.46518
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222


num_leaves, val_score: 0.457726:  30%|###       | 6/20 [00:09<00:22,  1.59s/it][I 2025-09-20 20:59:23,013] Trial 12 finished with value: 0.47160168178814754 and parameters: {'num_leaves': 2}. Best is trial 8 with value: 0.4603517885161204.
num_leaves, val_score: 0.457726:  30%|###       | 6/20 [00:09<00:22,  1.59s/it][I 2025-09-20 20:59:23,013] Trial 12 finished with value: 0.47160168178814754 and parameters: {'num_leaves': 2}. Best is trial 8 with value: 0.4603517885161204.
num_leaves, val_score: 0.457726:  30%|###       | 6/20 [00:09<00:22,  1.59s/it]

[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.467571	valid_1's binary_logloss: 0.475827
[200]	valid_0's binary_logloss: 0.462023	valid_1's binary_logloss: 0.471929
Early stopping, best iteration is:
[196]	valid_0's binary_logloss: 0.462178	valid_1's binary_logloss: 0.471602
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010634 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010634 seconds.
You c

num_leaves, val_score: 0.457726:  35%|###5      | 7/20 [00:11<00:21,  1.63s/it][I 2025-09-20 20:59:24,739] Trial 13 finished with value: 0.46490414317985945 and parameters: {'num_leaves': 181}. Best is trial 8 with value: 0.4603517885161204.
num_leaves, val_score: 0.457726:  35%|###5      | 7/20 [00:11<00:21,  1.63s/it][I 2025-09-20 20:59:24,739] Trial 13 finished with value: 0.46490414317985945 and parameters: {'num_leaves': 181}. Best is trial 8 with value: 0.4603517885161204.
num_leaves, val_score: 0.457726:  35%|###5      | 7/20 [00:11<00:21,  1.63s/it]

Early stopping, best iteration is:
[53]	valid_0's binary_logloss: 0.322072	valid_1's binary_logloss: 0.464904
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726:  40%|####      | 8/20 [00:12<00:18,  1.54s/it][I 2025-09-20 20:59:26,078] Trial 14 finished with value: 0.45869875997714266 and parameters: {'num_leaves': 8}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  40%|####      | 8/20 [00:12<00:18,  1.54s/it][I 2025-09-20 20:59:26,078] Trial 14 finished with value: 0.45869875997714266 and parameters: {'num_leaves': 8}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  40%|####      | 8/20 [00:12<00:18,  1.54s/it]

Early stopping, best iteration is:
[162]	valid_0's binary_logloss: 0.43596	valid_1's binary_logloss: 0.458699
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726:  45%|####5     | 9/20 [00:14<00:17,  1.60s/it][I 2025-09-20 20:59:27,813] Trial 15 finished with value: 0.4638308634892269 and parameters: {'num_leaves': 194}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  45%|####5     | 9/20 [00:14<00:17,  1.60s/it][I 2025-09-20 20:59:27,813] Trial 15 finished with value: 0.4638308634892269 and parameters: {'num_leaves': 194}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  45%|####5     | 9/20 [00:14<00:17,  1.60s/it]

Early stopping, best iteration is:
[48]	valid_0's binary_logloss: 0.325065	valid_1's binary_logloss: 0.463831
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010539 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010539 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726:  50%|#####     | 10/20 [00:15<00:15,  1.55s/it][I 2025-09-20 20:59:29,244] Trial 16 finished with value: 0.45988014804782174 and parameters: {'num_leaves': 57}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  50%|#####     | 10/20 [00:15<00:15,  1.55s/it][I 2025-09-20 20:59:29,244] Trial 16 finished with value: 0.45988014804782174 and parameters: {'num_leaves': 57}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  50%|#####     | 10/20 [00:15<00:15,  1.55s/it]

Early stopping, best iteration is:
[78]	valid_0's binary_logloss: 0.383385	valid_1's binary_logloss: 0.45988
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010509 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010509 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores 

num_leaves, val_score: 0.457726:  55%|#####5    | 11/20 [00:17<00:14,  1.62s/it][I 2025-09-20 20:59:31,056] Trial 17 finished with value: 0.46403071687041697 and parameters: {'num_leaves': 198}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  55%|#####5    | 11/20 [00:17<00:14,  1.62s/it][I 2025-09-20 20:59:31,056] Trial 17 finished with value: 0.46403071687041697 and parameters: {'num_leaves': 198}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  55%|#####5    | 11/20 [00:17<00:14,  1.62s/it]

Early stopping, best iteration is:
[53]	valid_0's binary_logloss: 0.312482	valid_1's binary_logloss: 0.464031
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010504 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010504 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726:  60%|######    | 12/20 [00:19<00:13,  1.63s/it][I 2025-09-20 20:59:32,691] Trial 18 finished with value: 0.46376903079468373 and parameters: {'num_leaves': 139}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  60%|######    | 12/20 [00:19<00:13,  1.63s/it][I 2025-09-20 20:59:32,691] Trial 18 finished with value: 0.46376903079468373 and parameters: {'num_leaves': 139}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  60%|######    | 12/20 [00:19<00:13,  1.63s/it]

Early stopping, best iteration is:
[55]	valid_0's binary_logloss: 0.34257	valid_1's binary_logloss: 0.463769
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores 

num_leaves, val_score: 0.457726:  65%|######5   | 13/20 [00:20<00:11,  1.59s/it][I 2025-09-20 20:59:34,172] Trial 19 finished with value: 0.45988014804782174 and parameters: {'num_leaves': 57}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  65%|######5   | 13/20 [00:20<00:11,  1.59s/it][I 2025-09-20 20:59:34,172] Trial 19 finished with value: 0.45988014804782174 and parameters: {'num_leaves': 57}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  65%|######5   | 13/20 [00:20<00:11,  1.59s/it]

Early stopping, best iteration is:
[78]	valid_0's binary_logloss: 0.383385	valid_1's binary_logloss: 0.45988
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores 

num_leaves, val_score: 0.457726:  70%|#######   | 14/20 [00:22<00:09,  1.56s/it][I 2025-09-20 20:59:35,679] Trial 20 finished with value: 0.461571793645563 and parameters: {'num_leaves': 95}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  70%|#######   | 14/20 [00:22<00:09,  1.56s/it][I 2025-09-20 20:59:35,679] Trial 20 finished with value: 0.461571793645563 and parameters: {'num_leaves': 95}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  70%|#######   | 14/20 [00:22<00:09,  1.56s/it]

Early stopping, best iteration is:
[57]	valid_0's binary_logloss: 0.370226	valid_1's binary_logloss: 0.461572
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010782 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010782 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726:  75%|#######5  | 15/20 [00:24<00:08,  1.62s/it][I 2025-09-20 20:59:37,446] Trial 21 finished with value: 0.46411474223534166 and parameters: {'num_leaves': 164}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  75%|#######5  | 15/20 [00:24<00:08,  1.62s/it][I 2025-09-20 20:59:37,446] Trial 21 finished with value: 0.46411474223534166 and parameters: {'num_leaves': 164}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  75%|#######5  | 15/20 [00:24<00:08,  1.62s/it]

Early stopping, best iteration is:
[60]	valid_0's binary_logloss: 0.319115	valid_1's binary_logloss: 0.464115
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010561 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010561 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726:  80%|########  | 16/20 [00:26<00:06,  1.71s/it][I 2025-09-20 20:59:39,350] Trial 22 finished with value: 0.4656602407386425 and parameters: {'num_leaves': 228}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  80%|########  | 16/20 [00:26<00:06,  1.71s/it][I 2025-09-20 20:59:39,350] Trial 22 finished with value: 0.4656602407386425 and parameters: {'num_leaves': 228}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  80%|########  | 16/20 [00:26<00:06,  1.71s/it]

Early stopping, best iteration is:
[57]	valid_0's binary_logloss: 0.28924	valid_1's binary_logloss: 0.46566
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010941 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010941 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores d

num_leaves, val_score: 0.457726:  85%|########5 | 17/20 [00:27<00:05,  1.75s/it][I 2025-09-20 20:59:41,197] Trial 23 finished with value: 0.46528267877452684 and parameters: {'num_leaves': 219}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  85%|########5 | 17/20 [00:27<00:05,  1.75s/it][I 2025-09-20 20:59:41,197] Trial 23 finished with value: 0.46528267877452684 and parameters: {'num_leaves': 219}. Best is trial 14 with value: 0.45869875997714266.
num_leaves, val_score: 0.457726:  85%|########5 | 17/20 [00:27<00:05,  1.75s/it]

Early stopping, best iteration is:
[53]	valid_0's binary_logloss: 0.301848	valid_1's binary_logloss: 0.465283
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010434 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010434 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726:  90%|######### | 18/20 [00:29<00:03,  1.64s/it][I 2025-09-20 20:59:42,584] Trial 24 finished with value: 0.4584663170224704 and parameters: {'num_leaves': 30}. Best is trial 24 with value: 0.4584663170224704.
num_leaves, val_score: 0.457726:  90%|######### | 18/20 [00:29<00:03,  1.64s/it][I 2025-09-20 20:59:42,584] Trial 24 finished with value: 0.4584663170224704 and parameters: {'num_leaves': 30}. Best is trial 24 with value: 0.4584663170224704.
num_leaves, val_score: 0.457726:  90%|######### | 18/20 [00:29<00:03,  1.64s/it]

[100]	valid_0's binary_logloss: 0.405641	valid_1's binary_logloss: 0.458768
Early stopping, best iteration is:
[107]	valid_0's binary_logloss: 0.402424	valid_1's binary_logloss: 0.458466
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM

num_leaves, val_score: 0.457726:  95%|#########5| 19/20 [00:30<00:01,  1.61s/it][I 2025-09-20 20:59:44,108] Trial 25 finished with value: 0.46207473963186196 and parameters: {'num_leaves': 103}. Best is trial 24 with value: 0.4584663170224704.
num_leaves, val_score: 0.457726:  95%|#########5| 19/20 [00:30<00:01,  1.61s/it][I 2025-09-20 20:59:44,108] Trial 25 finished with value: 0.46207473963186196 and parameters: {'num_leaves': 103}. Best is trial 24 with value: 0.4584663170224704.
num_leaves, val_score: 0.457726:  95%|#########5| 19/20 [00:30<00:01,  1.61s/it]

Early stopping, best iteration is:
[64]	valid_0's binary_logloss: 0.355184	valid_1's binary_logloss: 0.462075
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011267 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011267 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

num_leaves, val_score: 0.457726: 100%|##########| 20/20 [00:32<00:00,  1.56s/it][I 2025-09-20 20:59:45,558] Trial 26 finished with value: 0.4586297339630494 and parameters: {'num_leaves': 32}. Best is trial 24 with value: 0.4584663170224704.
num_leaves, val_score: 0.457726: 100%|##########| 20/20 [00:32<00:00,  1.61s/it]
num_leaves, val_score: 0.457726: 100%|##########| 20/20 [00:32<00:00,  1.56s/it][I 2025-09-20 20:59:45,558] Trial 26 finished with value: 0.4586297339630494 and parameters: {'num_leaves': 32}. Best is trial 24 with value: 0.4584663170224704.
num_leaves, val_score: 0.457726: 100%|##########| 20/20 [00:32<00:00,  1.61s/it]


[100]	valid_0's binary_logloss: 0.402973	valid_1's binary_logloss: 0.458898
Early stopping, best iteration is:
[105]	valid_0's binary_logloss: 0.400564	valid_1's binary_logloss: 0.45863


bagging, val_score: 0.457726:   0%|          | 0/10 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011217 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.457726:  10%|#         | 1/10 [00:01<00:11,  1.32s/it][I 2025-09-20 20:59:46,883] Trial 27 finished with value: 0.46150577775951934 and parameters: {'bagging_fraction': 0.509934059998701, 'bagging_freq': 2}. Best is trial 27 with value: 0.46150577775951934.
bagging, val_score: 0.457726:  10%|#         | 1/10 [00:01<00:11,  1.32s/it][I 2025-09-20 20:59:46,883] Trial 27 finished with value: 0.46150577775951934 and parameters: {'bagging_fraction': 0.509934059998701, 'bagging_freq': 2}. Best is trial 27 with value: 0.46150577775951934.
bagging, val_score: 0.457726:  10%|#         | 1/10 [00:01<00:11,  1.32s/it]

Early stopping, best iteration is:
[79]	valid_0's binary_logloss: 0.414662	valid_1's binary_logloss: 0.461506
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010920 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010920 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

bagging, val_score: 0.457726:  20%|##        | 2/10 [00:02<00:10,  1.36s/it][I 2025-09-20 20:59:48,271] Trial 28 finished with value: 0.4590783280720997 and parameters: {'bagging_fraction': 0.995516796793986, 'bagging_freq': 7}. Best is trial 28 with value: 0.4590783280720997.
bagging, val_score: 0.457726:  20%|##        | 2/10 [00:02<00:10,  1.36s/it][I 2025-09-20 20:59:48,271] Trial 28 finished with value: 0.4590783280720997 and parameters: {'bagging_fraction': 0.995516796793986, 'bagging_freq': 7}. Best is trial 28 with value: 0.4590783280720997.
bagging, val_score: 0.457726:  20%|##        | 2/10 [00:02<00:10,  1.36s/it]

Early stopping, best iteration is:
[84]	valid_0's binary_logloss: 0.4116	valid_1's binary_logloss: 0.459078
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011001 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011001 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores d

bagging, val_score: 0.457726:  30%|###       | 3/10 [00:04<00:09,  1.41s/it][I 2025-09-20 20:59:49,749] Trial 29 finished with value: 0.458669943738456 and parameters: {'bagging_fraction': 0.8730732883159426, 'bagging_freq': 6}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  30%|###       | 3/10 [00:04<00:09,  1.41s/it][I 2025-09-20 20:59:49,749] Trial 29 finished with value: 0.458669943738456 and parameters: {'bagging_fraction': 0.8730732883159426, 'bagging_freq': 6}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  30%|###       | 3/10 [00:04<00:09,  1.41s/it]

[100]	valid_0's binary_logloss: 0.404097	valid_1's binary_logloss: 0.458905
Early stopping, best iteration is:
[109]	valid_0's binary_logloss: 0.399812	valid_1's binary_logloss: 0.45867
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM]

bagging, val_score: 0.457726:  40%|####      | 4/10 [00:05<00:08,  1.40s/it][I 2025-09-20 20:59:51,128] Trial 30 finished with value: 0.461582165882949 and parameters: {'bagging_fraction': 0.43883386690550286, 'bagging_freq': 1}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  40%|####      | 4/10 [00:05<00:08,  1.40s/it][I 2025-09-20 20:59:51,128] Trial 30 finished with value: 0.461582165882949 and parameters: {'bagging_fraction': 0.43883386690550286, 'bagging_freq': 1}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  40%|####      | 4/10 [00:05<00:08,  1.40s/it]

[100]	valid_0's binary_logloss: 0.405998	valid_1's binary_logloss: 0.462012
Early stopping, best iteration is:
[91]	valid_0's binary_logloss: 0.409858	valid_1's binary_logloss: 0.461582
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010913 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010913 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM]

bagging, val_score: 0.457726:  50%|#####     | 5/10 [00:06<00:06,  1.38s/it][I 2025-09-20 20:59:52,483] Trial 31 finished with value: 0.4604026278021265 and parameters: {'bagging_fraction': 0.6761289428986079, 'bagging_freq': 4}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  50%|#####     | 5/10 [00:06<00:06,  1.38s/it][I 2025-09-20 20:59:52,483] Trial 31 finished with value: 0.4604026278021265 and parameters: {'bagging_fraction': 0.6761289428986079, 'bagging_freq': 4}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  50%|#####     | 5/10 [00:06<00:06,  1.38s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.411209	valid_1's binary_logloss: 0.460403
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010730 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010730 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

bagging, val_score: 0.457726:  60%|######    | 6/10 [00:08<00:05,  1.39s/it][I 2025-09-20 20:59:53,882] Trial 32 finished with value: 0.4595854212070941 and parameters: {'bagging_fraction': 0.7089779038530023, 'bagging_freq': 4}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  60%|######    | 6/10 [00:08<00:05,  1.39s/it][I 2025-09-20 20:59:53,882] Trial 32 finished with value: 0.4595854212070941 and parameters: {'bagging_fraction': 0.7089779038530023, 'bagging_freq': 4}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  60%|######    | 6/10 [00:08<00:05,  1.39s/it]

Early stopping, best iteration is:
[85]	valid_0's binary_logloss: 0.411177	valid_1's binary_logloss: 0.459585
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

bagging, val_score: 0.457726:  70%|#######   | 7/10 [00:09<00:04,  1.39s/it][I 2025-09-20 20:59:55,281] Trial 33 finished with value: 0.46013015885348296 and parameters: {'bagging_fraction': 0.6284215432565106, 'bagging_freq': 5}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  70%|#######   | 7/10 [00:09<00:04,  1.39s/it][I 2025-09-20 20:59:55,281] Trial 33 finished with value: 0.46013015885348296 and parameters: {'bagging_fraction': 0.6284215432565106, 'bagging_freq': 5}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  70%|#######   | 7/10 [00:09<00:04,  1.39s/it]

[100]	valid_0's binary_logloss: 0.40488	valid_1's binary_logloss: 0.46024
Early stopping, best iteration is:
[101]	valid_0's binary_logloss: 0.404412	valid_1's binary_logloss: 0.46013
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [

bagging, val_score: 0.457726:  80%|########  | 8/10 [00:11<00:02,  1.41s/it][I 2025-09-20 20:59:56,723] Trial 34 finished with value: 0.4587315886783178 and parameters: {'bagging_fraction': 0.8337970026983913, 'bagging_freq': 2}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  80%|########  | 8/10 [00:11<00:02,  1.41s/it][I 2025-09-20 20:59:56,723] Trial 34 finished with value: 0.4587315886783178 and parameters: {'bagging_fraction': 0.8337970026983913, 'bagging_freq': 2}. Best is trial 29 with value: 0.458669943738456.
bagging, val_score: 0.457726:  80%|########  | 8/10 [00:11<00:02,  1.41s/it]

[100]	valid_0's binary_logloss: 0.40437	valid_1's binary_logloss: 0.458894
Early stopping, best iteration is:
[97]	valid_0's binary_logloss: 0.405694	valid_1's binary_logloss: 0.458732
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] 

bagging, val_score: 0.457726:  90%|######### | 9/10 [00:12<00:01,  1.41s/it][I 2025-09-20 20:59:58,180] Trial 35 finished with value: 0.45817498825190117 and parameters: {'bagging_fraction': 0.998021017074001, 'bagging_freq': 7}. Best is trial 35 with value: 0.45817498825190117.
bagging, val_score: 0.457726:  90%|######### | 9/10 [00:12<00:01,  1.41s/it][I 2025-09-20 20:59:58,180] Trial 35 finished with value: 0.45817498825190117 and parameters: {'bagging_fraction': 0.998021017074001, 'bagging_freq': 7}. Best is trial 35 with value: 0.45817498825190117.
bagging, val_score: 0.457726:  90%|######### | 9/10 [00:12<00:01,  1.41s/it]

Early stopping, best iteration is:
[87]	valid_0's binary_logloss: 0.40993	valid_1's binary_logloss: 0.458175
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores 

bagging, val_score: 0.457726: 100%|##########| 10/10 [00:14<00:00,  1.42s/it][I 2025-09-20 20:59:59,579] Trial 36 finished with value: 0.4594343100788162 and parameters: {'bagging_fraction': 0.9898496126986905, 'bagging_freq': 7}. Best is trial 35 with value: 0.45817498825190117.
bagging, val_score: 0.457726: 100%|##########| 10/10 [00:14<00:00,  1.40s/it]
bagging, val_score: 0.457726: 100%|##########| 10/10 [00:14<00:00,  1.42s/it][I 2025-09-20 20:59:59,579] Trial 36 finished with value: 0.4594343100788162 and parameters: {'bagging_fraction': 0.9898496126986905, 'bagging_freq': 7}. Best is trial 35 with value: 0.45817498825190117.
bagging, val_score: 0.457726: 100%|##########| 10/10 [00:14<00:00,  1.40s/it]


Early stopping, best iteration is:
[78]	valid_0's binary_logloss: 0.414748	valid_1's binary_logloss: 0.459434


feature_fraction_stage2, val_score: 0.457726:   0%|          | 0/3 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011392 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds


feature_fraction_stage2, val_score: 0.457506:  33%|###3      | 1/3 [00:01<00:03,  1.50s/it][I 2025-09-20 21:00:01,088] Trial 37 finished with value: 0.4575064097631174 and parameters: {'feature_fraction': 0.44800000000000006}. Best is trial 37 with value: 0.4575064097631174.
feature_fraction_stage2, val_score: 0.457506:  33%|###3      | 1/3 [00:01<00:03,  1.50s/it][I 2025-09-20 21:00:01,088] Trial 37 finished with value: 0.4575064097631174 and parameters: {'feature_fraction': 0.44800000000000006}. Best is trial 37 with value: 0.4575064097631174.
feature_fraction_stage2, val_score: 0.457506:  33%|###3      | 1/3 [00:01<00:03,  1.50s/it]

[100]	valid_0's binary_logloss: 0.403607	valid_1's binary_logloss: 0.457772
Early stopping, best iteration is:
[114]	valid_0's binary_logloss: 0.397421	valid_1's binary_logloss: 0.457506
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011993 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011993 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM

feature_fraction_stage2, val_score: 0.456849:  67%|######6   | 2/3 [00:02<00:01,  1.45s/it][I 2025-09-20 21:00:02,505] Trial 38 finished with value: 0.4568493967701288 and parameters: {'feature_fraction': 0.48000000000000004}. Best is trial 38 with value: 0.4568493967701288.
feature_fraction_stage2, val_score: 0.456849:  67%|######6   | 2/3 [00:02<00:01,  1.45s/it][I 2025-09-20 21:00:02,505] Trial 38 finished with value: 0.4568493967701288 and parameters: {'feature_fraction': 0.48000000000000004}. Best is trial 38 with value: 0.4568493967701288.
feature_fraction_stage2, val_score: 0.456849:  67%|######6   | 2/3 [00:02<00:01,  1.45s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

feature_fraction_stage2, val_score: 0.456849: 100%|##########| 3/3 [00:04<00:00,  1.44s/it][I 2025-09-20 21:00:03,924] Trial 39 finished with value: 0.4584895937021888 and parameters: {'feature_fraction': 0.41600000000000004}. Best is trial 38 with value: 0.4568493967701288.
feature_fraction_stage2, val_score: 0.456849: 100%|##########| 3/3 [00:04<00:00,  1.45s/it]
feature_fraction_stage2, val_score: 0.456849: 100%|##########| 3/3 [00:04<00:00,  1.44s/it][I 2025-09-20 21:00:03,924] Trial 39 finished with value: 0.4584895937021888 and parameters: {'feature_fraction': 0.41600000000000004}. Best is trial 38 with value: 0.4568493967701288.
feature_fraction_stage2, val_score: 0.456849: 100%|##########| 3/3 [00:04<00:00,  1.45s/it]


[100]	valid_0's binary_logloss: 0.403773	valid_1's binary_logloss: 0.458627
Early stopping, best iteration is:
[92]	valid_0's binary_logloss: 0.407391	valid_1's binary_logloss: 0.45849


regularization_factors, val_score: 0.456849:   0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds


regularization_factors, val_score: 0.456849:   5%|5         | 1/20 [00:01<00:27,  1.47s/it][I 2025-09-20 21:00:05,399] Trial 40 finished with value: 0.4568493965991526 and parameters: {'lambda_l1': 4.456674336746826e-07, 'lambda_l2': 4.2917482574824355e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:   5%|5         | 1/20 [00:01<00:27,  1.47s/it][I 2025-09-20 21:00:05,399] Trial 40 finished with value: 0.4568493965991526 and parameters: {'lambda_l1': 4.456674336746826e-07, 'lambda_l2': 4.2917482574824355e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:   5%|5         | 1/20 [00:01<00:27,  1.47s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010880 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010880 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  10%|#         | 2/20 [00:02<00:26,  1.47s/it][I 2025-09-20 21:00:06,865] Trial 41 finished with value: 0.45684972173480354 and parameters: {'lambda_l1': 1.1183531548969373e-07, 'lambda_l2': 1.8251521766817335e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  10%|#         | 2/20 [00:02<00:26,  1.47s/it][I 2025-09-20 21:00:06,865] Trial 41 finished with value: 0.45684972173480354 and parameters: {'lambda_l1': 1.1183531548969373e-07, 'lambda_l2': 1.8251521766817335e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  10%|#         | 2/20 [00:02<00:26,  1.47s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.45685
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores 

regularization_factors, val_score: 0.456849:  15%|#5        | 3/20 [00:04<00:25,  1.47s/it][I 2025-09-20 21:00:08,343] Trial 42 finished with value: 0.45684939670776487 and parameters: {'lambda_l1': 1.455032697305869e-07, 'lambda_l2': 1.6247733464520895e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  15%|#5        | 3/20 [00:04<00:25,  1.47s/it][I 2025-09-20 21:00:08,343] Trial 42 finished with value: 0.45684939670776487 and parameters: {'lambda_l1': 1.455032697305869e-07, 'lambda_l2': 1.6247733464520895e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  15%|#5        | 3/20 [00:04<00:25,  1.47s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010800 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010800 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  20%|##        | 4/20 [00:05<00:23,  1.47s/it][I 2025-09-20 21:00:09,820] Trial 43 finished with value: 0.45684939672780506 and parameters: {'lambda_l1': 8.757761439044632e-08, 'lambda_l2': 1.1068300320940581e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  20%|##        | 4/20 [00:05<00:23,  1.47s/it][I 2025-09-20 21:00:09,820] Trial 43 finished with value: 0.45684939672780506 and parameters: {'lambda_l1': 8.757761439044632e-08, 'lambda_l2': 1.1068300320940581e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  20%|##        | 4/20 [00:05<00:23,  1.47s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010658 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010658 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  25%|##5       | 5/20 [00:07<00:21,  1.47s/it][I 2025-09-20 21:00:11,270] Trial 44 finished with value: 0.45684939671595515 and parameters: {'lambda_l1': 1.6699915501672861e-07, 'lambda_l2': 1.2562409992184724e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  25%|##5       | 5/20 [00:07<00:21,  1.47s/it][I 2025-09-20 21:00:11,270] Trial 44 finished with value: 0.45684939671595515 and parameters: {'lambda_l1': 1.6699915501672861e-07, 'lambda_l2': 1.2562409992184724e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  25%|##5       | 5/20 [00:07<00:21,  1.47s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  30%|###       | 6/20 [00:08<00:20,  1.46s/it][I 2025-09-20 21:00:12,733] Trial 45 finished with value: 0.45684939672036934 and parameters: {'lambda_l1': 1.2782038588447703e-07, 'lambda_l2': 1.2417127030063048e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  30%|###       | 6/20 [00:08<00:20,  1.46s/it][I 2025-09-20 21:00:12,733] Trial 45 finished with value: 0.45684939672036934 and parameters: {'lambda_l1': 1.2782038588447703e-07, 'lambda_l2': 1.2417127030063048e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  30%|###       | 6/20 [00:08<00:20,  1.46s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  35%|###5      | 7/20 [00:10<00:19,  1.46s/it][I 2025-09-20 21:00:14,198] Trial 46 finished with value: 0.45684972174684185 and parameters: {'lambda_l1': 1.4230612781878957e-07, 'lambda_l2': 1.326063411135187e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  35%|###5      | 7/20 [00:10<00:19,  1.46s/it][I 2025-09-20 21:00:14,198] Trial 46 finished with value: 0.45684972174684185 and parameters: {'lambda_l1': 1.4230612781878957e-07, 'lambda_l2': 1.326063411135187e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  35%|###5      | 7/20 [00:10<00:19,  1.46s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.45685
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010832 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010832 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores 

regularization_factors, val_score: 0.456849:  40%|####      | 8/20 [00:11<00:17,  1.47s/it][I 2025-09-20 21:00:15,666] Trial 47 finished with value: 0.45684939672030045 and parameters: {'lambda_l1': 1.2870120549530697e-07, 'lambda_l2': 1.264497484005355e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  40%|####      | 8/20 [00:11<00:17,  1.47s/it][I 2025-09-20 21:00:15,666] Trial 47 finished with value: 0.45684939672030045 and parameters: {'lambda_l1': 1.2870120549530697e-07, 'lambda_l2': 1.264497484005355e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  40%|####      | 8/20 [00:11<00:17,  1.47s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  45%|####5     | 9/20 [00:13<00:16,  1.46s/it][I 2025-09-20 21:00:17,108] Trial 48 finished with value: 0.4568493966687069 and parameters: {'lambda_l1': 2.3256093228807562e-07, 'lambda_l2': 2.6933612800349865e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  45%|####5     | 9/20 [00:13<00:16,  1.46s/it][I 2025-09-20 21:00:17,108] Trial 48 finished with value: 0.4568493966687069 and parameters: {'lambda_l1': 2.3256093228807562e-07, 'lambda_l2': 2.6933612800349865e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  45%|####5     | 9/20 [00:13<00:16,  1.46s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011626 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011626 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  50%|#####     | 10/20 [00:14<00:14,  1.48s/it][I 2025-09-20 21:00:18,625] Trial 49 finished with value: 0.45684972165918597 and parameters: {'lambda_l1': 4.599001856586292e-07, 'lambda_l2': 3.2685690159155535e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  50%|#####     | 10/20 [00:14<00:14,  1.48s/it][I 2025-09-20 21:00:18,625] Trial 49 finished with value: 0.45684972165918597 and parameters: {'lambda_l1': 4.599001856586292e-07, 'lambda_l2': 3.2685690159155535e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  50%|#####     | 10/20 [00:14<00:14,  1.48s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.45685
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011173 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011173 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores 

regularization_factors, val_score: 0.456849:  55%|#####5    | 11/20 [00:16<00:13,  1.49s/it][I 2025-09-20 21:00:20,135] Trial 50 finished with value: 0.4583570852966427 and parameters: {'lambda_l1': 1.627309013582128e-05, 'lambda_l2': 2.5356792654238476e-05}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  55%|#####5    | 11/20 [00:16<00:13,  1.49s/it][I 2025-09-20 21:00:20,135] Trial 50 finished with value: 0.4583570852966427 and parameters: {'lambda_l1': 1.627309013582128e-05, 'lambda_l2': 2.5356792654238476e-05}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  55%|#####5    | 11/20 [00:16<00:13,  1.49s/it]

Early stopping, best iteration is:
[82]	valid_0's binary_logloss: 0.411516	valid_1's binary_logloss: 0.458357
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010973 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010973 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  60%|######    | 12/20 [00:17<00:11,  1.48s/it][I 2025-09-20 21:00:21,608] Trial 51 finished with value: 0.4568497217617327 and parameters: {'lambda_l1': 1.2215870025363202e-07, 'lambda_l2': 8.609975143800277e-08}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  60%|######    | 12/20 [00:17<00:11,  1.48s/it][I 2025-09-20 21:00:21,608] Trial 51 finished with value: 0.4568497217617327 and parameters: {'lambda_l1': 1.2215870025363202e-07, 'lambda_l2': 8.609975143800277e-08}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  60%|######    | 12/20 [00:17<00:11,  1.48s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.45685
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores 

regularization_factors, val_score: 0.456849:  65%|######5   | 13/20 [00:19<00:10,  1.48s/it][I 2025-09-20 21:00:23,070] Trial 52 finished with value: 0.4572909381270614 and parameters: {'lambda_l1': 0.10039505779909237, 'lambda_l2': 4.432166854026596e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  65%|######5   | 13/20 [00:19<00:10,  1.48s/it][I 2025-09-20 21:00:23,070] Trial 52 finished with value: 0.4572909381270614 and parameters: {'lambda_l1': 0.10039505779909237, 'lambda_l2': 4.432166854026596e-07}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  65%|######5   | 13/20 [00:19<00:10,  1.48s/it]

Early stopping, best iteration is:
[84]	valid_0's binary_logloss: 0.41062	valid_1's binary_logloss: 0.457291
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011379 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011379 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores 

regularization_factors, val_score: 0.456849:  70%|#######   | 14/20 [00:20<00:08,  1.47s/it][I 2025-09-20 21:00:24,534] Trial 53 finished with value: 0.458239014344573 and parameters: {'lambda_l1': 1.96682269902887e-06, 'lambda_l2': 0.2840050553440961}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  70%|#######   | 14/20 [00:20<00:08,  1.47s/it][I 2025-09-20 21:00:24,534] Trial 53 finished with value: 0.458239014344573 and parameters: {'lambda_l1': 1.96682269902887e-06, 'lambda_l2': 0.2840050553440961}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  70%|#######   | 14/20 [00:20<00:08,  1.47s/it]

Early stopping, best iteration is:
[71]	valid_0's binary_logloss: 0.4173	valid_1's binary_logloss: 0.458239
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores d

regularization_factors, val_score: 0.456849:  75%|#######5  | 15/20 [00:22<00:07,  1.48s/it][I 2025-09-20 21:00:26,031] Trial 54 finished with value: 0.4568493967627082 and parameters: {'lambda_l1': 1.5039924061505716e-08, 'lambda_l2': 1.644645577757096e-08}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  75%|#######5  | 15/20 [00:22<00:07,  1.48s/it][I 2025-09-20 21:00:26,031] Trial 54 finished with value: 0.4568493967627082 and parameters: {'lambda_l1': 1.5039924061505716e-08, 'lambda_l2': 1.644645577757096e-08}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  75%|#######5  | 15/20 [00:22<00:07,  1.48s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014594 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014594 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  80%|########  | 16/20 [00:23<00:05,  1.47s/it][I 2025-09-20 21:00:27,479] Trial 55 finished with value: 0.45835905945315025 and parameters: {'lambda_l1': 3.3196223726977127e-06, 'lambda_l2': 8.747511703832065e-06}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  80%|########  | 16/20 [00:23<00:05,  1.47s/it][I 2025-09-20 21:00:27,479] Trial 55 finished with value: 0.45835905945315025 and parameters: {'lambda_l1': 3.3196223726977127e-06, 'lambda_l2': 8.747511703832065e-06}. Best is trial 40 with value: 0.4568493965991526.
regularization_factors, val_score: 0.456849:  80%|########  | 16/20 [00:23<00:05,  1.47s/it]

Early stopping, best iteration is:
[82]	valid_0's binary_logloss: 0.411516	valid_1's binary_logloss: 0.458359
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  85%|########5 | 17/20 [00:25<00:04,  1.49s/it][I 2025-09-20 21:00:29,015] Trial 56 finished with value: 0.45684939623256426 and parameters: {'lambda_l1': 1.1255918159686296e-08, 'lambda_l2': 1.807674829570502e-06}. Best is trial 56 with value: 0.45684939623256426.
[I 2025-09-20 21:00:29,015] Trial 56 finished with value: 0.45684939623256426 and parameters: {'lambda_l1': 1.1255918159686296e-08, 'lambda_l2': 1.807674829570502e-06}. Best is trial 56 with value: 0.45684939623256426.
regularization_factors, val_score: 0.456849:  85%|########5 | 17/20 [00:25<00:04,  1.49s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  90%|######### | 18/20 [00:26<00:02,  1.50s/it][I 2025-09-20 21:00:30,530] Trial 57 finished with value: 0.45684939478437014 and parameters: {'lambda_l1': 1.6763892487401544e-08, 'lambda_l2': 6.687961607079334e-06}. Best is trial 57 with value: 0.45684939478437014.
regularization_factors, val_score: 0.456849:  90%|######### | 18/20 [00:26<00:02,  1.50s/it][I 2025-09-20 21:00:30,530] Trial 57 finished with value: 0.45684939478437014 and parameters: {'lambda_l1': 1.6763892487401544e-08, 'lambda_l2': 6.687961607079334e-06}. Best is trial 57 with value: 0.45684939478437014.
regularization_factors, val_score: 0.456849:  90%|######### | 18/20 [00:26<00:02,  1.50s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011452 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011452 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

regularization_factors, val_score: 0.456849:  95%|#########5| 19/20 [00:28<00:01,  1.51s/it][I 2025-09-20 21:00:32,071] Trial 58 finished with value: 0.4568497193637779 and parameters: {'lambda_l1': 1.2789135888644512e-08, 'lambda_l2': 8.220997140427488e-06}. Best is trial 57 with value: 0.45684939478437014.
regularization_factors, val_score: 0.456849:  95%|#########5| 19/20 [00:28<00:01,  1.51s/it][I 2025-09-20 21:00:32,071] Trial 58 finished with value: 0.4568497193637779 and parameters: {'lambda_l1': 1.2789135888644512e-08, 'lambda_l2': 8.220997140427488e-06}. Best is trial 57 with value: 0.45684939478437014.
regularization_factors, val_score: 0.456849:  95%|#########5| 19/20 [00:28<00:01,  1.51s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.45685
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010898 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010898 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores 

regularization_factors, val_score: 0.456849: 100%|##########| 20/20 [00:29<00:00,  1.50s/it][I 2025-09-20 21:00:33,542] Trial 59 finished with value: 0.45684939586921186 and parameters: {'lambda_l1': 1.8936479948880363e-08, 'lambda_l2': 3.036219536968878e-06}. Best is trial 57 with value: 0.45684939478437014.
regularization_factors, val_score: 0.456849: 100%|##########| 20/20 [00:29<00:00,  1.48s/it]
regularization_factors, val_score: 0.456849: 100%|##########| 20/20 [00:29<00:00,  1.50s/it][I 2025-09-20 21:00:33,542] Trial 59 finished with value: 0.45684939586921186 and parameters: {'lambda_l1': 1.8936479948880363e-08, 'lambda_l2': 3.036219536968878e-06}. Best is trial 57 with value: 0.45684939478437014.
regularization_factors, val_score: 0.456849: 100%|##########| 20/20 [00:29<00:00,  1.48s/it]


Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.409126	valid_1's binary_logloss: 0.456849


min_child_samples, val_score: 0.456849:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010706 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores don't improve for 10 rounds


min_child_samples, val_score: 0.456849:  20%|##        | 1/5 [00:01<00:05,  1.49s/it][I 2025-09-20 21:00:35,041] Trial 60 finished with value: 0.45914302756584013 and parameters: {'min_child_samples': 5}. Best is trial 60 with value: 0.45914302756584013.
min_child_samples, val_score: 0.456849:  20%|##        | 1/5 [00:01<00:05,  1.49s/it][I 2025-09-20 21:00:35,041] Trial 60 finished with value: 0.45914302756584013 and parameters: {'min_child_samples': 5}. Best is trial 60 with value: 0.45914302756584013.
min_child_samples, val_score: 0.456849:  20%|##        | 1/5 [00:01<00:05,  1.49s/it]

[100]	valid_0's binary_logloss: 0.401796	valid_1's binary_logloss: 0.459303
Early stopping, best iteration is:
[101]	valid_0's binary_logloss: 0.401196	valid_1's binary_logloss: 0.459143
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010772 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010772 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM

min_child_samples, val_score: 0.456849:  40%|####      | 2/5 [00:02<00:04,  1.45s/it][I 2025-09-20 21:00:36,466] Trial 61 finished with value: 0.45778317000772445 and parameters: {'min_child_samples': 50}. Best is trial 61 with value: 0.45778317000772445.
min_child_samples, val_score: 0.456849:  40%|####      | 2/5 [00:02<00:04,  1.45s/it][I 2025-09-20 21:00:36,466] Trial 61 finished with value: 0.45778317000772445 and parameters: {'min_child_samples': 50}. Best is trial 61 with value: 0.45778317000772445.
min_child_samples, val_score: 0.456849:  40%|####      | 2/5 [00:02<00:04,  1.45s/it]

Early stopping, best iteration is:
[77]	valid_0's binary_logloss: 0.414636	valid_1's binary_logloss: 0.457783
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

min_child_samples, val_score: 0.456849:  60%|######    | 3/5 [00:04<00:02,  1.46s/it][I 2025-09-20 21:00:37,934] Trial 62 finished with value: 0.4577593775857837 and parameters: {'min_child_samples': 25}. Best is trial 62 with value: 0.4577593775857837.
min_child_samples, val_score: 0.456849:  60%|######    | 3/5 [00:04<00:02,  1.46s/it][I 2025-09-20 21:00:37,934] Trial 62 finished with value: 0.4577593775857837 and parameters: {'min_child_samples': 25}. Best is trial 62 with value: 0.4577593775857837.
min_child_samples, val_score: 0.456849:  60%|######    | 3/5 [00:04<00:02,  1.46s/it]

Early stopping, best iteration is:
[82]	valid_0's binary_logloss: 0.411892	valid_1's binary_logloss: 0.457759
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011776 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011776 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

min_child_samples, val_score: 0.456849:  80%|########  | 4/5 [00:05<00:01,  1.45s/it][I 2025-09-20 21:00:39,370] Trial 63 finished with value: 0.4589707322123357 and parameters: {'min_child_samples': 10}. Best is trial 62 with value: 0.4577593775857837.
[I 2025-09-20 21:00:39,370] Trial 63 finished with value: 0.4589707322123357 and parameters: {'min_child_samples': 10}. Best is trial 62 with value: 0.4577593775857837.
min_child_samples, val_score: 0.456849:  80%|########  | 4/5 [00:05<00:01,  1.45s/it]

Early stopping, best iteration is:
[72]	valid_0's binary_logloss: 0.415867	valid_1's binary_logloss: 0.458971
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010791 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] Number of positive: 13851, number of negative: 49890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010791 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37657
[LightGBM] [Info] Number of data points in the train set: 63741, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217301 -> initscore=-1.281463
[LightGBM] [Info] Start training from score -1.281463
Training until validation scores

min_child_samples, val_score: 0.456849: 100%|##########| 5/5 [00:07<00:00,  1.47s/it][I 2025-09-20 21:00:40,862] Trial 64 finished with value: 0.45729607530110644 and parameters: {'min_child_samples': 100}. Best is trial 64 with value: 0.45729607530110644.
min_child_samples, val_score: 0.456849: 100%|##########| 5/5 [00:07<00:00,  1.47s/it][I 2025-09-20 21:00:40,862] Trial 64 finished with value: 0.45729607530110644 and parameters: {'min_child_samples': 100}. Best is trial 64 with value: 0.45729607530110644.
min_child_samples, val_score: 0.456849: 100%|##########| 5/5 [00:07<00:00,  1.46s/it]

Early stopping, best iteration is:
[88]	valid_0's binary_logloss: 0.410604	valid_1's binary_logloss: 0.457296



[WinError 2] 指定されたファイルが見つかりません。
  File "c:\Users\koxyg\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\koxyg\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\koxyg\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\koxyg\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[WinError 2] 指定されたファイルが見つかりません。
  File "c:\Users\koxyg\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.

[LightGBM] [Info] Number of positive: 19780, number of negative: 70451
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37906
[LightGBM] [Info] Number of data points in the train set: 90231, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.219215 -> initscore=-1.270246
[LightGBM] [Info] Start training from score -1.270246
[LightGBM] [Info] Number of positive: 19780, number of negative: 70451
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37906
[LightGBM] [Info] Number of data points in the train set: 90231, number of used features: 222
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.219215 -> initscore=-1.270246
[LightGBM] [Info] Start training from score -1.270246
AUC: 0.8

In [37]:
# Show the feature importances reported by keiba_ai.
keiba_ai.feature_importance()

Unnamed: 0,features,importance
184,age_days,93
10,n_horses,89
4,jockey_id,87
8,体重,86
183,interval,69
85,賞金_allR,55
120,breeder_id,53
6,owner_id,43
263,race_class_未勝利,42
14,着差_5R,40


In [38]:
# Inspect the hyperparameters currently held by keiba_ai.
keiba_ai.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'feature_pre_filter': False,
 'lambda_l1': 1.6763892487401544e-08,
 'lambda_l2': 6.687961607079334e-06,
 'feature_fraction': 0.48000000000000004,
 'bagging_fraction': 1.0,
 'bagging_freq': 0}

In [None]:
# Train without hyperparameter tuning (disabled; uncomment to use).
#keiba_ai.train_without_tuning()

In [39]:
# Save the model. The model and its datasets are stored at models/(execution date)/(version_name).pickle.
training.KeibaAIFactory.save(keiba_ai, version_name='basemodel_2024_2025')

In [40]:
# Load the saved model.
keiba_ai = training.KeibaAIFactory.load('models/20250920/basemodel_2024_2025.pickle')
# NOTE(review): this feeds the model's own parameters straight back into it; confirm why the round trip is needed.
keiba_ai.set_params(keiba_ai.get_params())

# 5. シミュレーション

In [None]:
%autoreload

In [None]:
# Set the betting tickets on the simulator by building it from return_processor.
# return_processor is not defined in this cell; it must already exist in the kernel.
simulator = simulation.Simulator(return_processor)

In [None]:
# Compute the score table for the test features (X_test) using StdScorePolicy.
score_table = keiba_ai.calc_score(keiba_ai.datasets.X_test, policies.StdScorePolicy)

## 5.1. 単一threshold

### 5.1.1 単勝馬券

In [None]:
# Sweep settings: [lower, upper] bounds of the threshold and the number of steps.
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Evaluate the tansho (win) betting results while sweeping the threshold
# applied to the "win-likelihood score".
for step in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps
    # (step stops at N_SAMPLES - 1, so the upper bound itself is not evaluated).
    threshold = T_RANGE[1] * step / N_SAMPLES + T_RANGE[0] * (1 - (step / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold, then simulate them.
        actions = keiba_ai.decide_action(score_table, policies.BetPolicyTansho, threshold=threshold)
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        # Report the error and stop the sweep; results gathered so far are kept.
        print(e)
        break
# One row per threshold, with the threshold as the (named) index.
returns_df = pd.DataFrame.from_dict(returns, orient='index').rename_axis('threshold')

In [None]:
# Keeping the simulation results under models/ as well makes them easy to find.
returns_df.to_pickle('models/20250920/tansho.pickle')

In [None]:
# Plot the return rate of the tansho sweep.
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='tansho')

### 5.1.2 複勝馬券

In [None]:
# Sweep settings: [lower, upper] bounds of the threshold and the number of steps.
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Evaluate the fukusho (place) betting results while sweeping the threshold
# applied to the "win-likelihood score".
for step in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps
    # (step stops at N_SAMPLES - 1, so the upper bound itself is not evaluated).
    threshold = T_RANGE[1] * step / N_SAMPLES + T_RANGE[0] * (1 - (step / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold, then simulate them.
        actions = keiba_ai.decide_action(score_table, policies.BetPolicyFukusho, threshold=threshold)
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        # Report the error and stop the sweep; results gathered so far are kept.
        print(e)
        break
# One row per threshold, with the threshold as the (named) index.
returns_df = pd.DataFrame.from_dict(returns, orient='index').rename_axis('threshold')

In [None]:
# Keeping the simulation results under models/YYYYMMDD/ as well makes them easy to find.
# Write next to the model loaded from models/20250920/ rather than a stale 20220626
# directory; to_pickle does not create missing directories.
returns_df.to_pickle('models/20250920/fukusho.pickle')

In [None]:
# Plot the return rate of the fukusho sweep.
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='fukusho')

### 5.1.3 馬連BOX

In [None]:
# Sweep settings: [lower, upper] bounds of the threshold and the number of steps.
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Evaluate the umaren BOX betting results while sweeping the threshold
# applied to the "win-likelihood score".
for step in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps
    # (step stops at N_SAMPLES - 1, so the upper bound itself is not evaluated).
    threshold = T_RANGE[1] * step / N_SAMPLES + T_RANGE[0] * (1 - (step / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold, then simulate them.
        actions = keiba_ai.decide_action(score_table, policies.BetPolicyUmarenBox, threshold=threshold)
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        # Report the error and stop the sweep; results gathered so far are kept.
        print(e)
        break
# One row per threshold, with the threshold as the (named) index.
returns_df = pd.DataFrame.from_dict(returns, orient='index').rename_axis('threshold')

In [None]:
# Keeping the simulation results under models/YYYYMMDD/ as well makes them easy to find.
# Write next to the model loaded from models/20250920/ rather than a stale 20220626
# directory; to_pickle does not create missing directories.
returns_df.to_pickle('models/20250920/umarenbox.pickle')

In [None]:
# Plot the return rate of the umaren BOX sweep.
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='umarenbox')

### 5.1.4 馬単BOX

In [None]:
# Sweep settings: [lower, upper] bounds of the threshold and the number of steps.
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Evaluate the umatan BOX betting results while sweeping the threshold
# applied to the "win-likelihood score".
for step in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps
    # (step stops at N_SAMPLES - 1, so the upper bound itself is not evaluated).
    threshold = T_RANGE[1] * step / N_SAMPLES + T_RANGE[0] * (1 - (step / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold, then simulate them.
        actions = keiba_ai.decide_action(score_table, policies.BetPolicyUmatanBox, threshold=threshold)
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        # Report the error and stop the sweep; results gathered so far are kept.
        print(e)
        break
# One row per threshold, with the threshold as the (named) index.
returns_df = pd.DataFrame.from_dict(returns, orient='index').rename_axis('threshold')

In [None]:
# Keeping the simulation results under models/YYYYMMDD/ as well makes them easy to find.
# Write next to the model loaded from models/20250920/ rather than a stale 20220626
# directory; to_pickle does not create missing directories.
returns_df.to_pickle('models/20250920/umatanbox.pickle')

In [None]:
# Plot the return rate of the umatan BOX sweep.
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='umatanbox')

### 5.1.5 ワイドBOX

In [None]:
# Sweep settings: [lower, upper] bounds of the threshold and the number of steps.
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Evaluate the wide BOX betting results while sweeping the threshold
# applied to the "win-likelihood score".
for step in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps
    # (step stops at N_SAMPLES - 1, so the upper bound itself is not evaluated).
    threshold = T_RANGE[1] * step / N_SAMPLES + T_RANGE[0] * (1 - (step / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold, then simulate them.
        actions = keiba_ai.decide_action(score_table, policies.BetPolicyWideBox, threshold=threshold)
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        # Report the error and stop the sweep; results gathered so far are kept.
        print(e)
        break
# One row per threshold, with the threshold as the (named) index.
returns_df = pd.DataFrame.from_dict(returns, orient='index').rename_axis('threshold')

In [None]:
# Keeping the simulation results under models/YYYYMMDD/ as well makes them easy to find.
# Write next to the model loaded from models/20250920/ rather than a stale 20220626
# directory; to_pickle does not create missing directories.
returns_df.to_pickle('models/20250920/widebox.pickle')

In [None]:
# Plot the return rate of the wide BOX sweep.
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='widebox')

### 5.1.6 三連複BOX

In [None]:
# Sweep settings: [lower, upper] bounds of the threshold and the number of steps.
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Evaluate the sanrenpuku BOX betting results while sweeping the threshold
# applied to the "win-likelihood score".
for step in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps
    # (step stops at N_SAMPLES - 1, so the upper bound itself is not evaluated).
    threshold = T_RANGE[1] * step / N_SAMPLES + T_RANGE[0] * (1 - (step / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold, then simulate them.
        actions = keiba_ai.decide_action(score_table, policies.BetPolicySanrenpukuBox, threshold=threshold)
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        # Report the error and stop the sweep; results gathered so far are kept.
        print(e)
        break
# One row per threshold, with the threshold as the (named) index.
returns_df = pd.DataFrame.from_dict(returns, orient='index').rename_axis('threshold')

In [None]:
# Keeping the simulation results under models/YYYYMMDD/ as well makes them easy to find.
# Write next to the model loaded from models/20250920/ rather than a stale 20220626
# directory; to_pickle does not create missing directories.
returns_df.to_pickle('models/20250920/sanrenpukubox.pickle')

In [None]:
# Plot the return rate of the sanrenpuku BOX sweep.
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='sanrenpukubox')

### 5.1.7 三連単BOX

In [None]:
# Sweep settings: [lower, upper] bounds of the threshold and the number of steps.
T_RANGE = [0.5, 3.5]
N_SAMPLES = 100
returns = {}

# Evaluate the sanrentan BOX betting results while sweeping the threshold
# applied to the "win-likelihood score".
for step in tqdm(range(N_SAMPLES)):
    # Linear interpolation from T_RANGE[0] towards T_RANGE[1] in N_SAMPLES steps
    # (step stops at N_SAMPLES - 1, so the upper bound itself is not evaluated).
    threshold = T_RANGE[1] * step / N_SAMPLES + T_RANGE[0] * (1 - (step / N_SAMPLES))
    try:
        # Decide which tickets to buy for this threshold, then simulate them.
        actions = keiba_ai.decide_action(score_table, policies.BetPolicySanrentanBox, threshold=threshold)
        returns[threshold] = simulator.calc_returns(actions)
    except Exception as e:
        # Report the error and stop the sweep; results gathered so far are kept.
        print(e)
        break
# One row per threshold, with the threshold as the (named) index.
returns_df = pd.DataFrame.from_dict(returns, orient='index').rename_axis('threshold')

In [None]:
# Keeping the simulation results under models/YYYYMMDD/ as well makes them easy to find.
returns_df.to_pickle('models/20250920/sanrentanbox.pickle')

In [None]:
# Plot the return rate of the sanrentan BOX sweep.
simulation.plot_single_threshold(returns_df, N_SAMPLES, label='sanrentanbox')

## 5.2. 複数馬券
未実装

## 5.3. 複数threshold
未実装だが、以下のようなコードになる予定。

In [None]:
# Sweep settings: bounds of the first threshold, floor of the second one,
# and the number of steps taken along each axis.
T1_RANGE = [2.5, 3.5]
MIN_T2 = 1
N_SAMPLES = 10

returns = {}
# Evaluate the betting results while sweeping two thresholds on the
# "win-likelihood score".
idx = 0
for outer_step in tqdm(range(N_SAMPLES)):
    # First threshold: T1_RANGE split into N_SAMPLES equal steps.
    threshold1 = T1_RANGE[1] * outer_step / N_SAMPLES + T1_RANGE[0] * (1-(outer_step/N_SAMPLES))
    for inner_step in range(N_SAMPLES):
        # Second threshold: from MIN_T2 towards threshold1 in N_SAMPLES equal steps.
        threshold2 = threshold1 * inner_step / N_SAMPLES + MIN_T2 * (1-(inner_step/N_SAMPLES))
        try:
            # Decide which tickets to buy (the policy used here is not implemented yet).
            actions = keiba_ai.decide_action(
                score_table,
                policies.BetPolicyTanshoFukusho,
                threshold1=threshold1,
                threshold2=threshold2,
            )
            returns[idx] = simulator.calc_returns(actions)
            idx += 1
        except Exception as e:
            # Report the error; this only leaves the inner loop, the outer sweep continues.
            print(e)
            break
returns_df = pd.DataFrame.from_dict(returns, orient='index')

In [None]:
# Plot the two-threshold sweep with the single-threshold plotter.
# NOTE(review): 100 presumably corresponds to N_SAMPLES * N_SAMPLES (10 x 10); confirm.
simulation.plot_single_threshold(returns_df.reset_index(), 100, label='tansho_fukusho')

# 6. 当日の予想
例として2022年1月8日のレースを実際に予想する場合を考える。  
https://race.netkeiba.com/top/race_list.html?kaisai_date=20220108

## 6.1. 前日準備

In [31]:
%autoreload

In [32]:
race_id_list = preparing.scrape_race_id_list(['20250921']) #Fetch the race ids for the given date
len(race_id_list)

getting race_id_list


  0%|          | 0/1 [00:00<?, ?it/s]

scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20250921


24

In [33]:
# Collect the horse_ids of the horses entered in those races.
horse_id_list = preparing.scrape_horse_id_list(race_id_list)
len(horse_id_list)

sraping horse_id_list


  0%|          | 0/24 [00:00<?, ?it/s]

302

In [34]:
# Scrape the HTML of each horse page.
# The most recent races may have been updated, so pass skip=False to overwrite.
html_files_horse = preparing.scrape_html_horse_with_master(horse_id_list, skip=False)

scraping


  0%|          | 0/302 [00:00<?, ?it/s]

updating master


In [35]:
# Update the raw horse_info table with the freshly scraped pages.
horse_info_20250920 = preparing.get_rawdata_horse_info(html_files_horse)
preparing.update_rawdata(LocalPaths.RAW_HORSE_INFO_PATH, horse_info_20250920)

preparing raw horse_info table


  0%|          | 0/302 [00:00<?, ?it/s]

In [36]:
# Update the raw horse_results table with the freshly scraped pages.
horse_results_20250920 = preparing.get_rawdata_horse_results(html_files_horse)
preparing.update_rawdata(LocalPaths.RAW_HORSE_RESULTS_PATH, horse_results_20250920)

preparing raw horse_results table


  0%|          | 0/302 [00:00<?, ?it/s]

horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023100091.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023100941.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023107162.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023107237.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023102621.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023105452.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023105506.bin
horse_results insufficient tables: 1 tables in c:\Users\koxyg\Documents\GitHub\MyKeiba-AI_v2\data\html\horse\2023105963.bin
horse_re

In [37]:
# Update the raw peds table.
# skip=True is passed here (unlike the horse pages above, which use skip=False).
html_files_peds = preparing.scrape_html_ped(horse_id_list, skip=True)
peds_20250920 = preparing.get_rawdata_peds(html_files_peds)
preparing.update_rawdata(LocalPaths.RAW_PEDS_PATH, peds_20250920)

  0%|          | 0/302 [00:00<?, ?it/s]

horse_id 2023106829 skipped
horse_id 2023100895 skipped
horse_id 2023104052 skipped
horse_id 2023103073 skipped
horse_id 2023105820 skipped
horse_id 2023106493 skipped
horse_id 2023100044 skipped
horse_id 2023104232 skipped
horse_id 2023102401 skipped
horse_id 2023100374 skipped
horse_id 2023107353 skipped
horse_id 2023102103 skipped
horse_id 2023104971 skipped
horse_id 2023102122 skipped
horse_id 2023103694 skipped
horse_id 2023102155 skipped
horse_id 2023106601 skipped
horse_id 2023101471 skipped
horse_id 2023102819 skipped
horse_id 2023100792 skipped
horse_id 2023105542 skipped
horse_id 2023104074 skipped
horse_id 2023102342 skipped
horse_id 2023101676 skipped
horse_id 2023105713 skipped
horse_id 2023100888 skipped
horse_id 2023106258 skipped
horse_id 2023102803 skipped
horse_id 2023105932 skipped
horse_id 2023100723 skipped
horse_id 2023105751 skipped
horse_id 2023109039 skipped
horse_id 2023102426 skipped
horse_id 2023104303 skipped
horse_id 2023107377 skipped
horse_id 2023100940 

  0%|          | 0/44 [00:00<?, ?it/s]

In [38]:
# Refresh the processors by rebuilding them from the raw table files updated above.
horse_info_processor = preprocessing.HorseInfoProcessor(
    filepath=LocalPaths.RAW_HORSE_INFO_PATH)
horse_results_processor = preprocessing.HorseResultsProcessor(
    filepath=LocalPaths.RAW_HORSE_RESULTS_PATH)
peds_processor = preprocessing.PedsProcessor(filepath=LocalPaths.RAW_PEDS_PATH)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[Cols.PRIZE].fillna(0, inplace=True)


In [39]:
# Prepare the model: load the saved one used for prediction.
keiba_ai = training.KeibaAIFactory.load('models/20250920/basemodel_2024_2025.pickle')

## 6.2. 前日全レース予想

In [41]:
%autoreload

In [42]:
# Fetch the race ids and post times used for the day-before prediction of all races.
target_race_id_list, target_race_time_list = preparing.scrape_race_id_race_time_list('20250921')
print(len(target_race_id_list))
print(len(target_race_time_list))
# Flag read by the prediction loop in the next cell: True selects the day-before adjustments.
yesterday = True

getting race_id_list
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20250921
24
24
24
24


In [55]:
# Path where the race card (shutuba table) is stored temporarily.
filepath = 'data/tmp/shutuba.pickle'
today = '2025/09/20'

# Requires TARGET_COLS / GROUP_COLS, the three processors, keiba_ai and `yesterday`
# to be defined in the kernel already.
for race_id, race_time in zip(target_race_id_list, target_race_time_list):
    # Fetch the race card; it is read back from filepath below.
    preparing.scrape_shutuba_table(race_id, today, filepath)

    # Adjustments applied only for the day-before prediction.
    if yesterday:
        # Day-before prediction: overwrite the body-weight column with '0(0)'.
        shutuba_raw = pd.read_pickle(filepath)
        shutuba_raw[ResultsCols.WEIGHT_AND_DIFF] = '0(0)'
        # Enable these when weather and ground state are not published yet.
        #shutuba_raw['weather'] = '晴'
        #shutuba_raw['ground_state'] = '良'
        shutuba_raw.to_pickle(filepath)

    # Preprocess the race card.
    shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath)

    # Merge the race card with the horse results / horse info / peds processors.
    shutuba_data_merger = preprocessing.ShutubaDataMerger(
        shutuba_table_processor,
        horse_results_processor,
        horse_info_processor,
        peds_processor,
        target_cols=TARGET_COLS,
        group_cols=GROUP_COLS
    )
    shutuba_data_merger.merge()

    # Feature engineering.
    feature_enginnering_shutuba = (
        preprocessing.FeatureEngineering(shutuba_data_merger)
        .add_interval()
        .add_agedays()
        .dumminize_ground_state()
        .dumminize_race_type()
        .dumminize_sex()
        .dumminize_weather()
        .encode_horse_id()
        .encode_jockey_id()
        .encode_trainer_id()
        .encode_owner_id()
        .encode_breeder_id()
        .dumminize_kaisai()
        .dumminize_around()
        .dumminize_race_class()
    )

    # Features used for prediction (everything except the date column).
    X = feature_enginnering_shutuba.featured_data.drop(columns=['date'])

    # First row of the race card before preprocessing; used for the header line only.
    df_tmp = shutuba_table_processor.raw_data[:1]

    # Print a header for the race: find the place whose code equals race_id[4:6].
    # Per notes elsewhere in this notebook, raw_data columns 12 / 13 / 15 are
    # race_type / around / ground_state; column 10 is presumably the course length.
    for place_name, place_code in Master.PLACE_DICT.items():
        if place_code == race_id[4:6]:
            print(''.join([
                place_name, race_id[10:12], 'R ', race_time, '発走 ',
                str(df_tmp.iat[0, 12]), str(df_tmp.iat[0, 10]), 'm ',
                str(df_tmp.iat[0, 13]), ' ', str(df_tmp.iat[0, 15]),
            ]))
            break

    # Scores for this race, highest first.
    print(keiba_ai.calc_score(X, policies.StdScorePolicy).sort_values('score', ascending=False))

separating horse results by date


  0%|          | 0/1 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/1 [00:00<?, ?it/s]

KeyError: "Columns not found: 'n_horses', 'ground_state', 'owner_id', '単勝', '年齢', '性', '体重変化', 'trainer_id', 'race_class', 'jockey_id', '体重', 'breeder_id', 'around', 'horse_id', 'weather'"

## 6.3. レース直前データ処理（当日レース予想）

In [4]:
%autoreload

In [5]:
# Fetch the ids and post times of races whose body weights have been announced (race-day use).
target_race_id_list, target_race_time_list = preparing.create_active_race_id_list()

# Order the races by post time (second element of each pair).
race_data = list(zip(target_race_id_list, target_race_time_list))
race_data_sorted = sorted(race_data, key=lambda pair: pair[1])
target_race_id_list = [pair[0] for pair in race_data_sorted]
target_race_time_list = [pair[1] for pair in race_data_sorted]

print("ソート後のレースID:", target_race_id_list)
print("ソート後のレース時刻:", target_race_time_list)

20250920 16:20
getting race_id_list
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20250920
ソート後のレースID: ['202506040612']
ソート後のレース時刻: ['16:30']


In [6]:
# Path where the race card (shutuba table) is stored temporarily.
filepath = 'data/tmp/shutuba.pickle'
# For a manual run on a fixed date, use e.g.: today = '2022/10/01'
today = datetime.datetime.now().date().strftime('%Y/%m/%d')

# Requires TARGET_COLS / GROUP_COLS, the three processors and keiba_ai
# to be defined in the kernel already.
for race_id, race_time in zip(target_race_id_list, target_race_time_list):
    # Fetch the race card for this race.
    preparing.scrape_shutuba_table(race_id, today, filepath)

    # Preprocess the race card.
    shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath)

    # Merge the race card with the horse results / horse info / peds processors.
    shutuba_data_merger = preprocessing.ShutubaDataMerger(
        shutuba_table_processor,
        horse_results_processor,
        horse_info_processor,
        peds_processor,
        target_cols=TARGET_COLS,
        group_cols=GROUP_COLS
    )
    shutuba_data_merger.merge()

    # Feature engineering.
    feature_enginnering_shutuba = (
        preprocessing.FeatureEngineering(shutuba_data_merger)
        .add_interval()
        .add_agedays()
        .dumminize_ground_state()
        .dumminize_race_type()
        .dumminize_sex()
        .dumminize_weather()
        .encode_horse_id()
        .encode_jockey_id()
        .encode_trainer_id()
        .encode_owner_id()
        .encode_breeder_id()
        .dumminize_kaisai()
        .dumminize_around()
        .dumminize_race_class()
    )

    # Features used for prediction (everything except the date column).
    X = feature_enginnering_shutuba.featured_data.drop(columns=['date'])

    # First row of the race card before preprocessing; used for the header line only.
    # Positional columns of raw_data, per the original notes:
    #   12 = race_type, 13 = around, 14 = weather, 15 = ground_state, 16 = race_class
    df_tmp = shutuba_table_processor.raw_data[:1]

    # Print a header for the race: find the place whose code equals race_id[4:6].
    # Column 10 is presumably the course length (it is printed with an 'm' suffix).
    for place_name, place_code in Master.PLACE_DICT.items():
        if place_code == race_id[4:6]:
            print(''.join([
                place_name, race_id[10:12], 'R ', race_time, '発走 ',
                str(df_tmp.iat[0, 12]), str(df_tmp.iat[0, 10]), 'm ',
                str(df_tmp.iat[0, 13]), ' ', str(df_tmp.iat[0, 15]),
            ]))
            break

    # Scores for this race, highest first.
    print(keiba_ai.calc_score(X, policies.StdScorePolicy).sort_values('score', ascending=False))

ValueError: invalid literal for int() with base 10: 'an'

## 6.4. レース直前データ処理（旧方式）

In [9]:
filepath = 'data/tmp/shutuba.pickle' #Path where the race card is stored temporarily
preparing.scrape_shutuba_table(race_id_list[0], '2025/9/20', filepath) #Once body weights are announced, fetch the race card
shutuba_table_processor = preprocessing.ShutubaTableProcessor(filepath) #Preprocess the race card

ValueError: invalid literal for int() with base 10: 'an'

In [None]:
# Merge tables: hand the entry-table processor to ShutubaDataMerger together with
# horse_results_processor, horse_info_processor and peds_processor.
# NOTE(review): those three processors, shutuba_table_processor, TARGET_COLS and
# GROUP_COLS are not created in this cell; they must already exist in the kernel —
# confirm which cells define them before running this on a fresh kernel.
shutuba_data_merger = preprocessing.ShutubaDataMerger(
    shutuba_table_processor,
    horse_results_processor,
    horse_info_processor,
    peds_processor,
    target_cols=TARGET_COLS,
    group_cols=GROUP_COLS
)

# Run the merge; as the last expression of the cell, any non-None return value is displayed.
shutuba_data_merger.merge()

In [None]:
# Feature engineering: wrap the merged entry-table data and apply every
# transformation step as one fluent chain. The steps run top to bottom.
feature_enginnering_shutuba = (
    preprocessing.FeatureEngineering(shutuba_data_merger)
    .add_interval()
    .add_agedays()
    .dumminize_ground_state()
    .dumminize_race_type()
    .dumminize_sex()
    .dumminize_weather()
    .encode_horse_id()
    .encode_jockey_id()
    .encode_trainer_id()
    .encode_owner_id()
    .encode_breeder_id()
    .dumminize_kaisai()
    .dumminize_around()
    .dumminize_race_class()
)

In [None]:
# Prediction: remove the 'date' column from the engineered features, then score
# the remaining columns and list the result from highest to lowest score.
X = feature_enginnering_shutuba.featured_data.drop(columns=['date'])
(
    keiba_ai
    .calc_score(X, policies.StdScorePolicy)
    .sort_values('score', ascending=False)
)

In [21]:
# Force-reload modules.
# Re-activates the autoreload extension, then explicitly reloads two preprocessing
# submodules so that edits made to their source files are picked up.
# NOTE(review): importlib.reload() re-executes a module in place; objects created
# before the reload keep referencing the old class definitions, so processors built
# earlier presumably need to be re-created afterwards — confirm.
%reload_ext autoreload
import importlib
from modules.preprocessing import _results_processor, _shutuba_table_processor
importlib.reload(_results_processor)
# Last expression of the cell, so the reloaded module object is shown as the cell output.
importlib.reload(_shutuba_table_processor)

<module 'modules.preprocessing._shutuba_table_processor' from 'c:\\Users\\koxyg\\Documents\\GitHub\\MyKeiba-AI_v2\\modules\\preprocessing\\_shutuba_table_processor.py'>

In [23]:
# カーネルリスタート後の初期化
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

import modules.constants as constants
from modules.constants import LocalPaths
from modules.constants import ResultsCols
from modules.constants import Master

import modules.preprocessing as preprocessing
import modules.policies as policies
import modules.training as training
import modules.preparing as preparing

TARGET_COLS = ['開催', 'horse_id', 'jockey_id', 'trainer_id', 'owner_id', 'breeder_id', '性', '年齢', '体重', '体重変化', 'course_len', 'weather', 'race_type', 'ground_state', 'around', 'race_class', 'n_horses', '単勝', '斤量', '枠番', '馬番']
GROUP_COLS = ['race_id', 'date']

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
# Remove the cached preprocessing package from sys.modules so that the import at
# the end of this cell re-executes the package and all of its submodules from disk.
#
# NOTE: objects created before this cell ran (for example processor instances)
# still reference the old class definitions; re-create them after the re-import
# if they must use the freshly loaded code.
import sys

# Take a snapshot of the keys before filtering: sys.modules may change size while
# it is being iterated (imports triggered by other code or threads), which raises
# RuntimeError when the dict is iterated directly.
# Match the package itself or its dotted children only, so that an unrelated
# sibling such as 'modules.preprocessing_extra' is never removed by accident.
modules_to_remove = [
    module_name
    for module_name in tuple(sys.modules)
    if module_name == 'modules.preprocessing'
    or module_name.startswith('modules.preprocessing.')
]

for module_name in modules_to_remove:
    # pop() with a default tolerates an entry that disappeared after the snapshot.
    sys.modules.pop(module_name, None)

print(f"削除したモジュール: {modules_to_remove}")

# Re-import: with the cache entries gone this loads the package again from source.
import modules.preprocessing as preprocessing
print("モジュールが再インポートされました")

削除したモジュール: ['modules.preprocessing._abstract_data_processor', 'modules.preprocessing._horse_results_processor', 'modules.preprocessing._horse_info_processor', 'modules.preprocessing._peds_processor', 'modules.preprocessing._race_info_processor', 'modules.preprocessing._results_processor', 'modules.preprocessing._data_merger', 'modules.preprocessing._feature_engineering', 'modules.preprocessing._return_processor', 'modules.preprocessing._shutuba_table_processor', 'modules.preprocessing._shutuba_data_merger', 'modules.preprocessing']
モジュールが再インポートされました


[autoreload of modules.preprocessing._shutuba_table_processor failed: Traceback (most recent call last):
  File "c:\Users\koxyg\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\extensions\autoreload.py", line 280, in check
    elif self.deduper_reloader.maybe_reload_module(m):
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\koxyg\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\extensions\deduperreload\deduperreload.py", line 533, in maybe_reload_module
    new_source_code = f.read()
                      ^^^^^^^^
UnicodeDecodeError: 'cp932' codec can't decode byte 0x87 in position 356: illegal multibyte sequence
]
