# Programs V2: 改善版着順予想モデル

全改善案を盛り込んだ着順予想モデル。

**改善点:**
- LightGBM LambdaRank (ランキング学習)
- LightGBM Classifier
- 今節成績の特徴量化
- レース内相対特徴量
- コース特徴量
- 選手履歴統計 (リーク防止)
- Optuna ハイパーパラメータ最適化
- アンサンブル (3モデル)

In [1]:
import subprocess
import sys
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'lightgbm', 'optuna', 'ipywidgets'])

from pathlib import Path
import calendar
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import optuna
import pickle
import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

print('Setup complete')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Setup complete


## 定数・ユーティリティ

In [2]:
# Lookup from a venue display name to its venue number (1-24).
# Two spellings of venue 11 ('ボートレースびわこ' and 'ボートレース琵琶湖')
# are both accepted and map to the same number.
STADIUM_NAME_TO_NUMBER = {
    'ボートレース桐生': 1, 'ボートレース戸田': 2, 'ボートレース江戸川': 3,
    'ボートレース平和島': 4, 'ボートレース多摩川': 5, 'ボートレース浜名湖': 6,
    'ボートレース蒲郡': 7, 'ボートレース常滑': 8, 'ボートレース津': 9,
    'ボートレース三国': 10, 'ボートレースびわこ': 11, 'ボートレース琵琶湖': 11,
    'ボートレース住之江': 12, 'ボートレース尼崎': 13, 'ボートレース鳴門': 14,
    'ボートレース丸亀': 15, 'ボートレース児島': 16, 'ボートレース宮島': 17,
    'ボートレース徳山': 18, 'ボートレース下関': 19, 'ボートレース若松': 20,
    'ボートレース芦屋': 21, 'ボートレース福岡': 22, 'ボートレース唐津': 23,
    'ボートレース大村': 24,
}

def extract_day_number(day_str):
    """Return the integer N from a label of the form '第N日'.

    Missing values, labels that lack either '第' or '日', and labels whose
    remainder is not an integer all yield ``np.nan``.
    """
    if pd.isna(day_str):
        return np.nan

    label = str(day_str)
    if '第' not in label or '日' not in label:
        return np.nan

    remainder = label.replace('第', '').replace('日', '')
    try:
        return int(remainder)
    except ValueError:
        # int() on a str only fails with ValueError.
        return np.nan

def map_stadium_name_to_number(stadium_name):
    """Translate a venue display name into its venue number.

    The name is converted to ``str`` and stripped of surrounding whitespace
    before the lookup in ``STADIUM_NAME_TO_NUMBER``. Missing input and
    unknown names both return ``np.nan``.
    """
    if not pd.isna(stadium_name):
        key = str(stadium_name).strip()
        return STADIUM_NAME_TO_NUMBER.get(key, np.nan)
    return np.nan

print('Constants & utilities ready')

Constants & utilities ready


## データ変形関数

In [3]:
def reshape_programs(df):
    """Convert a wide programs table (one row per race) to one row per boat.

    For each lane 1-6, the columns whose names start with '<lane>枠_' are
    selected together with the race-level columns, the prefix is removed
    from their names, and a '枠' column holding the lane number is added.
    The per-lane tables are stacked in lane order. Returns an empty
    DataFrame when no lane-prefixed column exists.
    """
    race_cols = ['レースコード', '日次', 'レース日', 'レース場', 'レース回']
    per_lane = []

    for lane in range(1, 7):
        prefix = f'{lane}枠_'
        lane_cols = [name for name in df.columns if name.startswith(prefix)]
        if not lane_cols:
            continue
        part = df[race_cols + lane_cols].copy()
        part.columns = race_cols + [name[len(prefix):] for name in lane_cols]
        part['枠'] = lane
        per_lane.append(part)

    if not per_lane:
        return pd.DataFrame()
    return pd.concat(per_lane, ignore_index=True)

def reshape_results(df):
    """Convert a wide results table (one row per race) to one row per boat.

    For every race and every finishing position 1-6, the boat number stored
    in '<place>着_艇番' is read. Positions whose column is absent, whose
    value is missing, not convertible to ``int``, or outside 1-6 are
    skipped. Returns a DataFrame with columns 'レースコード', '艇番', '着順',
    or an empty DataFrame when nothing was collected.
    """
    # Column presence does not change between rows, so resolve it once.
    place_columns = [
        (place, f'{place}着_艇番')
        for place in range(1, 7)
        if f'{place}着_艇番' in df.columns
    ]

    records = []
    for _, race in df.iterrows():
        code = race['レースコード']
        for place, column in place_columns:
            raw = race[column]
            if pd.isna(raw):
                continue
            try:
                boat = int(raw)
            except (ValueError, TypeError):
                continue
            if 1 <= boat <= 6:
                records.append({'レースコード': code, '艇番': boat, '着順': place})

    if not records:
        return pd.DataFrame()
    return pd.DataFrame(records)

def reshape_previews(df):
    """Convert a wide previews table to one row per boat, keeping weather.

    For each boat 1-6, the columns prefixed with '艇<n>_' are selected along
    with the race identifier columns and whichever weather columns are
    present; the prefix is removed and a '艇番' column is added. Returns an
    empty DataFrame for ``None``/empty input or when no boat column exists.
    """
    if df is None or df.empty:
        return pd.DataFrame()

    shared_cols = ['レースコード', 'レース日', 'レース場', 'レース回']
    shared_cols += [
        name
        for name in ['風速(m)', '風向', '波の高さ(cm)', '天候', '気温(℃)', '水温(℃)']
        if name in df.columns
    ]

    per_boat = []
    for boat_num in range(1, 7):
        prefix = f'艇{boat_num}_'
        own_cols = [name for name in df.columns if name.startswith(prefix)]
        if not own_cols:
            continue
        stripped = {name: name[len(prefix):] for name in own_cols}
        part = df[shared_cols + own_cols].copy().rename(columns=stripped)
        part['艇番'] = boat_num
        per_boat.append(part)

    if not per_boat:
        return pd.DataFrame()
    return pd.concat(per_boat, ignore_index=True)

print('Data reshape functions ready')


Data reshape functions ready


## 今節成績パース (新規)

In [4]:
def compute_konseki_features(df):
    """Add current-series result features derived from the '今節成績_<i>-2' columns.

    Only the '今節成績_1-2' ... '今節成績_6-2' columns are read; the original
    notebook describes each ('-1', '-2') pair as (course, finishing place)
    -- NOTE(review): confirm against the data source. Values are parsed as
    numbers (unparseable -> NaN) and a place of 0 is treated as missing.

    Columns written to ``df`` (which is modified in place and returned):
      - '今節_平均着順': mean of the available places
      - '今節_1着回数': number of places equal to 1
      - '今節_3連対率': share of available places that are <= 3
        (NaN when no place is available, because it is 0 / 0)
      - '今節_出走回数': number of available places
      - '今節_最新着順': right-most available place

    When none of the place columns exist the features are filled with
    constants (NaN / 0 / 0.0 / 0 / NaN).
    """
    place_cols = [
        name
        for name in (f'今節成績_{i}-2' for i in range(1, 7))
        if name in df.columns
    ]

    if not place_cols:
        df['今節_平均着順'] = np.nan
        df['今節_1着回数'] = 0
        df['今節_3連対率'] = 0.0
        df['今節_出走回数'] = 0
        df['今節_最新着順'] = np.nan
        return df

    # Parse once; every feature below is derived from this single frame.
    places = df[place_cols].apply(pd.to_numeric, errors='coerce').replace(0, np.nan)
    run_count = places.notna().sum(axis=1)

    df['今節_平均着順'] = places.mean(axis=1)
    df['今節_1着回数'] = (places == 1).sum(axis=1)
    df['今節_3連対率'] = (places <= 3).sum(axis=1) / run_count
    df['今節_出走回数'] = run_count

    # Forward-filling along the row carries the last non-NaN value into the
    # final column, i.e. the right-most available place (NaN if none).
    df['今節_最新着順'] = places.ffill(axis=1).iloc[:, -1].to_numpy(dtype=float)

    return df

print('Konseki features function ready')

Konseki features function ready


### 今節成績パースのテスト

In [5]:
# Smoke test on a tiny hand-built frame: row 0 has two results (places 1
# and 3), row 1 has a single result (place 3); the remaining slots are empty.
test_df = pd.DataFrame({
    '今節成績_1-1': [1, 2],
    '今節成績_1-2': [1, 3],
    '今節成績_2-1': [2, np.nan],
    '今節成績_2-2': [3, np.nan],
    '今節成績_3-1': [np.nan, np.nan],
    '今節成績_3-2': [np.nan, np.nan],
    '今節成績_4-1': [np.nan, np.nan],
    '今節成績_4-2': [np.nan, np.nan],
    '今節成績_5-1': [np.nan, np.nan],
    '今節成績_5-2': [np.nan, np.nan],
    '今節成績_6-1': [np.nan, np.nan],
    '今節成績_6-2': [np.nan, np.nan],
})
test_df = compute_konseki_features(test_df)
print('今節_平均着順:', test_df['今節_平均着順'].tolist())
print('今節_1着回数:', test_df['今節_1着回数'].tolist())
print('今節_3連対率:', test_df['今節_3連対率'].tolist())
print('今節_出走回数:', test_df['今節_出走回数'].tolist())
print('今節_最新着順:', test_df['今節_最新着順'].tolist())

今節_平均着順: [2.0, 3.0]
今節_1着回数: [1, 0]
今節_3連対率: [1.0, 1.0]
今節_出走回数: [2, 1]
今節_最新着順: [3.0, 3.0]


## 特徴量エンジニアリング (新規)

In [6]:
# Data-driven advantage maps per stadium (derived from historical 1st-place rates)
# Outer key: venue number (1-24). Inner key: lane number (1-6).
# Inner value: advantage score for that lane at that venue; lane 1 is 5.0
# and lane 6 is 0.0 at every venue.
STADIUM_ADVANTAGE_MAP = {
    1:  {1: 5.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 0.4, 6: 0.0},  # Kiryu
    2:  {1: 5.0, 2: 1.7, 3: 1.5, 4: 1.2, 5: 0.3, 6: 0.0},  # Toda
    3:  {1: 5.0, 2: 1.7, 3: 1.2, 4: 1.0, 5: 0.4, 6: 0.0},  # Edogawa
    4:  {1: 5.0, 2: 1.5, 3: 1.2, 4: 0.9, 5: 0.3, 6: 0.0},  # Heiwajima
    5:  {1: 5.0, 2: 1.2, 3: 1.0, 4: 0.8, 5: 0.4, 6: 0.0},  # Tamagawa
    6:  {1: 5.0, 2: 1.1, 3: 1.1, 4: 0.6, 5: 0.3, 6: 0.0},  # Hamanako
    7:  {1: 5.0, 2: 0.9, 3: 0.8, 4: 0.7, 5: 0.2, 6: 0.0},  # Gamagori
    8:  {1: 5.0, 2: 0.8, 3: 0.6, 4: 0.6, 5: 0.2, 6: 0.0},  # Tokoname
    9:  {1: 5.0, 2: 1.1, 3: 0.9, 4: 0.7, 5: 0.3, 6: 0.0},  # Tsu
    10: {1: 5.0, 2: 1.3, 3: 1.1, 4: 0.6, 5: 0.2, 6: 0.0},  # Mikuni
    11: {1: 5.0, 2: 1.2, 3: 1.1, 4: 0.8, 5: 0.5, 6: 0.0},  # Biwako
    12: {1: 5.0, 2: 1.0, 3: 0.8, 4: 0.6, 5: 0.3, 6: 0.0},  # Suminoe
    13: {1: 5.0, 2: 0.8, 3: 0.9, 4: 0.6, 5: 0.2, 6: 0.0},  # Amagasaki
    14: {1: 5.0, 2: 1.3, 3: 1.3, 4: 0.9, 5: 0.4, 6: 0.0},  # Naruto
    15: {1: 5.0, 2: 1.0, 3: 0.9, 4: 0.6, 5: 0.3, 6: 0.0},  # Marugame
    16: {1: 5.0, 2: 1.0, 3: 0.8, 4: 0.5, 5: 0.2, 6: 0.0},  # Kojima
    17: {1: 5.0, 2: 1.0, 3: 0.9, 4: 0.7, 5: 0.3, 6: 0.0},  # Miyajima
    18: {1: 5.0, 2: 1.0, 3: 0.6, 4: 0.5, 5: 0.1, 6: 0.0},  # Tokuyama
    19: {1: 5.0, 2: 0.8, 3: 0.7, 4: 0.5, 5: 0.1, 6: 0.0},  # Shimonoseki
    20: {1: 5.0, 2: 0.7, 3: 0.8, 4: 0.6, 5: 0.2, 6: 0.0},  # Wakamatsu
    21: {1: 5.0, 2: 0.6, 3: 0.6, 4: 0.6, 5: 0.2, 6: 0.0},  # Ashiya
    22: {1: 5.0, 2: 1.1, 3: 1.2, 4: 0.6, 5: 0.2, 6: 0.0},  # Fukuoka
    23: {1: 5.0, 2: 1.3, 3: 0.9, 4: 0.7, 5: 0.3, 6: 0.0},  # Karatsu
    24: {1: 5.0, 2: 0.8, 3: 0.7, 4: 0.5, 5: 0.2, 6: 0.0},  # Omura
}

# Fallback lane -> score table used when no per-venue table applies.
# NOTE(review): the fallback scores for lanes 2-4 (3, 2, 1) are larger than
# every per-venue score for those lanes (max 1.7) -- confirm the scale is intended.
DEFAULT_ADVANTAGE_MAP = {1: 5, 2: 3, 3: 2, 4: 1, 5: 0, 6: 0}


def compute_relative_features(df):
    """Add features that compare each boat with the others in its race.

    Rows are grouped by 'レースコード'. Depending on which source columns
    exist, the following are written to ``df`` (modified in place):
      - '全国勝率_偏差': value minus the race mean
      - '全国勝率_最大差': value minus the race maximum (<= 0)
      - 'モーター2連対率_順位': descending rank within the race, ties get
        the smallest rank
      - '当地勝率_偏差': value minus the race mean
    """
    columns = df.columns

    if '全国勝率' in columns:
        national = df.groupby('レースコード')['全国勝率']
        race_mean = national.transform('mean')
        race_max = national.transform('max')
        df['全国勝率_偏差'] = df['全国勝率'] - race_mean
        df['全国勝率_最大差'] = df['全国勝率'] - race_max

    if 'モーター2連対率' in columns:
        motor = df.groupby('レースコード')['モーター2連対率']
        df['モーター2連対率_順位'] = motor.rank(ascending=False, method='min')

    if '当地勝率' in columns:
        local_mean = df.groupby('レースコード')['当地勝率'].transform('mean')
        df['当地勝率_偏差'] = df['当地勝率'] - local_mean

    return df

def _venue_lane_advantage(venues, lanes):
    """Return the inside-advantage score for each (venue, lane) row as a Series."""
    # Start from the fallback table, then overwrite the rows of every venue
    # that has its own table. Unknown or missing venues keep the fallback.
    scores = lanes.map(DEFAULT_ADVANTAGE_MAP).to_numpy(dtype=float, copy=True)
    for venue, lane_table in STADIUM_ADVANTAGE_MAP.items():
        at_venue = (venues == venue).to_numpy(dtype=bool)
        if at_venue.any():
            scores[at_venue] = lanes[at_venue].map(lane_table).to_numpy(dtype=float)
    # A lane that is absent from the applicable table scores 0.
    return pd.Series(scores, index=lanes.index).fillna(0)


def compute_course_features(df):
    """Add lane-based features, using the per-venue advantage tables.

    Columns written to ``df`` (modified in place and returned), each only
    when its source columns exist:
      - '枠×全国勝率': lane number times '全国勝率' (missing rate -> 0)
      - 'イン有利度': score looked up from ``STADIUM_ADVANTAGE_MAP`` by
        ('レース場', '枠'); venues that are missing or have no table fall
        back to ``DEFAULT_ADVANTAGE_MAP``; unknown lanes score 0
      - '風速×イン有利度': wind speed times 'イン有利度'
      - '強風フラグ': 1 when wind speed >= 5, else 0
      - '強風×枠': '強風フラグ' times lane number

    The venue lookup is done with one vectorized pass per venue instead of
    a Python call per row, and it also works on an empty frame.
    """
    # Lane x national win rate
    if '全国勝率' in df.columns and '枠' in df.columns:
        df['枠×全国勝率'] = df['枠'] * df['全国勝率'].fillna(0)

    # Inside advantage: per-venue table when a venue column is available
    if '枠' in df.columns:
        if 'レース場' in df.columns:
            df['イン有利度'] = _venue_lane_advantage(df['レース場'], df['枠'])
        else:
            df['イン有利度'] = df['枠'].map(DEFAULT_ADVANTAGE_MAP).fillna(0)

    # Wind speed interaction features
    if '風速(m)' in df.columns and 'イン有利度' in df.columns:
        # Unparseable or missing wind speed is replaced by 3.0.
        wind = pd.to_numeric(df['風速(m)'], errors='coerce').fillna(3.0)
        df['風速×イン有利度'] = wind * df['イン有利度']
        df['強風フラグ'] = (wind >= 5).astype(int)
        if '枠' in df.columns:
            df['強風×枠'] = df['強風フラグ'] * df['枠']

    return df

def compute_player_historical_stats(train_df):
    """Aggregate per-racer finishing statistics from the given rows.

    Only rows with a non-missing '着順' are used. Callers are expected to
    pass the training period only so that later periods do not leak in.

    Returns a DataFrame with one row per '登録番号' and the columns
    '履歴_平均着順' (mean place), '履歴_1着率' (share of 1st places),
    '履歴_出走回数' (row count) and 'イン1着率' (share of 1st places among
    rows with '枠' == 1; NaN for racers without such rows, or for everyone
    when the '枠' column is absent). Returns an empty DataFrame when
    '登録番号' or '着順' is missing.
    """
    if '登録番号' not in train_df.columns or '着順' not in train_df.columns:
        return pd.DataFrame()

    valid = train_df[train_df['着順'].notna()].copy()
    valid['着順'] = valid['着順'].astype(float)
    # Boolean helper column: its group mean is the win rate, which avoids a
    # Python-level lambda call per group.
    valid['_is_win'] = valid['着順'] == 1

    stats = valid.groupby('登録番号').agg(
        履歴_平均着順=('着順', 'mean'),
        履歴_1着率=('_is_win', 'mean'),
        履歴_出走回数=('着順', 'count'),
    ).reset_index()

    # Win rate from lane 1 only. A frame without '枠' previously raised
    # KeyError here; it now yields NaN for every racer.
    if '枠' in valid.columns:
        in_data = valid[valid['枠'] == 1]
    else:
        in_data = valid.iloc[0:0]

    if not in_data.empty:
        in_stats = in_data.groupby('登録番号').agg(
            イン1着率=('_is_win', 'mean'),
        ).reset_index()
        stats = stats.merge(in_stats, on='登録番号', how='left')
    else:
        stats['イン1着率'] = np.nan

    return stats

def compute_stadium_player_stats(train_df):
    """Aggregate finishing statistics per (racer, venue) pair.

    Rows with a missing '着順' are ignored. The result has one row per
    ('登録番号', 'レース場') with '当場_平均着順' (mean place), '当場_1着率'
    (share of 1st places) and '当場_出走回数' (row count). An empty
    DataFrame is returned when any of the three source columns is absent.
    """
    required = ('登録番号', 'レース場', '着順')
    if any(name not in train_df.columns for name in required):
        return pd.DataFrame()

    finished = train_df.loc[train_df['着順'].notna()].copy()
    finished['着順'] = finished['着順'].astype(float)

    def win_share(places):
        return (places == 1).mean()

    per_pair = finished.groupby(['登録番号', 'レース場']).agg(
        当場_平均着順=('着順', 'mean'),
        当場_1着率=('着順', win_share),
        当場_出走回数=('着順', 'count'),
    )
    return per_pair.reset_index()

print('Feature engineering functions ready')

Feature engineering functions ready


## データ読み込み (2016-2026)

In [7]:
# Locate the repository root: the current directory if it contains 'data',
# otherwise two levels up.
cwd = Path.cwd()
repo_root = cwd if (cwd / 'data').exists() else cwd.parent.parent

print(f'Repository root: {repo_root}')

# Load data for 2016-2026
# all_data maps 'YYYY-MM-DD' -> {'programs': df, 'results': df[, 'previews': df]}.
# A day is loaded only when both its programs and results files exist.
all_data = {}
# (date, error) pairs for days whose CSV files exist but could not be read.
# Loading stays best-effort, but failures are reported instead of vanishing.
load_failures = []
years = [str(y) for y in range(2016, 2027)]

for year in years:
    for month in range(1, 13):
        _, max_day = calendar.monthrange(int(year), month)
        for day in range(1, max_day + 1):
            month_str = f'{month:02d}'
            day_str = f'{day:02d}'
            prog_path = repo_root / 'data' / 'programs' / year / month_str / f'{day_str}.csv'
            res_path = repo_root / 'data' / 'results' / year / month_str / f'{day_str}.csv'
            prev_path = repo_root / 'data' / 'previews' / year / month_str / f'{day_str}.csv'

            if not (prog_path.exists() and res_path.exists()):
                continue

            date_key = f'{year}-{month_str}-{day_str}'
            try:
                entry = {
                    'programs': pd.read_csv(prog_path),
                    'results': pd.read_csv(res_path),
                }
                # Previews are optional.
                if prev_path.exists():
                    entry['previews'] = pd.read_csv(prev_path)
            except Exception as e:
                load_failures.append((date_key, repr(e)))
                continue
            all_data[date_key] = entry

print(f'Loaded {len(all_data)} days')
if load_failures:
    print(f'WARNING: failed to load {len(load_failures)} days; first: {load_failures[0]}')

Repository root: /Users/mahiguch/dev/boatrace/data


Loaded 3689 days


### データ件数の年別確認

In [8]:
from collections import Counter
# Count loaded days per year. Keys of all_data are 'YYYY-MM-DD' strings, so
# the first four characters are the year.
year_counts = Counter(k[:4] for k in all_data.keys())
for y in sorted(year_counts):
    print(f'{y}: {year_counts[y]} days')

2016: 364 days
2017: 365 days
2018: 363 days
2019: 365 days
2020: 366 days
2021: 365 days
2022: 365 days
2023: 365 days
2024: 366 days
2025: 365 days
2026: 40 days


## データ統合と特徴量生成

In [9]:
# Build one per-boat table per day (programs + results [+ previews]) and
# concatenate all days into final_data.
combined_data = []
processed_count = 0
# (date, error) pairs for days that raised while being reshaped or merged.
# Such days are still skipped, but no longer silently.
merge_failures = []

for date_str, data in all_data.items():
    try:
        prog = reshape_programs(data['programs'])
        res = reshape_results(data['results'])

        if prog.empty or res.empty:
            continue

        prog['日次数'] = prog['日次'].apply(extract_day_number)
        prog['レース場'] = prog['レース場'].apply(map_stadium_name_to_number)
        # Drop rows whose venue name could not be mapped to a number.
        prog = prog[prog['レース場'].notna()].reset_index(drop=True)

        if prog.empty:
            continue

        # Merge with results (left join: boats without a result keep NaN '着順')
        merged = prog.merge(
            res[['レースコード', '艇番', '着順']],
            on=['レースコード', '艇番'],
            how='left'
        )

        # Merge with previews if available
        if 'previews' in data:
            prev_long = reshape_previews(data['previews'])
            if not prev_long.empty:
                # Drop the race-level columns that programs already provide
                # so the merge does not create duplicated columns.
                prev_cols_to_use = [c for c in prev_long.columns if c not in ['レース日', 'レース場', 'レース回']]
                merged = merged.merge(
                    prev_long[prev_cols_to_use],
                    on=['レースコード', '艇番'],
                    how='left'
                )

        combined_data.append(merged)
        processed_count += 1
    except Exception as exc:
        merge_failures.append((date_str, repr(exc)))
        continue

print(f'Processed {processed_count} days')
if merge_failures:
    print(f'WARNING: skipped {len(merge_failures)} days due to errors; first: {merge_failures[0]}')

if combined_data:
    final_data = pd.concat(combined_data, ignore_index=True)
    print(f'Final shape: {final_data.shape}')
    # .tolist() yields plain Python ints so the output is not cluttered with np.int64(...).
    print(f'Stadiums: {sorted(final_data["レース場"].dropna().unique().astype(int).tolist())}')
else:
    raise RuntimeError('No data merged')

Processed 3689 days


Final shape: (3348263, 48)
Stadiums: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24)]


### 特徴量生成パイプライン

In [10]:
# Current-series (今節) result features
final_data = compute_konseki_features(final_data)
print(f'今節成績特徴量追加後: {final_data.shape}')

# Within-race relative features
final_data = compute_relative_features(final_data)
print(f'相対特徴量追加後: {final_data.shape}')

# Course (lane) features
final_data = compute_course_features(final_data)
print(f'コース特徴量追加後: {final_data.shape}')

# Label-encode the '級別' column; missing values are replaced by the
# placeholder category '未知' before encoding. The encoder is fitted on
# all rows of final_data.
if '級別' in final_data.columns:
    le_grade = LabelEncoder()
    final_data['級別_encoded'] = le_grade.fit_transform(final_data['級別'].fillna('未知'))
    print('級別エンコード完了')

print(f'\n最終カラム数: {len(final_data.columns)}')

今節成績特徴量追加後: (3348263, 53)


相対特徴量追加後: (3348263, 57)


コース特徴量追加後: (3348263, 62)


級別エンコード完了

最終カラム数: 63


In [11]:
# Parse the race date into a datetime (unparseable values become NaT) and
# derive the year and month columns.
final_data['レース日_dt'] = pd.to_datetime(final_data['レース日'], errors='coerce')
final_data['年'] = final_data['レース日_dt'].dt.year
final_data['月'] = final_data['レース日_dt'].dt.month

print('年別データ件数:')
print(final_data['年'].value_counts().sort_index())

年別データ件数:
年
2016    327967
2017    329649
2018    329018
2019    331702
2020    333260
2021    329538
2022    334538
2023    331904
2024    332668
2025    328908
2026     39111
Name: count, dtype: int64


In [12]:
# Racer history statistics, computed only from rows with 年 <= 2024 so that
# later years do not contribute to them.
# NOTE(review): the aggregates are merged back onto ALL rows of final_data,
# including the 年 <= 2024 rows they were computed from, so each of those
# rows receives statistics that include its own '着順'. Confirm this
# in-sample target leakage is acceptable.
train_mask = final_data['年'] <= 2024
train_subset = final_data[train_mask].copy()

player_stats = compute_player_historical_stats(train_subset)
print(f'選手統計: {len(player_stats)} players')

stadium_player_stats = compute_stadium_player_stats(train_subset)
print(f'選手×レース場統計: {len(stadium_player_stats)} records')

# Merge the statistics (left joins: racers or pairs without history get NaN)
final_data = final_data.merge(player_stats, on='登録番号', how='left')
final_data = final_data.merge(stadium_player_stats, on=['登録番号', 'レース場'], how='left')

print(f'統計マージ後: {final_data.shape}')

# 選手ST統計 (2016-2024のみから計算 → リーク防止)
def compute_player_st_stats(results_data):
    """Compute per-player start timing statistics from results data.

    For each finishing position 1-6 the pair of columns
    '<place>着_スタートタイミング' / '<place>着_登録番号' is read (positions
    lacking either column are skipped). Both are parsed as numbers; a
    record is kept when both values are present and the start timing lies
    in [0, 0.50]. Registration numbers are truncated to ``int``.

    Returns a DataFrame with columns '登録番号', 'ST_mean', 'ST_std'
    (sample standard deviation, 0 for racers with a single record) and
    'ST_min'. When no record qualifies, an empty DataFrame with those
    columns is returned.

    Each column pair is processed as a whole instead of row by row, which
    avoids a Python-level loop over every race.
    """
    result_columns = ['登録番号', 'ST_mean', 'ST_std', 'ST_min']

    parts = []
    for place in range(1, 7):
        st_col = f'{place}着_スタートタイミング'
        reg_col = f'{place}着_登録番号'
        if st_col not in results_data.columns or reg_col not in results_data.columns:
            continue
        st = pd.to_numeric(results_data[st_col], errors='coerce')
        reg = pd.to_numeric(results_data[reg_col], errors='coerce')
        # between() is inclusive on both ends and False for NaN.
        keep = reg.notna() & st.between(0, 0.50)
        if keep.any():
            parts.append(pd.DataFrame({
                '登録番号': reg[keep].astype(int).to_numpy(),
                'ST': st[keep].to_numpy(),
            }))

    if not parts:
        return pd.DataFrame(columns=result_columns)

    st_df = pd.concat(parts, ignore_index=True)
    stats = st_df.groupby('登録番号')['ST'].agg(
        ST_mean='mean', ST_std='std', ST_min='min'
    ).reset_index()
    # std is undefined (NaN) for a single record; report it as 0.
    stats['ST_std'] = stats['ST_std'].fillna(0)
    return stats

# Load results for ST stat computation (train period only: 2016-2024)
import calendar as _cal
st_results_list = []
# Paths of results files that exist but could not be read. Loading stays
# best-effort, but failures are reported instead of being discarded.
st_load_failures = []
for yr in range(2016, 2025):
    for mo in range(1, 13):
        _, max_d = _cal.monthrange(yr, mo)
        for dy in range(1, max_d + 1):
            res_path = repo_root / 'data' / 'results' / str(yr) / f'{mo:02d}' / f'{dy:02d}.csv'
            if not res_path.exists():
                continue
            try:
                st_results_list.append(pd.read_csv(res_path))
            except Exception as exc:
                st_load_failures.append((str(res_path), repr(exc)))
if st_load_failures:
    print(f'WARNING: failed to read {len(st_load_failures)} results files; first: {st_load_failures[0]}')

if st_results_list:
    all_results_for_st = pd.concat(st_results_list, ignore_index=True)
    player_st_stats = compute_player_st_stats(all_results_for_st)
    print(f'選手ST統計: {len(player_st_stats)} players')

    # Merge ST stats (the key is coerced to numeric so it matches the
    # integer registration numbers produced by compute_player_st_stats)
    final_data['登録番号'] = pd.to_numeric(final_data['登録番号'], errors='coerce')
    final_data = final_data.merge(player_st_stats, on='登録番号', how='left')
    # Racers without ST history receive fixed default values.
    final_data['ST_mean'] = final_data['ST_mean'].fillna(0.167)
    final_data['ST_std'] = final_data['ST_std'].fillna(0.068)
    final_data['ST_min'] = final_data['ST_min'].fillna(0.167)
    print(f'ST統計マージ後: {final_data.shape}')
else:
    # NOTE: in this branch final_data gets no ST_* columns at all.
    player_st_stats = pd.DataFrame(columns=['登録番号', 'ST_mean', 'ST_std', 'ST_min'])
    print('WARNING: No results data found for ST stats')


選手統計: 2064 players


選手×レース場統計: 43754 records


統計マージ後: (3348263, 73)


選手ST統計: 2064 players


ST統計マージ後: (3348263, 76)


## 特徴量行列の作成

In [13]:
# Columns excluded from the feature matrix. This includes the target
# ('着順') and the date helper columns ('レース日_dt', '年', '月').
exclude_cols = {
    'レースコード', '日次', 'レース日', 'レース場', 'レース回',
    '艇番', '登録番号', '選手名', '支部', '早見',
    '枠', '着順',
    'モーター番号', 'ボート番号',
    'レース日_dt', '年', '月',
    # Race-level descriptive columns
    'タイトル', 'レース名', '距離(m)', '電話投票締切予定',
}
# Raw current-series result columns are excluded too (their information is
# carried by the derived '今節_*' features).
konseki_raw_cols = {f'今節成績_{i}-{j}' for i in range(1, 7) for j in [1, 2]}
exclude_cols |= konseki_raw_cols

# Non-numeric columns coming from previews
exclude_cols |= {'風向', '天候'}

# '級別' itself is skipped; its encoded form '級別_encoded' is used instead.
categorical_cols = {'級別'}

numeric_cols = []
for col in final_data.columns:
    if col not in exclude_cols and col not in categorical_cols:
        # Keep the column if at least one value can be parsed as a number
        test_vals = pd.to_numeric(final_data[col], errors='coerce')
        if test_vals.notna().sum() > 0:
            numeric_cols.append(col)

feature_cols = numeric_cols.copy()
if '級別_encoded' in final_data.columns and '級別_encoded' not in feature_cols:
    feature_cols.append('級別_encoded')

X = final_data[feature_cols].copy()
for col in X.columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Improved NaN handling
# - place-like columns: filled with 3.5
# - rate-like columns: filled with the column median (0 if the median is NaN)
# - everything else: column median, or 0 when the column is entirely NaN
# NOTE(review): the medians are computed on X before the train/val/test
# split, so they are derived from all periods, not the training period only.
place_cols = {'今節_平均着順', '今節_最新着順', '履歴_平均着順', '当場_平均着順'}
rate_cols = {'全国勝率', '全国2連対率', '当地勝率', '当地2連対率',
             'モーター2連対率', 'ボート2連対率', '今節_3連対率',
             '履歴_1着率', 'イン1着率', '当場_1着率'}

for col in X.columns:
    if col in place_cols:
        X[col] = X[col].fillna(3.5)
    elif col in rate_cols:
        median_val = X[col].median()
        X[col] = X[col].fillna(median_val if pd.notna(median_val) else 0)
    else:
        if X[col].notna().any():
            median_val = X[col].median()
            X[col] = X[col].fillna(median_val)
        else:
            X[col] = X[col].fillna(0)

# Target: finishing place (NaN where no result was merged)
y = final_data['着順']
stadiums = sorted(final_data['レース場'].dropna().unique())

print(f'Features: {len(feature_cols)}')
print(f'Samples: {len(X)}')
print(f'Feature list: {feature_cols}')

Features: 44
Samples: 3348263
Feature list: ['年齢', '体重', '全国勝率', '全国2連対率', '当地勝率', '当地2連対率', 'モーター2連対率', 'ボート2連対率', '日次数', '風速(m)', '波の高さ(cm)', '気温(℃)', '水温(℃)', 'コース', '体重(kg)', '体重調整(kg)', '展示タイム', 'チルト調整', 'スタート展示', '今節_平均着順', '今節_1着回数', '今節_3連対率', '今節_出走回数', '今節_最新着順', '全国勝率_偏差', '全国勝率_最大差', 'モーター2連対率_順位', '当地勝率_偏差', '枠×全国勝率', 'イン有利度', '風速×イン有利度', '強風フラグ', '強風×枠', '級別_encoded', '履歴_平均着順', '履歴_1着率', '履歴_出走回数', 'イン1着率', '当場_平均着順', '当場_1着率', '当場_出走回数', 'ST_mean', 'ST_std', 'ST_min']


### 時系列分割

In [14]:
# Train: 2016-2024, Val: 2025, Test: 2026-01, Final: 2026-02
year = final_data['年']
month = final_data['月']

train_idx = (year <= 2024) & y.notna()
val_idx = (year == 2025) & y.notna()
test_idx = (year == 2026) & (month == 1) & y.notna()
final_idx = (year == 2026) & (month == 2) & y.notna()

print(f'Train: {train_idx.sum():,}')
print(f'Val: {val_idx.sum():,}')
print(f'Test (2026-01): {test_idx.sum():,}')
print(f'Final (2026-02): {final_idx.sum():,}')

X_train, y_train = X[train_idx].reset_index(drop=True), y[train_idx].reset_index(drop=True)
X_val, y_val = X[val_idx].reset_index(drop=True), y[val_idx].reset_index(drop=True)
X_test, y_test = X[test_idx].reset_index(drop=True), y[test_idx].reset_index(drop=True)
X_final, y_final = X[final_idx].reset_index(drop=True), y[final_idx].reset_index(drop=True)

# レースコード (グループ情報)
race_codes_train = final_data.loc[train_idx, 'レースコード'].reset_index(drop=True)
race_codes_val = final_data.loc[val_idx, 'レースコード'].reset_index(drop=True)
race_codes_test = final_data.loc[test_idx, 'レースコード'].reset_index(drop=True)
race_codes_final = final_data.loc[final_idx, 'レースコード'].reset_index(drop=True)

stadium_train = final_data.loc[train_idx, 'レース場'].reset_index(drop=True)
stadium_val = final_data.loc[val_idx, 'レース場'].reset_index(drop=True)
stadium_test = final_data.loc[test_idx, 'レース場'].reset_index(drop=True)
stadium_final = final_data.loc[final_idx, 'レース場'].reset_index(drop=True)

Train: 2,249,410
Val: 252,628
Test (2026-01): 22,128
Final (2026-02): 6,272


## Model 1: LightGBM LambdaRank

In [15]:
def train_lambdarank_models(X_tr, y_tr, race_codes_tr, stadium_tr,
                            X_v, y_v, race_codes_v, stadium_v,
                            params=None):
    """Train one LightGBM LambdaRank model per stadium.

    Args:
        X_tr, y_tr, race_codes_tr, stadium_tr: training features, finishing
            places, race codes and stadium values; they are indexed with the
            same boolean masks, so they must be positionally aligned.
        X_v, y_v, race_codes_v, stadium_v: the same for the validation set.
        params: LightGBM parameter dict; a default LambdaRank configuration
            (NDCG@3) is used when None.

    Returns:
        (models, results): ``models`` maps stadium value -> trained booster;
        ``results`` is a DataFrame with one row per trained stadium and the
        columns 'stadium', 'acc_1st' and 'n_races'. Stadiums with fewer than
        100 training rows or fewer than 10 validation rows are skipped.

    Note:
        The validation set is used both for early stopping and for the
        reported accuracy.
    """
    if params is None:
        params = {
            'objective': 'lambdarank',
            'metric': 'ndcg',
            'eval_at': [3],
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'min_child_samples': 20,
            'lambda_l1': 0.1,
            'lambda_l2': 0.1,
            'verbosity': -1,
        }
    
    models = {}
    results = []
    
    for stadium in sorted(stadium_tr.unique()):
        s_mask_tr = stadium_tr == stadium
        s_mask_v = stadium_v == stadium
        
        X_s_tr = X_tr[s_mask_tr]
        y_s_tr = y_tr[s_mask_tr]
        rc_s_tr = race_codes_tr[s_mask_tr]
        
        X_s_v = X_v[s_mask_v]
        y_s_v = y_v[s_mask_v]
        rc_s_v = race_codes_v[s_mask_v]
        
        if len(X_s_tr) < 100 or len(X_s_v) < 10:
            continue
        
        # relevance = 7 - finishing place (higher is better)
        rel_tr = (7 - y_s_tr).astype(int)
        rel_v = (7 - y_s_v).astype(int)
        
        # Sort rows by race code so that each race forms a contiguous run,
        # which is what the group sizes below describe.
        sort_idx_tr = rc_s_tr.argsort()
        X_s_tr = X_s_tr.iloc[sort_idx_tr].reset_index(drop=True)
        rel_tr = rel_tr.iloc[sort_idx_tr].reset_index(drop=True)
        rc_s_tr = rc_s_tr.iloc[sort_idx_tr].reset_index(drop=True)
        
        sort_idx_v = rc_s_v.argsort()
        X_s_v = X_s_v.iloc[sort_idx_v].reset_index(drop=True)
        rel_v = rel_v.iloc[sort_idx_v].reset_index(drop=True)
        rc_s_v = rc_s_v.iloc[sort_idx_v].reset_index(drop=True)
        
        # Group sizes: rows per race, ordered by race code to match the
        # sorted row order above.
        group_tr = rc_s_tr.value_counts().sort_index().values
        group_v = rc_s_v.value_counts().sort_index().values
        
        train_set = lgb.Dataset(X_s_tr, label=rel_tr, group=group_tr)
        val_set = lgb.Dataset(X_s_v, label=rel_v, group=group_v, reference=train_set)
        
        # Stop when the validation metric has not improved for 50 rounds;
        # per-iteration logging is disabled.
        callbacks = [
            lgb.early_stopping(50, verbose=False),
            lgb.log_evaluation(0),
        ]
        
        model = lgb.train(
            params, train_set,
            num_boost_round=500,
            valid_sets=[val_set],
            callbacks=callbacks,
        )
        
        models[stadium] = model
        
        # Evaluation: share of validation races in which the top-scored row
        # is the row with the highest relevance in that race.
        scores_v = model.predict(X_s_v)
        correct_1st = 0
        total_races = 0
        start = 0
        for gs in group_v:
            group_scores = scores_v[start:start+gs]
            group_actual = rel_v.iloc[start:start+gs].values
            pred_rank = np.argsort(-group_scores)
            actual_best = np.argmax(group_actual)
            if pred_rank[0] == actual_best:
                correct_1st += 1
            total_races += 1
            start += gs
        
        acc = correct_1st / total_races if total_races > 0 else 0
        results.append({'stadium': int(stadium), 'acc_1st': acc, 'n_races': total_races})
    
    return models, pd.DataFrame(results)

ranking_models, ranking_results = train_lambdarank_models(
    X_train, y_train, race_codes_train, stadium_train,
    X_val, y_val, race_codes_val, stadium_val
)

print('=== LambdaRank Results (Validation) ===')
print(ranking_results.to_string(index=False))
print(f'\nAverage 1st accuracy: {ranking_results["acc_1st"].mean():.3f}')

=== LambdaRank Results (Validation) ===
 stadium  acc_1st  n_races
       1 0.507330     2251
       2 0.459448     2466
       3 0.479120     2227
       4 0.487657     1985
       5 0.513136     2360
       6 0.529019     2395
       7 0.575821     2374
       8 0.580056     2517
       9 0.570530     2226
      10 0.540235     2299
      11 0.535155     2233
      12 0.583866     2504
      13 0.453429      977
      14 0.557664     1370
      15 0.578804     1104
      16 0.560502      876
      17 0.608365     1052
      18 0.593152      993
      19 0.574906     1068
      20 0.573517     1231
      21 0.592233      927
      22 0.483598      945
      23 0.558089     1842
      24 0.641296     2470

Average 1st accuracy: 0.547


## Model 2: LightGBM Classifier

In [16]:
def train_classifier_models(X_tr, y_tr, stadium_tr, X_v, y_v, stadium_v):
    """Train one LightGBM classifier per stadium on the finishing place.

    For each stadium value in ``stadium_tr`` (skipping those with fewer than
    100 training rows or fewer than 10 validation rows) the features are
    standardized with a scaler fitted on that stadium's training rows, a
    classifier is fitted with early stopping on the validation rows, and
    the validation accuracy is recorded.

    Returns:
        (models, scalers, results): dicts keyed by stadium value holding the
        fitted classifier and scaler, and a DataFrame with the columns
        'stadium' and 'accuracy'.
    """
    fitted_models = {}
    fitted_scalers = {}
    summary = []

    for venue in sorted(stadium_tr.unique()):
        in_train = stadium_tr == venue
        in_val = stadium_v == venue

        train_X = X_tr[in_train].reset_index(drop=True)
        train_y = y_tr[in_train].reset_index(drop=True)
        val_X = X_v[in_val].reset_index(drop=True)
        val_y = y_v[in_val].reset_index(drop=True)

        too_small = len(train_X) < 100 or len(val_X) < 10
        if too_small:
            continue

        venue_scaler = StandardScaler()
        train_scaled = venue_scaler.fit_transform(train_X)
        val_scaled = venue_scaler.transform(val_X)

        hyperparams = dict(
            n_estimators=300,
            num_leaves=31,
            learning_rate=0.05,
            feature_fraction=0.8,
            bagging_fraction=0.8,
            bagging_freq=5,
            min_child_samples=20,
            verbosity=-1,
            random_state=42,
        )
        venue_model = lgb.LGBMClassifier(**hyperparams)

        # Early stopping after 50 rounds without improvement; logging off.
        stop_early = lgb.early_stopping(50, verbose=False)
        silence = lgb.log_evaluation(0)
        venue_model.fit(
            train_scaled, train_y,
            eval_set=[(val_scaled, val_y)],
            callbacks=[stop_early, silence],
        )

        fitted_models[venue] = venue_model
        fitted_scalers[venue] = venue_scaler

        predicted = venue_model.predict(val_scaled)
        summary.append({'stadium': int(venue), 'accuracy': accuracy_score(val_y, predicted)})

    return fitted_models, fitted_scalers, pd.DataFrame(summary)

classifier_models, classifier_scalers, classifier_results = train_classifier_models(
    X_train, y_train, stadium_train,
    X_val, y_val, stadium_val
)

print('=== LightGBM Classifier Results (Validation) ===')
print(classifier_results.to_string(index=False))
print(f'\nAverage accuracy: {classifier_results["accuracy"].mean():.3f}')

=== LightGBM Classifier Results (Validation) ===
 stadium  accuracy
       1  0.306116
       2  0.296261
       3  0.296050
       4  0.297634
       5  0.309997
       6  0.316669
       7  0.338697
       8  0.331925
       9  0.327076
      10  0.323195
      11  0.314739
      12  0.324839
      13  0.259758
      14  0.283070
      15  0.282439
      16  0.286377
      17  0.309608
      18  0.297445
      19  0.275135
      20  0.293025
      21  0.303578
      22  0.261794
      23  0.324475
      24  0.345973

Average accuracy: 0.304


## Model 3: GBC ベースライン (時系列分割)

In [17]:
def train_gbc_models(X_tr, y_tr, stadium_tr, X_v, y_v, stadium_v):
    """Train one scikit-learn GradientBoostingClassifier per stadium.

    For each stadium value in ``stadium_tr`` (skipping those with fewer than
    100 training rows or fewer than 10 validation rows) the features are
    standardized with a scaler fitted on that stadium's training rows, the
    classifier is fitted on the training rows, and its accuracy on the
    validation rows is recorded.

    Returns:
        (models, scalers, results): dicts keyed by stadium value holding the
        fitted classifier and scaler, and a DataFrame with the columns
        'stadium' and 'accuracy'.
    """
    fitted_models = {}
    fitted_scalers = {}
    summary = []

    for venue in sorted(stadium_tr.unique()):
        in_train = stadium_tr == venue
        in_val = stadium_v == venue

        train_X = X_tr[in_train].reset_index(drop=True)
        train_y = y_tr[in_train].reset_index(drop=True)
        val_X = X_v[in_val].reset_index(drop=True)
        val_y = y_v[in_val].reset_index(drop=True)

        too_small = len(train_X) < 100 or len(val_X) < 10
        if too_small:
            continue

        venue_scaler = StandardScaler()
        train_scaled = venue_scaler.fit_transform(train_X)
        val_scaled = venue_scaler.transform(val_X)

        venue_model = GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42,
        )
        venue_model.fit(train_scaled, train_y)

        fitted_models[venue] = venue_model
        fitted_scalers[venue] = venue_scaler

        predicted = venue_model.predict(val_scaled)
        summary.append({'stadium': int(venue), 'accuracy': accuracy_score(val_y, predicted)})

    return fitted_models, fitted_scalers, pd.DataFrame(summary)

gbc_models, gbc_scalers, gbc_results = train_gbc_models(
    X_train, y_train, stadium_train,
    X_val, y_val, stadium_val
)

print('=== GBC Baseline Results (Validation) ===')
print(gbc_results.to_string(index=False))
print(f'\nAverage accuracy: {gbc_results["accuracy"].mean():.3f}')

=== GBC Baseline Results (Validation) ===
 stadium  accuracy
       1  0.305366
       2  0.295370
       3  0.297121
       4  0.301136
       5  0.312500
       6  0.314342
       7  0.334781
       8  0.330784
       9  0.325486
      10  0.323342
      11  0.314210
      12  0.322677
      13  0.258031
      14  0.279738
      15  0.284732
      16  0.286957
      17  0.311857
      18  0.296422
      19  0.275135
      20  0.285596
      21  0.292808
      22  0.260722
      23  0.323834
      24  0.341745

Average accuracy: 0.303


## Optuna チューニング (LambdaRank)

In [18]:
def _prepare_rank_inputs(X, y, race_codes):
    """Sort rows by race code and return ``(features, relevance, group_sizes)``."""
    # Relevance is 7 - y, so a smaller y (presumably the finishing place,
    # 1-6 -- confirm against the label construction) gets a larger relevance.
    relevance = (7 - y).astype(int)

    # LightGBM expects the rows of one query (race) to be contiguous.
    order = race_codes.argsort()
    X_sorted = X.iloc[order].reset_index(drop=True)
    rel_sorted = relevance.iloc[order].reset_index(drop=True)
    rc_sorted = race_codes.iloc[order].reset_index(drop=True)

    # sort_index() lists the per-race counts in the same ascending race order
    # as the sorted rows.
    group_sizes = rc_sorted.value_counts().sort_index().values
    return X_sorted, rel_sorted, group_sizes


def optuna_lambdarank(X_tr, y_tr, rc_tr, X_v, y_v, rc_v, n_trials=30, seed=None):
    """Search LambdaRank hyperparameters on the data of all stadiums combined.

    Args:
        X_tr, y_tr, rc_tr: Training features, labels and race codes.
        X_v, y_v, rc_v: Validation features, labels and race codes.
        n_trials: Number of Optuna trials.
        seed: Optional seed for the TPE sampler. ``None`` (the default) keeps
            the previous unseeded behaviour; pass an int to make the search
            reproducible across runs.

    Returns:
        ``(best_params, study)`` where ``best_params`` holds only the tuned
        values and ``study`` is the finished Optuna study (maximising NDCG@3
        on the validation set).
    """
    X_tr_s, rel_tr_s, group_tr = _prepare_rank_inputs(X_tr, y_tr, rc_tr)
    X_v_s, rel_v_s, group_v = _prepare_rank_inputs(X_v, y_v, rc_v)

    # The datasets are built once and shared by every trial.
    train_set = lgb.Dataset(X_tr_s, label=rel_tr_s, group=group_tr)
    val_set = lgb.Dataset(X_v_s, label=rel_v_s, group=group_v, reference=train_set)

    def objective(trial):
        params = {
            'objective': 'lambdarank',
            'metric': 'ndcg',
            'eval_at': [3],
            'num_leaves': trial.suggest_int('num_leaves', 15, 63),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
            'bagging_freq': 5,
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),
            'verbosity': -1,
            # min_child_samples changes between trials while the Dataset is
            # reused, so feature pre-filtering is switched off.
            'feature_pre_filter': False,
        }

        callbacks = [
            lgb.early_stopping(50, verbose=False),
            lgb.log_evaluation(0),
        ]

        model = lgb.train(
            params, train_set,
            num_boost_round=500,
            valid_sets=[val_set],
            callbacks=callbacks,
        )

        # Validation NDCG@3 at the best iteration is the trial's score.
        return model.best_score['valid_0']['ndcg@3']

    # TPE is Optuna's default single-objective sampler; building it explicitly
    # only adds the ability to seed it.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    return study.best_params, study

# Run the 30-trial hyperparameter search on the training/validation split and
# report the best trial.
best_params, study = optuna_lambdarank(
    X_train, y_train, race_codes_train,
    X_val, y_val, race_codes_val,
    n_trials=30
)
print(f'Best NDCG@3: {study.best_value:.4f}')
print(f'Best params: {best_params}')

  0%|          | 0/30 [00:00<?, ?it/s]

Best NDCG@3: 0.7447
Best params: {'num_leaves': 45, 'learning_rate': 0.029379067226175583, 'min_child_samples': 21, 'feature_fraction': 0.7063282398260761, 'bagging_fraction': 0.8158706296559669, 'lambda_l1': 3.379804008090816, 'lambda_l2': 0.13737266883917645}


### 最適パラメータで再学習

In [19]:
# Retrain the per-stadium models with the tuned parameters.
# Fixed settings come first; the tuned values from the search are merged on top.
opt_params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'eval_at': [3],
    'verbosity': -1,
    'bagging_freq': 5,
}
opt_params.update(best_params)

# NOTE(review): train_lambdarank_models is defined earlier in the notebook;
# it is assumed to return (models by stadium, per-stadium results) -- confirm.
ranking_models_opt, ranking_results_opt = train_lambdarank_models(
    X_train, y_train, race_codes_train, stadium_train,
    X_val, y_val, race_codes_val, stadium_val,
    params=opt_params
)

print('=== Optimized LambdaRank Results (Validation) ===')
print(ranking_results_opt.to_string(index=False))
print(f'\nAverage 1st accuracy: {ranking_results_opt["acc_1st"].mean():.3f}')

# Use the tuned models only if they improve the mean acc_1st over the result
# rows; otherwise the existing ranking_models are left untouched.
if ranking_results_opt['acc_1st'].mean() > ranking_results['acc_1st'].mean():
    print('\nOptimized model is better, using it.')
    ranking_models = ranking_models_opt
else:
    print('\nDefault model is better, keeping it.')

=== Optimized LambdaRank Results (Validation) ===
 stadium  acc_1st  n_races
       1 0.512217     2251
       2 0.458637     2466
       3 0.484508     2227
       4 0.487154     1985
       5 0.527119     2360
       6 0.536117     2395
       7 0.579191     2374
       8 0.582042     2517
       9 0.570979     2226
      10 0.543715     2299
      11 0.543663     2233
      12 0.588658     2504
      13 0.454452      977
      14 0.572993     1370
      15 0.584239     1104
      16 0.552511      876
      17 0.603612     1052
      18 0.599194      993
      19 0.579588     1068
      20 0.576767     1231
      21 0.593312      927
      22 0.496296      945
      23 0.560803     1842
      24 0.639676     2470

Average 1st accuracy: 0.551

Optimized model is better, using it.


## アンサンブル

In [20]:
def _expected_place_scores(model, scaler, X):
    """Score rows with a probabilistic classifier as the negated expected place."""
    proba = model.predict_proba(scaler.transform(X))
    # A lower expected place is better, so negate it to get "higher is better".
    return -(proba @ model.classes_.astype(float))


def _race_row_groups(race_codes):
    """Return ``(race_ids, row_groups)`` in order of first appearance.

    ``row_groups[i]`` holds the ascending row positions of ``race_ids[i]``.
    Rows whose race code is missing belong to no group.
    """
    codes, uniques = pd.factorize(race_codes)
    rows = np.flatnonzero(codes >= 0)  # factorize marks missing codes with -1
    if rows.size == 0:
        return np.asarray(uniques), []
    # A stable sort keeps the rows of one race in their original order.
    rows = rows[np.argsort(codes[rows], kind='stable')]
    cuts = np.flatnonzero(np.diff(codes[rows])) + 1
    return np.asarray(uniques), np.split(rows, cuts)


def _minmax_within_races(values, row_groups):
    """Min-max scale ``values`` inside each race; constant races become 0.5."""
    scaled = np.full(len(values), 0.5)
    for rows in row_groups:
        vals = values[rows]
        vmin, vmax = vals.min(), vals.max()
        if vmax > vmin:
            scaled[rows] = (vals - vmin) / (vmax - vmin)
    return scaled


def compute_ensemble_scores(X_data, y_data, race_codes, stadium_data,
                            ranking_models, classifier_models, classifier_scalers,
                            gbc_models, gbc_scalers,
                            weights=(0.5, 0.3, 0.2)):
    """Blend the three per-stadium models and score the predicted order per race.

    For every stadium the LambdaRank score and the negated expected place of
    the two classifiers are min-max scaled within each race, combined with
    ``weights`` and compared against the actual order.

    Args:
        X_data, y_data, race_codes, stadium_data: Row-aligned features, labels,
            race codes and stadium ids.
        ranking_models: Per-stadium rankers exposing ``predict``.
        classifier_models, classifier_scalers: Per-stadium classifiers
            (``predict_proba`` / ``classes_``) and their scalers.
        gbc_models, gbc_scalers: Same for the GBC baseline.
        weights: ``(w_rank, w_cls, w_gbc)`` blend weights.

    Returns:
        DataFrame with one row per evaluated race and the columns ``stadium``,
        ``race_code``, ``hit_1``, ``hit_2``, ``hit_3`` and ``hit_sanrentan``.
        Stadiums without any model, races with fewer than 6 rows, races with a
        NaN label and rows without a race code are not evaluated.
    """
    w_rank, w_cls, w_gbc = weights

    results = []

    for stadium in sorted(stadium_data.unique()):
        has_rank = stadium in ranking_models
        has_cls = stadium in classifier_models
        has_gbc = stadium in gbc_models

        if not (has_rank or has_cls or has_gbc):
            continue

        s_mask = stadium_data == stadium
        X_s = X_data[s_mask].reset_index(drop=True)
        y_arr = y_data[s_mask].to_numpy()
        rc_s = race_codes[s_mask].reset_index(drop=True)

        # Group the rows by race once; both the scaling and the evaluation
        # below reuse these groups.
        race_ids, row_groups = _race_row_groups(rc_s)

        # A missing model contributes the same neutral value to every row, so
        # it cannot change the order inside a race.
        neutral = np.full(len(X_s), 0.5)

        if has_rank:
            # Convert to float so integer-typed outputs are not truncated when
            # scaled, and the model's own array is never modified.
            raw_rank = np.asarray(ranking_models[stadium].predict(X_s), dtype=float)
            rank_scores = _minmax_within_races(raw_rank, row_groups)
        else:
            rank_scores = neutral

        if has_cls:
            raw_cls = _expected_place_scores(
                classifier_models[stadium], classifier_scalers[stadium], X_s
            )
            cls_scores = _minmax_within_races(raw_cls, row_groups)
        else:
            cls_scores = neutral

        if has_gbc:
            raw_gbc = _expected_place_scores(
                gbc_models[stadium], gbc_scalers[stadium], X_s
            )
            gbc_scores = _minmax_within_races(raw_gbc, row_groups)
        else:
            gbc_scores = neutral

        ensemble = w_rank * rank_scores + w_cls * cls_scores + w_gbc * gbc_scores

        for rc, rows in zip(race_ids, row_groups):
            group_y = y_arr[rows]

            if len(rows) < 6 or np.isnan(group_y).any():
                continue

            pred_order = np.argsort(-ensemble[rows])
            actual_order = np.argsort(group_y)

            hit_1 = pred_order[0] == actual_order[0]
            hit_2 = pred_order[1] == actual_order[1]
            hit_3 = pred_order[2] == actual_order[2]
            # Trifecta: the first three places all predicted in exact order.
            hit_sanrentan = hit_1 and hit_2 and hit_3

            results.append({
                'stadium': int(stadium),
                'race_code': rc,
                'hit_1': hit_1,
                'hit_2': hit_2,
                'hit_3': hit_3,
                'hit_sanrentan': hit_sanrentan,
            })

    return pd.DataFrame(results)

# Grid-search the ensemble weights on the validation set, keeping the
# candidate with the highest trifecta (三連単) hit rate.
best_w = None
best_sanrentan = 0

weight_candidates = [
    (0.5, 0.3, 0.2), (0.6, 0.2, 0.2), (0.4, 0.4, 0.2),
    (0.4, 0.3, 0.3), (0.7, 0.2, 0.1), (0.5, 0.4, 0.1),
    (0.3, 0.5, 0.2), (0.6, 0.3, 0.1), (0.5, 0.25, 0.25),
    (0.45, 0.35, 0.2), (0.55, 0.25, 0.2), (0.65, 0.2, 0.15),
]

print('Weight search on validation set:')
for w in weight_candidates:
    res = compute_ensemble_scores(
        X_val, y_val, race_codes_val, stadium_val,
        ranking_models, classifier_models, classifier_scalers,
        gbc_models, gbc_scalers,
        weights=w
    )
    if res.empty:
        continue
    hit_rates = {
        '1着': res['hit_1'].mean(),
        '2着': res['hit_2'].mean(),
        '3着': res['hit_3'].mean(),
        '三連単': res['hit_sanrentan'].mean(),
    }
    print(f'  w={w}: 1着={hit_rates["1着"]:.3f} 2着={hit_rates["2着"]:.3f} 3着={hit_rates["3着"]:.3f} 三連単={hit_rates["三連単"]:.3f}')

    # Always accept the first evaluated candidate: with a strict ">" against
    # the initial 0, best_w would stay None whenever every hit rate is 0.0,
    # and the later weights=best_w calls would fail. Ties keep the earlier
    # candidate, as before.
    if best_w is None or hit_rates['三連単'] > best_sanrentan:
        best_sanrentan = hit_rates['三連単']
        best_w = w

print(f'\nBest weights: {best_w}')
print(f'Best 三連単: {best_sanrentan:.3f}')

Weight search on validation set:


  w=(0.5, 0.3, 0.2): 1着=0.547 2着=0.274 3着=0.219 三連単=0.081


  w=(0.6, 0.2, 0.2): 1着=0.547 2着=0.275 3着=0.219 三連単=0.081


  w=(0.4, 0.4, 0.2): 1着=0.547 2着=0.274 3着=0.221 三連単=0.081


  w=(0.4, 0.3, 0.3): 1着=0.546 2着=0.274 3着=0.220 三連単=0.081


  w=(0.7, 0.2, 0.1): 1着=0.548 2着=0.274 3着=0.219 三連単=0.081


  w=(0.5, 0.4, 0.1): 1着=0.548 2着=0.273 3着=0.219 三連単=0.080


  w=(0.3, 0.5, 0.2): 1着=0.546 2着=0.274 3着=0.220 三連単=0.080


  w=(0.6, 0.3, 0.1): 1着=0.548 2着=0.273 3着=0.219 三連単=0.081


  w=(0.5, 0.25, 0.25): 1着=0.547 2着=0.274 3着=0.219 三連単=0.081


  w=(0.45, 0.35, 0.2): 1着=0.547 2着=0.274 3着=0.219 三連単=0.081


  w=(0.55, 0.25, 0.2): 1着=0.548 2着=0.274 3着=0.220 三連単=0.081


  w=(0.65, 0.2, 0.15): 1着=0.548 2着=0.274 3着=0.219 三連単=0.081

Best weights: (0.7, 0.2, 0.1)
Best 三連単: 0.081


## テスト評価 2026-01

In [21]:
def _rows_by_race(race_codes):
    """Return the row positions of every race, in order of first appearance.

    Positions inside a race are ascending; rows without a race code are
    dropped.
    """
    codes, _ = pd.factorize(race_codes)
    rows = np.flatnonzero(codes >= 0)  # factorize marks missing codes with -1
    if rows.size == 0:
        return []
    rows = rows[np.argsort(codes[rows], kind='stable')]
    cuts = np.flatnonzero(np.diff(codes[rows])) + 1
    return np.split(rows, cuts)


def evaluate_model_accuracy(X_data, y_data, race_codes, stadium_data,
                            models, scalers=None, model_type='classifier'):
    """Evaluate the per-race hit rates of a single family of per-stadium models.

    Args:
        X_data, y_data, race_codes, stadium_data: Row-aligned features, labels,
            race codes and stadium ids.
        models: Per-stadium models. ``'lambdarank'`` models need ``predict``;
            ``'classifier'`` / ``'gbc'`` models need ``predict_proba`` and
            ``classes_``.
        scalers: Optional per-stadium scalers applied before ``predict_proba``;
            when empty or ``None`` the features are used as they are.
        model_type: ``'lambdarank'``, ``'classifier'`` or ``'gbc'``.

    Returns:
        DataFrame with one row per evaluated race and the boolean columns
        ``hit_1``, ``hit_2``, ``hit_3`` and ``hit_sanrentan``. Stadiums without
        a model, races with fewer than 6 rows and races with a NaN label are
        skipped.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    if model_type not in ('lambdarank', 'classifier', 'gbc'):
        raise ValueError(
            f"model_type must be 'lambdarank', 'classifier' or 'gbc', got {model_type!r}"
        )

    results = []

    for stadium in sorted(stadium_data.unique()):
        if stadium not in models:
            continue

        s_mask = stadium_data == stadium
        X_s = X_data[s_mask].reset_index(drop=True)
        y_arr = y_data[s_mask].to_numpy()
        rc_s = race_codes[s_mask].reset_index(drop=True)

        model = models[stadium]

        # Predict once for the whole stadium instead of once per race;
        # ``order_key`` is built so that ascending order means "predicted best
        # first".
        if model_type == 'lambdarank':
            order_key = -np.asarray(model.predict(X_s), dtype=float)
        else:
            # 'classifier' and 'gbc' are scored the same way: expected place.
            X_in = scalers[stadium].transform(X_s) if scalers else X_s
            proba = model.predict_proba(X_in)
            order_key = proba @ model.classes_.astype(float)

        for rows in _rows_by_race(rc_s):
            y_rc = y_arr[rows]

            if len(rows) < 6 or np.isnan(y_rc).any():
                continue

            actual_order = np.argsort(y_rc)
            pred_order = np.argsort(order_key[rows])

            hit_1 = pred_order[0] == actual_order[0]
            hit_2 = pred_order[1] == actual_order[1]
            hit_3 = pred_order[2] == actual_order[2]
            # Trifecta: the first three places all predicted in exact order.
            hit_sanrentan = hit_1 and hit_2 and hit_3

            results.append({
                'hit_1': hit_1, 'hit_2': hit_2, 'hit_3': hit_3,
                'hit_sanrentan': hit_sanrentan,
            })

    return pd.DataFrame(results)

print('Evaluation function ready')

Evaluation function ready


In [22]:
# Compare the four models on the 2026-01 test data
print('=== 2026-01 テストデータ評価 ===\n')

# Model 1: LambdaRank
res_rank = evaluate_model_accuracy(
    X_test, y_test, race_codes_test, stadium_test,
    ranking_models, model_type='lambdarank',
)

# Model 2: LightGBM Classifier
res_cls = evaluate_model_accuracy(
    X_test, y_test, race_codes_test, stadium_test,
    classifier_models, classifier_scalers, model_type='classifier',
)

# Model 3: GBC Baseline
res_gbc = evaluate_model_accuracy(
    X_test, y_test, race_codes_test, stadium_test,
    gbc_models, gbc_scalers, model_type='gbc',
)

# Model 4: Ensemble
res_ens = compute_ensemble_scores(
    X_test, y_test, race_codes_test, stadium_test,
    ranking_models, classifier_models, classifier_scalers,
    gbc_models, gbc_scalers,
    weights=best_w,
)

# One table row per model, one column per hit-rate metric; a model without
# any evaluated race is reported as 0.
model_results = [
    ('LambdaRank', res_rank),
    ('LGBMClassifier', res_cls),
    ('GBC Baseline', res_gbc),
    ('Ensemble', res_ens),
]
metric_labels = {
    '1着': 'hit_1',
    '2着': 'hit_2',
    '3着': 'hit_3',
    '三連単': 'hit_sanrentan',
}

comparison = pd.DataFrame({
    'Model': [model_name for model_name, _ in model_results],
    **{
        label: [
            frame[hit_col].mean() if not frame.empty else 0
            for _, frame in model_results
        ]
        for label, hit_col in metric_labels.items()
    },
})

# Show the rates as percentages
for col in metric_labels:
    comparison[col] = (comparison[col] * 100).round(1).astype(str) + '%'

print(comparison.to_string(index=False))
print(f'\n現行性能参考: 1着 53.8%, 2着 25.2%, 3着 19.8%, 三連単 6.7%')

=== 2026-01 テストデータ評価 ===



         Model    1着    2着    3着  三連単
    LambdaRank 54.0% 26.2% 21.1% 7.3%
LGBMClassifier 52.3% 25.7% 21.4% 6.7%
  GBC Baseline 52.0% 25.1% 20.7% 6.6%
      Ensemble 53.3% 25.8% 21.3% 7.1%

現行性能参考: 1着 53.8%, 2着 25.2%, 3着 19.8%, 三連単 6.7%


### レース場別の詳細比較

In [23]:
# Ensemble hit rates broken down by stadium
if not res_ens.empty:
    rate_cols = ['hit_1', 'hit_2', 'hit_3', 'hit_sanrentan']

    per_stadium = res_ens.groupby('stadium').agg(
        n_races=('hit_1', 'count'),
        **{rate_col: (rate_col, 'mean') for rate_col in rate_cols},
    )
    stadium_detail = per_stadium.reset_index()

    # Rates are shown as percentages with one decimal
    for col in rate_cols:
        stadium_detail[col] = (stadium_detail[col] * 100).round(1)

    print('=== アンサンブル レース場別的中率 (2026-01) ===')
    print(stadium_detail.to_string(index=False))

=== アンサンブル レース場別的中率 (2026-01) ===
 stadium  n_races  hit_1  hit_2  hit_3  hit_sanrentan
       1      193   50.8   30.1   19.2            8.3
       2      166   47.6   27.7   20.5            9.0
       3      156   42.3   26.3   14.1            3.8
       4      187   44.4   21.4   18.2            5.9
       5      157   51.0   25.5   20.4            7.6
       6      178   53.4   28.7   24.2           10.7
       7      199   56.3   27.6   26.6            7.0
       8       80   51.2   25.0   18.8            6.2
       9      147   55.1   29.9   26.5           10.9
      10      144   47.9   19.4   22.2            7.6
      11      156   54.5   28.8   22.4            5.8
      12      177   59.3   26.0   26.6            9.0
      13      118   46.6   31.4   24.6            5.1
      14       93   51.6   28.0   19.4            7.5
      15      133   57.1   22.6   24.8            6.0
      16       92   58.7   27.2   19.6            5.4
      17      124   58.9   28.2   19.4          

## 最終検証 2026-02

In [24]:
# Final check of the ensemble on the 2026-02 data
if not final_idx.sum() > 0:
    print('2026-02 データなし')
else:
    res_ens_final = compute_ensemble_scores(
        X_final, y_final, race_codes_final, stadium_final,
        ranking_models, classifier_models, classifier_scalers,
        gbc_models, gbc_scalers,
        weights=best_w,
    )

    if res_ens_final.empty:
        # Rows exist, but no race could be evaluated
        print('2026-02 データが不足しています')
    else:
        print('=== 2026-02 最終検証 (Ensemble) ===')
        print(f'1着: {res_ens_final["hit_1"].mean()*100:.1f}%')
        print(f'2着: {res_ens_final["hit_2"].mean()*100:.1f}%')
        print(f'3着: {res_ens_final["hit_3"].mean()*100:.1f}%')
        print(f'三連単: {res_ens_final["hit_sanrentan"].mean()*100:.1f}%')
        print(f'\nレース数: {len(res_ens_final)}')

=== 2026-02 最終検証 (Ensemble) ===
1着: 52.8%
2着: 25.2%
3着: 19.1%
三連単: 5.7%

レース数: 978


## 特徴量重要度

In [25]:
# Average the LambdaRank feature importance (gain) over all stadium models
importance_list = [
    booster.feature_importance(importance_type='gain')
    for booster in ranking_models.values()
]

if not importance_list:
    print('No ranking models available')
else:
    avg_importance = np.mean(importance_list, axis=0)
    # NOTE(review): assumes every model was trained on feature_cols in this
    # exact order -- confirm against the training cell.
    importance_table = pd.DataFrame({
        'feature': feature_cols,
        'importance': avg_importance,
    })
    feat_imp = importance_table.sort_values('importance', ascending=False)

    print('=== Top 20 Feature Importance (LambdaRank, gain) ===')
    print(feat_imp.head(20).to_string(index=False))

=== Top 20 Feature Importance (LambdaRank, gain) ===
    feature   importance
        コース 49633.818466
      イン有利度 32808.053749
    全国勝率_偏差 15394.140736
     当場_1着率  5861.726003
   風速×イン有利度  5471.695149
   全国勝率_最大差  4249.662043
    当場_平均着順  3723.808054
      イン1着率  1927.519437
     枠×全国勝率  1259.215818
      展示タイム   867.453566
     履歴_1着率   655.195059
モーター2連対率_順位   510.167777
     体重(kg)   507.197625
      風速(m)   410.250382
   モーター2連対率   394.968864
     スタート展示   333.610524
    当地勝率_偏差   314.332907
    今節_平均着順   136.112470
     全国2連対率   130.622422
    履歴_平均着順   120.541385


## モデル保存

In [26]:
# Save everything to models/program_models_v2.pkl
# estimate.py compatible layout: {stadium: {'model': clf, 'scaler': scaler, 'features': [list]}}
save_dict = {}

for stadium in stadiums:
    entry = {'features': feature_cols}

    # Primary model (estimate.py compatibility): the LightGBM classifier when
    # available, otherwise the GBC baseline
    if stadium in classifier_models:
        entry['model'] = classifier_models[stadium]
        entry['scaler'] = classifier_scalers[stadium]
    elif stadium in gbc_models:
        entry['model'] = gbc_models[stadium]
        entry['scaler'] = gbc_scalers[stadium]

    # Additional models
    if stadium in ranking_models:
        entry['ranking_model'] = ranking_models[stadium]
    if stadium in classifier_models:
        entry['classifier_model'] = classifier_models[stadium]
        entry['classifier_scaler'] = classifier_scalers[stadium]
    # The saved ensemble weights cover three models (rank, classifier, GBC),
    # so the GBC model is stored under its own keys as well; otherwise the
    # ensemble could not be rebuilt for stadiums that have a classifier.
    if stadium in gbc_models:
        entry['gbc_model'] = gbc_models[stadium]
        entry['gbc_scaler'] = gbc_scalers[stadium]

    save_dict[stadium] = entry

# Ensemble weights and stats (keys starting with "_" are not stadiums)
save_dict['_ensemble_weights'] = best_w
save_dict['_player_stats'] = player_stats
save_dict['_stadium_player_stats'] = stadium_player_stats
save_dict['_player_st_stats'] = player_st_stats

model_save_path = repo_root / 'models' / 'program_models_v2.pkl'
model_save_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_save_path, 'wb') as f:
    pickle.dump(save_dict, f)

print(f'Saved to {model_save_path}')
print(f'Keys: {[k for k in save_dict.keys() if not str(k).startswith("_")]}')
print(f'Ensemble weights: {best_w}')

Saved to /Users/mahiguch/dev/boatrace/data/models/program_models_v2.pkl
Keys: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24)]
Ensemble weights: (0.7, 0.2, 0.1)
