# Previews データ予測モデル

対象：2016年～2025年の過去データから、当日のPreviewsデータを予測

## 実装内容
1. **展示タイム予測モデル**（GradientBoostingRegressor） ★★★★★
2. **進入コース予測モデル**（GradientBoostingClassifier） ★★★★★
3. **スタート展示予測モデル**（GradientBoostingRegressor） ★★★★☆
4. **チルト調整予測モデル**（GradientBoostingRegressor） ★★★☆☆

## 出力
予測Previewsデータを `data/prediction-preview/YYYY/MM/DD.csv` に保存

## 目標精度
- 展示タイム: MAE < 0.05秒
- 進入コース: 的中率 > 70%
- スタート展示: MAE < 0.10秒
- チルト調整: 的中率 > 60%

## セットアップ

In [1]:
from pathlib import Path
import calendar
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
import pickle
import warnings
warnings.filterwarnings('ignore')

cwd = Path.cwd()
repo_root = cwd if (cwd / 'data').exists() else cwd.parent.parent

print(f'Repository root: {repo_root}')
print('Setup complete')

Repository root: /Users/mahiguch/dev/boatrace/data
Setup complete


## データ変形関数の定義

In [2]:
def reshape_programs(df):
    """
    Programs を艇単位に変形
    Programs の枠 (1枠_～) を艇番として扱う
    """
    frames = []
    race_cols = ['レースコード', 'レース日', 'レース場', 'レース回']
    
    for frame in range(1, 7):
        prefix = f'{frame}枠_'
        cols = [c for c in df.columns if c.startswith(prefix)]
        if cols:
            tmp = df[race_cols + cols].copy()
            tmp.columns = race_cols + [c[len(prefix):] for c in cols]
            tmp['艇番'] = frame  # 枠番号 = 艇番
            frames.append(tmp)
    
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

def reshape_previews(df):
    """
    Previews を艇単位に変形
    各艇の情報を1行に集約
    """
    frames = []
    
    race_cols = ['レースコード', 'レース日', 'レース場', 'レース回']
    race_attrs = ['風速(m)', '風向', '波の高さ(cm)', '天候', '気温(℃)', '水温(℃)']
    
    for boat in range(1, 7):
        prefix = f'艇{boat}_'
        boat_cols = [c for c in df.columns if c.startswith(prefix)]
        if boat_cols:
            tmp = df[race_cols + race_attrs + boat_cols].copy()
            boat_col_names = [c[len(prefix):] for c in boat_cols]
            tmp.columns = race_cols + race_attrs + boat_col_names
            tmp['艇番'] = boat
            frames.append(tmp)
    
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

def reshape_results(df):
    """
    Results を艇単位に変形
    着順情報を艇番とマッチング
    """
    result_list = []
    
    for idx, row in df.iterrows():
        race_code = row['レースコード']
        
        for place in range(1, 7):
            boat_col = f'{place}着_艇番'
            if boat_col in df.columns and pd.notna(row[boat_col]):
                try:
                    boat_num = int(row[boat_col])
                    if boat_num < 1 or boat_num > 6:
                        continue
                    result_list.append({
                        'レースコード': race_code,
                        '艇番': boat_num,
                        '着順': place
                    })
                except (ValueError, TypeError):
                    continue
    
    return pd.DataFrame(result_list) if result_list else pd.DataFrame()

print('Reshape functions ready')

Reshape functions ready


## 2016～2025年の過去データで特徴量を抽出

### 1. データ読み込み

In [3]:
# Load data for 2025 only
all_data = {}
year = '2025'

for month in range(1, 13):
    _, max_day = calendar.monthrange(int(year), month)
    for day in range(1, max_day + 1):
        month_str = f'{month:02d}'
        day_str = f'{day:02d}'
        prog_path = repo_root / 'data' / 'programs' / year / month_str / f'{day_str}.csv'
        prev_path = repo_root / 'data' / 'previews' / year / month_str / f'{day_str}.csv'
        res_path = repo_root / 'data' / 'results' / year / month_str / f'{day_str}.csv'
        
        if prog_path.exists() and prev_path.exists() and res_path.exists():
            date_key = f'{year}-{month_str}-{day_str}'
            try:
                all_data[date_key] = {
                    'programs': pd.read_csv(prog_path),
                    'previews': pd.read_csv(prev_path),
                    'results': pd.read_csv(res_path)
                }
            except Exception as e:
                pass

print(f'✓ Loaded {len(all_data)} days (2025 only)')

✓ Loaded 365 days (2025 only)


### 2. Stadium name to number mapping

In [4]:
STADIUM_NAME_TO_NUMBER = {
    'ボートレース桐生': 1,
    'ボートレース戸田': 2,
    'ボートレース江戸川': 3,
    'ボートレース平和島': 4,
    'ボートレース多摩川': 5,
    'ボートレース浜名湖': 6,
    'ボートレース蒲郡': 7,
    'ボートレース常滑': 8,
    'ボートレース津': 9,
    'ボートレース三国': 10,
    'ボートレースびわこ': 11,
    'ボートレース琵琶湖': 11,
    'ボートレース住之江': 12,
    'ボートレース尼崎': 13,
    'ボートレース鳴門': 14,
    'ボートレース丸亀': 15,
    'ボートレース児島': 16,
    'ボートレース宮島': 17,
    'ボートレース徳山': 18,
    'ボートレース下関': 19,
    'ボートレース若松': 20,
    'ボートレース芦屋': 21,
    'ボートレース福岡': 22,
    'ボートレース唐津': 23,
    'ボートレース大村': 24,
}

def map_stadium_name_to_number(stadium_name):
    if pd.isna(stadium_name):
        return np.nan
    stadium_name = str(stadium_name).strip()
    return STADIUM_NAME_TO_NUMBER.get(stadium_name, np.nan)

print('Stadium mapping ready')

Stadium mapping ready


### 3. データ統合

In [5]:
# Combine programs, previews, and results
# Using exact logic from stadium.ipynb
combined_data = []

for date_str, data in all_data.items():
    try:
        prog = reshape_programs(data['programs'])
        prev = reshape_previews(data['previews'])
        res = reshape_results(data['results'])
        
        if prev.empty or prog.empty or res.empty:
            continue
        
        # Step 1: Merge previews + programs
        # Handle overlapping columns
        prog_cols = set(prog.columns)
        prev_cols = set(prev.columns)
        overlap_cols = prog_cols & prev_cols - {'レースコード', '艇番'}
        
        # Remove overlapping columns from programs (keep previews version)
        prog_to_merge = prog.drop(columns=list(overlap_cols))
        
        merged = prev.merge(
            prog_to_merge,
            on=['レースコード', '艇番'],
            how='left'
        )
        
        if merged.empty:
            continue
        
        # Step 2: Merge with results
        merged = merged.merge(
            res[['レースコード', '艇番', '着順']],
            on=['レースコード', '艇番'],
            how='left'
        )
        
        merged['日付'] = date_str
        combined_data.append(merged)
        
        # Count features (columns not in metadata)
        metadata_cols = {'レースコード', 'レース日', 'レース場', 'レース回', '艇番', '日付', '着順'}
        feature_count = len([c for c in merged.columns if c not in metadata_cols])
        print(f'✓ {date_str}: {merged.shape} (features: {feature_count})')
    except Exception as e:
        print(f'✗ {date_str}: {type(e).__name__}: {str(e)[:80]}')

if combined_data:
    final_data = pd.concat(combined_data, ignore_index=True)
    print(f'\nBefore filtering: {final_data.shape}')
    
    # Remove abnormal exhibition times (0 is invalid)
    initial_count = len(final_data)
    final_data = final_data[final_data['展示タイム'] != 0].reset_index(drop=True)
    removed_count = initial_count - len(final_data)
    
    print(f'Removed rows with 展示タイム = 0: {removed_count}')
    print(f'After filtering: {final_data.shape}')
    print(f'Dates: {final_data["日付"].nunique()}')
    print(f'Stadiums: {final_data["レース場"].nunique()}')
else:
    print('No data merged')

✓ 2025-01-01: (792, 46) (features: 39)
✓ 2025-01-02: (792, 46) (features: 39)
✓ 2025-01-03: (888, 46) (features: 39)
✓ 2025-01-04: (912, 46) (features: 39)
✓ 2025-01-05: (852, 46) (features: 39)
✓ 2025-01-06: (648, 46) (features: 39)
✓ 2025-01-07: (720, 46) (features: 39)
✓ 2025-01-08: (720, 46) (features: 39)
✓ 2025-01-09: (720, 46) (features: 39)
✓ 2025-01-10: (618, 46) (features: 39)
✓ 2025-01-11: (936, 46) (features: 39)
✓ 2025-01-12: (864, 46) (features: 39)
✓ 2025-01-13: (1008, 46) (features: 39)
✓ 2025-01-14: (936, 46) (features: 39)
✓ 2025-01-15: (792, 46) (features: 39)
✓ 2025-01-16: (864, 46) (features: 39)
✓ 2025-01-17: (1074, 46) (features: 39)
✓ 2025-01-18: (1074, 46) (features: 39)
✓ 2025-01-19: (1140, 46) (features: 39)
✓ 2025-01-20: (930, 46) (features: 39)
✓ 2025-01-21: (858, 46) (features: 39)
✓ 2025-01-22: (858, 46) (features: 39)
✓ 2025-01-23: (930, 46) (features: 39)
✓ 2025-01-24: (931, 46) (features: 39)
✓ 2025-01-25: (936, 46) (features: 39)
✓ 2025-01-26: (1080, 

## 展示タイム予測モデル

### 1. 特徴量準備

In [6]:
# Check columns and prepare features
print('=== データ準備 ===\n')
print(f'Final data shape: {final_data.shape}')
print(f'Total columns: {len(final_data.columns)}')

# Check for target variables (Previews data)
target_cols = ['展示タイム', 'コース', 'スタート展示', 'チルト調整']
print(f'\nTarget columns (to predict):')
for col in target_cols:
    if col in final_data.columns:
        non_null = final_data[col].notna().sum()
        total = len(final_data)
        print(f'  ✓ {col}: {final_data[col].dtype}, {non_null}/{total} ({non_null/total*100:.1f}%)')
    else:
        print(f'  ✗ {col}: NOT FOUND')

# Select features from Programs + Environment
exclude_cols = {
    'レースコード', 'レース日', 'レース場', 'レース回', 'タイトル',
    '艇番', '登録番号', '選手名', '支部',
    '着順',  # Result data
    '風向', '天候',  # Categorical - need encoding
    '展示タイム', 'コース', 'スタート展示', 'チルト調整',  # Target variables
    '体重(kg)', '体重調整(kg)',  # Preview-only data (target metadata)
}

feature_cols = [col for col in final_data.columns if col not in exclude_cols]
print(f'\nFeatures for prediction ({len(feature_cols)}):')
for i, col in enumerate(sorted(feature_cols)[:15], 1):  # Show first 15
    print(f'  {i:2d}. {col}')
if len(feature_cols) > 15:
    print(f'  ... and {len(feature_cols) - 15} more')

=== データ準備 ===

Final data shape: (255196, 46)
Total columns: 46

Target columns (to predict):
  ✓ 展示タイム: float64, 251444/255196 (98.5%)
  ✓ コース: float64, 251466/255196 (98.5%)
  ✓ スタート展示: float64, 251437/255196 (98.5%)
  ✓ チルト調整: float64, 251717/255196 (98.6%)

Features for prediction (29):
   1. ボート2連対率
   2. ボート番号
   3. モーター2連対率
   4. モーター番号
   5. 今節成績_1-1
   6. 今節成績_1-2
   7. 今節成績_2-1
   8. 今節成績_2-2
   9. 今節成績_3-1
  10. 今節成績_3-2
  11. 今節成績_4-1
  12. 今節成績_4-2
  13. 今節成績_5-1
  14. 今節成績_5-2
  15. 今節成績_6-1
  ... and 14 more


### 2. 展示タイム予測モデル構築

In [7]:
# Prepare features for exhibition time prediction
print('=== 展示タイム予測モデル準備 ===\n')

if '展示タイム' not in final_data.columns:
    print('ERROR: 展示タイム not found!')
    raise KeyError('展示タイム column missing')

# Prepare X and y
X = final_data[feature_cols].copy()
y = final_data['展示タイム'].copy()

# Convert to numeric and fill NaN
for col in X.columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')

y = pd.to_numeric(y, errors='coerce')

# Fill NaN with column medians
for col in X.columns:
    median_val = X[col].median()
    X[col].fillna(median_val if pd.notna(median_val) else 0, inplace=True)

# Remove rows with missing target
valid = y.notna()
X = X[valid].reset_index(drop=True)
y = y[valid].reset_index(drop=True)

print(f'Features: {len(X.columns)}')
print(f'Samples: {len(X)}')
print(f'Target (展示タイム) stats:')
print(f'  Mean: {y.mean():.3f}s')
print(f'  Std: {y.std():.3f}s')
print(f'  Min: {y.min():.3f}s')
print(f'  Max: {y.max():.3f}s')

=== 展示タイム予測モデル準備 ===

Features: 29
Samples: 251444
Target (展示タイム) stats:
  Mean: 6.816s
  Std: 0.118s
  Min: 6.340s
  Max: 8.670s


### 3. 展示タイム予測モデル - レース場別訓練

In [8]:
# Train exhibition time models per stadium
print('\n=== 展示タイム予測モデル訓練 ===\n')

stadiums = sorted(final_data['レース場'].dropna().unique())
exhibition_models = {}
results_summary = []

for stadium in stadiums:
    stadium_mask = final_data['レース場'] == stadium
    X_std = X[stadium_mask].reset_index(drop=True)
    y_std = y[stadium_mask].reset_index(drop=True)
    
    if len(X_std) < 100:
        print(f'Stadium {int(stadium):2d}: insufficient data ({len(X_std)} samples) - SKIP')
        continue
    
    X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    # Train GBRegressor
    gbr = GradientBoostingRegressor(n_estimators=150, learning_rate=0.05, max_depth=6, subsample=0.8, random_state=42)
    gbr.fit(X_train_s, y_train)
    
    y_pred = gbr.predict(X_test_s)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    exhibition_models[stadium] = {'model': gbr, 'scaler': scaler, 'features': list(X.columns)}
    results_summary.append({'stadium': int(stadium), 'samples': len(X_std), 'mae': mae, 'rmse': rmse})
    
    status = '✓' if mae < 0.05 else '⚠' if mae < 0.10 else '✗'
    print(f'{status} Stadium {int(stadium):2d}: MAE={mae:.4f}s, RMSE={rmse:.4f}s ({len(X_std)} samples)')

if results_summary:
    results_df = pd.DataFrame(results_summary)
    print(f'\n=== Summary ===')
    print(f'Models trained: {len(results_df)}/{len(stadiums)}')
    print(f'Average MAE: {results_df["mae"].mean():.4f}s')
    print(f'Average RMSE: {results_df["rmse"].mean():.4f}s')
else:
    print('ERROR: No models trained!')


=== 展示タイム予測モデル訓練 ===

⚠ Stadium  1: MAE=0.0750s, RMSE=0.0973s (13048 samples)
⚠ Stadium  2: MAE=0.0759s, RMSE=0.0978s (14556 samples)
⚠ Stadium  3: MAE=0.0728s, RMSE=0.0936s (14112 samples)
⚠ Stadium  4: MAE=0.0765s, RMSE=0.0984s (11736 samples)
⚠ Stadium  5: MAE=0.0751s, RMSE=0.0987s (13824 samples)
⚠ Stadium  6: MAE=0.0738s, RMSE=0.0945s (14268 samples)
⚠ Stadium  7: MAE=0.0747s, RMSE=0.0951s (14052 samples)
⚠ Stadium  8: MAE=0.0753s, RMSE=0.1001s (15108 samples)
⚠ Stadium  9: MAE=0.0753s, RMSE=0.0957s (15134 samples)
⚠ Stadium 10: MAE=0.0763s, RMSE=0.0985s (13968 samples)
⚠ Stadium 11: MAE=0.0761s, RMSE=0.0975s (13254 samples)
⚠ Stadium 12: MAE=0.0685s, RMSE=0.0895s (3744 samples)
⚠ Stadium 13: MAE=0.0736s, RMSE=0.0937s (3672 samples)
⚠ Stadium 14: MAE=0.0740s, RMSE=0.0966s (7056 samples)
⚠ Stadium 15: MAE=0.0771s, RMSE=0.0981s (10614 samples)
⚠ Stadium 16: MAE=0.0749s, RMSE=0.0961s (7776 samples)
⚠ Stadium 17: MAE=0.0751s, RMSE=0.0990s (7392 samples)
⚠ Stadium 18: MAE=0.0737s, RMS

## 進入コース予測モデル

In [None]:
# Prepare enhanced features for course entry prediction
print('\n=== 進入コース予測モデル準備（強化特徴量版）===\n')

if 'コース' not in final_data.columns:
    print('ERROR: コース not found!')
    raise KeyError('コース column missing')

# Start with base features (same as exhibition time)
X_course = final_data[feature_cols].copy()
y_course = final_data['コース'].copy()

# Add frame number as a feature (枠番 = boat number)
X_course['枠番'] = final_data['艇番']

# Add player's past course entry tendency
# Calculate the most common course for each player
player_course_tendency = final_data.groupby('登録番号')['コース'].agg(lambda x: x.mode()[0] if len(x.mode()) > 0 else x.median()).reset_index()
player_course_tendency.columns = ['登録番号', 'プレイヤー進入傾向コース']
X_course_temp = final_data[['登録番号']].copy()
X_course_temp = X_course_temp.merge(player_course_tendency, on='登録番号', how='left')
X_course['プレイヤー進入傾向'] = X_course_temp['プレイヤー進入傾向コース']

# Add stadium-level average course pattern
# For each stadium, calculate average course per frame position
stadium_frame_course = final_data.groupby(['レース場', '艇番'])['コース'].mean().reset_index()
stadium_frame_course.columns = ['レース場', '艇番', 'スタジアム枠別平均コース']
X_course_temp = final_data[['レース場', '艇番']].copy()
X_course_temp = X_course_temp.merge(stadium_frame_course, on=['レース場', '艇番'], how='left')
X_course['スタジアム枠別平均'] = X_course_temp['スタジアム枠別平均コース']

# Add player win rate features (interaction with frame)
X_course['全国勝率×枠'] = final_data['全国勝率'].fillna(0) * final_data['艇番']
X_course['当地勝率×枠'] = final_data['当地勝率'].fillna(0) * final_data['艇番']

# Convert to numeric
for col in X_course.columns:
    X_course[col] = pd.to_numeric(X_course[col], errors='coerce')

y_course = pd.to_numeric(y_course, errors='coerce')

# Fill NaN
for col in X_course.columns:
    median_val = X_course[col].median()
    X_course[col].fillna(median_val if pd.notna(median_val) else 0, inplace=True)

# Remove rows with missing target
valid_course = y_course.notna()
X_course = X_course[valid_course].reset_index(drop=True)
y_course = y_course[valid_course].reset_index(drop=True)
y_course = y_course.astype(int)

print(f'Base features: {len(feature_cols)}')
print(f'Added features: 5 (枠番, プレイヤー進入傾向, スタジアム枠別平均, 全国勝率×枠, 当地勝率×枠)')
print(f'Total features: {len(X_course.columns)}')
print(f'Samples: {len(X_course)}')
print(f'Target (コース) distribution:')
for course in sorted(y_course.unique()):
    count = (y_course == course).sum()
    pct = count / len(y_course) * 100
    print(f'  Course {int(course)}: {count} ({pct:.1f}%)')

### 進入コース予測モデル - レース場別訓練

In [None]:
# Train course entry models per stadium - improved hyperparameters
print('\n=== 進入コース予測モデル訓練（改善版）===\n')

course_models = {}
results_summary_course = []

for stadium in stadiums:
    stadium_mask = final_data['レース場'] == stadium
    X_std = X_course[stadium_mask].reset_index(drop=True)
    y_std = y_course[stadium_mask].reset_index(drop=True)
    
    if len(X_std) < 100:
        print(f'Stadium {int(stadium):2d}: insufficient data ({len(X_std)} samples) - SKIP')
        continue
    
    X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    # Train GBClassifier with optimized hyperparameters
    # Increased n_estimators, max_depth, and learning_rate for better feature interaction
    gbc = GradientBoostingClassifier(
        n_estimators=250,           # Increased from 150
        learning_rate=0.1,          # Keep at 0.1 for stability
        max_depth=8,                # Increased from 6 for deeper trees
        min_samples_split=10,       # Prevent overfitting
        min_samples_leaf=5,         # Prevent overfitting
        subsample=0.7,              # Reduce overfitting with sampling
        max_features='sqrt',        # Use sqrt of features
        validation_fraction=0.1,    # Early stopping with validation
        n_iter_no_change=20,        # Early stopping patience
        random_state=42,
        verbose=0
    )
    
    gbc.fit(X_train_s, y_train)
    
    y_pred = gbc.predict(X_test_s)
    acc = accuracy_score(y_test, y_pred)
    
    course_models[stadium] = {'model': gbc, 'scaler': scaler, 'features': list(X_course.columns)}
    results_summary_course.append({'stadium': int(stadium), 'samples': len(X_std), 'accuracy': acc})
    
    status = '✓' if acc > 0.70 else '⚠' if acc > 0.50 else '✗'
    print(f'{status} Stadium {int(stadium):2d}: Accuracy={acc:.1%} ({len(X_std)} samples)')

if results_summary_course:
    results_course_df = pd.DataFrame(results_summary_course)
    print(f'\n=== Summary ===')
    print(f'Models trained: {len(results_course_df)}/{len(stadiums)}')
    print(f'Average Accuracy: {results_course_df["accuracy"].mean():.1%}')
    print(f'Min Accuracy: {results_course_df["accuracy"].min():.1%}')
    print(f'Max Accuracy: {results_course_df["accuracy"].max():.1%}')
else:
    print('ERROR: No course models trained!')

## スタート展示予測モデル

### 1. 特徴量準備

In [None]:
# Prepare enhanced features for start timing prediction
print('\n=== スタート展示予測モデル準備（強化特徴量版）===\n')

if 'スタート展示' not in final_data.columns:
    print('ERROR: スタート展示 not found!')
    raise KeyError('スタート展示 column missing')

# Start with base features
X_start = final_data[feature_cols].copy()
y_start = final_data['スタート展示'].copy()

# Add frame number as a feature
X_start['枠番'] = final_data['艇番']

# Add player's past start timing tendency
player_start_tendency = final_data.groupby('登録番号')['スタート展示'].agg('mean').reset_index()
player_start_tendency.columns = ['登録番号', 'プレイヤースタート平均']
X_start_temp = final_data[['登録番号']].copy()
X_start_temp = X_start_temp.merge(player_start_tendency, on='登録番号', how='left')
X_start['プレイヤースタート傾向'] = X_start_temp['プレイヤースタート平均']

# Add stadium-level average start timing per frame
stadium_frame_start = final_data.groupby(['レース場', '艇番'])['スタート展示'].mean().reset_index()
stadium_frame_start.columns = ['レース場', '艇番', 'スタジアム枠別平均スタート']
X_start_temp = final_data[['レース場', '艇番']].copy()
X_start_temp = X_start_temp.merge(stadium_frame_start, on=['レース場', '艇番'], how='left')
X_start['スタジアム枠別平均'] = X_start_temp['スタジアム枠別平均スタート']

# Add age interaction features
X_start['年齢×枠'] = final_data['年齢'].fillna(0) * final_data['艇番']
X_start['経験年数×枠'] = final_data['経験年数'].fillna(0) * final_data['艇番']

# Convert to numeric
for col in X_start.columns:
    X_start[col] = pd.to_numeric(X_start[col], errors='coerce')

y_start = pd.to_numeric(y_start, errors='coerce')

# Fill NaN
for col in X_start.columns:
    median_val = X_start[col].median()
    X_start[col].fillna(median_val if pd.notna(median_val) else 0, inplace=True)

# Remove rows with missing target
valid_start = y_start.notna()
X_start = X_start[valid_start].reset_index(drop=True)
y_start = y_start[valid_start].reset_index(drop=True)

print(f'Base features: {len(feature_cols)}')
print(f'Added features: 5 (枠番, プレイヤースタート傾向, スタジアム枠別平均, 年齢×枠, 経験年数×枠)')
print(f'Total features: {len(X_start.columns)}')
print(f'Samples: {len(X_start)}')
print(f'Target (スタート展示) stats:')
print(f'  Mean: {y_start.mean():.3f}s')
print(f'  Std: {y_start.std():.3f}s')
print(f'  Min: {y_start.min():.3f}s')
print(f'  Max: {y_start.max():.3f}s')

### 2. スタート展示予測モデル - レース場別訓練

In [None]:
# Train start timing models per stadium
print('\n=== スタート展示予測モデル訓練 ===\n')

start_timing_models = {}
results_summary_start = []

for stadium in stadiums:
    stadium_mask = final_data['レース場'] == stadium
    X_std = X_start[stadium_mask].reset_index(drop=True)
    y_std = y_start[stadium_mask].reset_index(drop=True)
    
    if len(X_std) < 100:
        print(f'Stadium {int(stadium):2d}: insufficient data ({len(X_std)} samples) - SKIP')
        continue
    
    X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    # Train GBRegressor for start timing
    gbr_start = GradientBoostingRegressor(
        n_estimators=150,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        random_state=42
    )
    gbr_start.fit(X_train_s, y_train)
    
    y_pred = gbr_start.predict(X_test_s)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    start_timing_models[stadium] = {'model': gbr_start, 'scaler': scaler, 'features': list(X_start.columns)}
    results_summary_start.append({'stadium': int(stadium), 'samples': len(X_std), 'mae': mae, 'rmse': rmse})
    
    status = '✓' if mae < 0.10 else '⚠' if mae < 0.15 else '✗'
    print(f'{status} Stadium {int(stadium):2d}: MAE={mae:.4f}s, RMSE={rmse:.4f}s ({len(X_std)} samples)')

if results_summary_start:
    results_start_df = pd.DataFrame(results_summary_start)
    print(f'\n=== Summary ===')
    print(f'Models trained: {len(results_start_df)}/{len(stadiums)}')
    print(f'Average MAE: {results_start_df["mae"].mean():.4f}s')
    print(f'Average RMSE: {results_start_df["rmse"].mean():.4f}s')
else:
    print('ERROR: No start timing models trained!')

## チルト調整予測モデル

### 1. 特徴量準備

In [None]:
# Prepare enhanced features for tilt adjustment prediction
print('\n=== チルト調整予測モデル準備（分類タスク）===\n')

if 'チルト調整' not in final_data.columns:
    print('ERROR: チルト調整 not found!')
    raise KeyError('チルト調整 column missing')

# Start with base features
X_tilt = final_data[feature_cols].copy()
y_tilt = final_data['チルト調整'].copy()

# Add frame number
X_tilt['枠番'] = final_data['艇番']

# Add player's past tilt adjustment tendency
player_tilt_tendency = final_data.groupby('登録番号')['チルト調整'].agg('mean').reset_index()
player_tilt_tendency.columns = ['登録番号', 'プレイヤーチルト平均']
X_tilt_temp = final_data[['登録番号']].copy()
X_tilt_temp = X_tilt_temp.merge(player_tilt_tendency, on='登録番号', how='left')
X_tilt['プレイヤーチルト傾向'] = X_tilt_temp['プレイヤーチルト平均']

# Add stadium-level average tilt adjustment per frame
stadium_frame_tilt = final_data.groupby(['レース場', '艇番'])['チルト調整'].mean().reset_index()
stadium_frame_tilt.columns = ['レース場', '艇番', 'スタジアム枠別平均チルト']
X_tilt_temp = final_data[['レース場', '艇番']].copy()
X_tilt_temp = X_tilt_temp.merge(stadium_frame_tilt, on=['レース場', '艇番'], how='left')
X_tilt['スタジアム枠別平均'] = X_tilt_temp['スタジアム枠別平均チルト']

# Add performance interaction
X_tilt['全国勝率×枠'] = final_data['全国勝率'].fillna(0) * final_data['艇番']
X_tilt['当地勝率×枠'] = final_data['当地勝率'].fillna(0) * final_data['艇番']

# Convert to numeric
for col in X_tilt.columns:
    X_tilt[col] = pd.to_numeric(X_tilt[col], errors='coerce')

y_tilt = pd.to_numeric(y_tilt, errors='coerce')

# Round target to nearest 0.5 for classification
# Create categorical labels: -1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0, etc.
y_tilt_rounded = (y_tilt * 2).round() / 2  # Round to nearest 0.5

# Fill NaN
for col in X_tilt.columns:
    median_val = X_tilt[col].median()
    X_tilt[col].fillna(median_val if pd.notna(median_val) else 0, inplace=True)

# Remove rows with missing target
valid_tilt = y_tilt_rounded.notna()
X_tilt = X_tilt[valid_tilt].reset_index(drop=True)
y_tilt_rounded = y_tilt_rounded[valid_tilt].reset_index(drop=True)

print(f'Base features: {len(feature_cols)}')
print(f'Added features: 5 (枠番, プレイヤーチルト傾向, スタジアム枠別平均, 全国勝率×枠, 当地勝率×枠)')
print(f'Total features: {len(X_tilt.columns)}')
print(f'Samples: {len(X_tilt)}')
print(f'Target (チルト調整) classes (rounded to 0.5):')
for val in sorted(y_tilt_rounded.unique()):
    count = (y_tilt_rounded == val).sum()
    pct = count / len(y_tilt_rounded) * 100
    print(f'  {val:+.1f}: {count} ({pct:.1f}%)')

### 2. チルト調整予測モデル - レース場別訓練

In [None]:
# Train tilt adjustment models per stadium
print('\n=== チルト調整予測モデル訓練 ===\n')

tilt_adjustment_models = {}
results_summary_tilt = []

for stadium in stadiums:
    stadium_mask = final_data['レース場'] == stadium
    X_std = X_tilt[stadium_mask].reset_index(drop=True)
    y_std = y_tilt_rounded[stadium_mask].reset_index(drop=True)
    
    if len(X_std) < 100:
        print(f'Stadium {int(stadium):2d}: insufficient data ({len(X_std)} samples) - SKIP')
        continue
    
    X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    # Train GBClassifier for tilt adjustment
    gbc_tilt = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.08,
        max_depth=7,
        min_samples_split=10,
        min_samples_leaf=5,
        subsample=0.75,
        random_state=42
    )
    gbc_tilt.fit(X_train_s, y_train)
    
    y_pred = gbc_tilt.predict(X_test_s)
    acc = accuracy_score(y_test, y_pred)
    
    tilt_adjustment_models[stadium] = {'model': gbc_tilt, 'scaler': scaler, 'features': list(X_tilt.columns)}
    results_summary_tilt.append({'stadium': int(stadium), 'samples': len(X_std), 'accuracy': acc})
    
    status = '✓' if acc > 0.60 else '⚠' if acc > 0.45 else '✗'
    print(f'{status} Stadium {int(stadium):2d}: Accuracy={acc:.1%} ({len(X_std)} samples)')

if results_summary_tilt:
    results_tilt_df = pd.DataFrame(results_summary_tilt)
    print(f'\n=== Summary ===')
    print(f'Models trained: {len(results_tilt_df)}/{len(stadiums)}')
    print(f'Average Accuracy: {results_tilt_df["accuracy"].mean():.1%}')
    print(f'Min Accuracy: {results_tilt_df["accuracy"].min():.1%}')
    print(f'Max Accuracy: {results_tilt_df["accuracy"].max():.1%}')
else:
    print('ERROR: No tilt adjustment models trained!')

## モデル保存

In [11]:
# Save models
models_data = {
    'exhibition_time': exhibition_models,
    'course_entry': course_models,
    'start_timing': start_timing_models,
    'tilt_adjustment': tilt_adjustment_models
}

model_save_path = repo_root / 'models' / 'preview_models.pkl'
model_save_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_save_path, 'wb') as f:
    pickle.dump(models_data, f)

print(f'✓ Saved models to {model_save_path}')
print(f'  Exhibition time models: {len(exhibition_models)}')
print(f'  Course entry models: {len(course_models)}')
print(f'  Start timing models: {len(start_timing_models)}')
print(f'  Tilt adjustment models: {len(tilt_adjustment_models)}')

✓ Saved models to /Users/mahiguch/dev/boatrace/data/models/preview_models.pkl
  Exhibition time models: 24
  Course entry models: 24


## 2026年1月のテストデータで予測

### 1. テストデータ読み込み

In [12]:
# Load test data for 2026-01
test_data_list = []

year_test = '2026'
month_test = '01'
month_num = int(month_test)
year_num = int(year_test)

_, max_day = calendar.monthrange(year_num, month_num)

for day in range(1, max_day + 1):
    day_str = f'{day:02d}'
    prog_path = repo_root / 'data' / 'programs' / year_test / month_test / f'{day_str}.csv'
    
    if prog_path.exists():
        try:
            prog_test = pd.read_csv(prog_path)
            test_data_list.append((day_str, prog_test))
        except Exception as e:
            print(f'Error loading {year_test}-{month_test}-{day_str}: {e}')

print(f'✓ Loaded {len(test_data_list)} days of test data for 2026-01')

✓ Loaded 31 days of test data for 2026-01


### 2. テストデータの変形

In [15]:
# Reshape test programs and add environment info from previews
test_programs_list = []

for day, prog_test in test_data_list:
    prog_reshaped = reshape_programs(prog_test)
    
    if not prog_reshaped.empty:
        # Map stadium
        prog_reshaped['レース場'] = prog_reshaped['レース場'].apply(map_stadium_name_to_number)
        prog_reshaped = prog_reshaped[prog_reshaped['レース場'].notna()].reset_index(drop=True)
        
        # Load previews for this day to get environment info
        prev_path = repo_root / 'data' / 'previews' / '2026' / '01' / f'{day}.csv'
        if prev_path.exists():
            try:
                prev_test = pd.read_csv(prev_path)
                prev_reshaped = reshape_previews(prev_test)
                
                # Extract environment columns from previews
                environment_cols = ['レースコード', '風速(m)', '波の高さ(cm)', '気温(℃)', '水温(℃)']
                available_env = [c for c in environment_cols if c in prev_reshaped.columns]
                
                if available_env:
                    prev_env = prev_reshaped[available_env].drop_duplicates()
                    prog_reshaped = prog_reshaped.merge(prev_env, on='レースコード', how='left')
            except Exception as e:
                pass
        
        # Add date column
        prog_reshaped['日付'] = day
        
        if not prog_reshaped.empty:
            test_programs_list.append(prog_reshaped)

if test_programs_list:
    test_programs = pd.concat(test_programs_list, ignore_index=True)
    print(f'✓ Test programs reshaped: {test_programs.shape}')
else:
    print('✗ No test programs')

✓ Test programs reshaped: (31350, 37)


### 3. 特徴量準備と予測

In [None]:
# Load trained models and prepare test features with enhanced features
with open(model_save_path, 'rb') as f:
    models_data = pickle.load(f)

exhibition_models = models_data['exhibition_time']
course_models = models_data['course_entry']
start_timing_models = models_data['start_timing']
tilt_adjustment_models = models_data['tilt_adjustment']

print(f'✓ Loaded exhibition time models: {len(exhibition_models)}')
print(f'✓ Loaded course entry models: {len(course_models)}')
print(f'✓ Loaded start timing models: {len(start_timing_models)}')
print(f'✓ Loaded tilt adjustment models: {len(tilt_adjustment_models)}')

# Get expected feature lists from trained models
expected_course_features = course_models[list(course_models.keys())[0]]['features']
expected_start_features = start_timing_models[list(start_timing_models.keys())[0]]['features']
expected_tilt_features = tilt_adjustment_models[list(tilt_adjustment_models.keys())[0]]['features']

print(f'Expected course features: {len(expected_course_features)}')
print(f'Expected start timing features: {len(expected_start_features)}')
print(f'Expected tilt adjustment features: {len(expected_tilt_features)}')

# Prepare test features for start timing prediction
X_test_start = test_programs[feature_cols].copy()
X_test_start['枠番'] = test_programs['艇番']

# Add player's start timing tendency
player_start_tendency = final_data.groupby('登録番号')['スタート展示'].agg('mean').reset_index()
player_start_tendency.columns = ['登録番号', 'プレイヤースタート平均']
X_start_temp = test_programs[['登録番号']].copy()
X_start_temp = X_start_temp.merge(player_start_tendency, on='登録番号', how='left')
X_test_start['プレイヤースタート傾向'] = X_start_temp['プレイヤースタート平均']

# Add stadium-frame average start timing
stadium_frame_start = final_data.groupby(['レース場', '艇番'])['スタート展示'].mean().reset_index()
stadium_frame_start.columns = ['レース場', '艇番', 'スタジアム枠別平均スタート']
X_start_temp = test_programs[['レース場', '艇番']].copy()
X_start_temp = X_start_temp.merge(stadium_frame_start, on=['レース場', '艇番'], how='left')
X_test_start['スタジアム枠別平均'] = X_start_temp['スタジアム枠別平均スタート']

# Add age interactions
X_test_start['年齢×枠'] = test_programs['年齢'].fillna(0) * test_programs['艇番']
X_test_start['経験年数×枠'] = test_programs['経験年数'].fillna(0) * test_programs['艇番']

# Convert to numeric and fill NaN
for col in X_test_start.columns:
    X_test_start[col] = pd.to_numeric(X_test_start[col], errors='coerce')

for col in X_test_start.columns:
    if col in X_start.columns:
        median_val = X_start[col].median()
    else:
        median_val = np.nan
    
    if pd.isna(median_val):
        X_test_start[col].fillna(0, inplace=True)
    else:
        X_test_start[col].fillna(median_val, inplace=True)

# Prepare test features for tilt adjustment prediction
X_test_tilt = test_programs[feature_cols].copy()
X_test_tilt['枠番'] = test_programs['艇番']

# Add player's tilt tendency
player_tilt_tendency = final_data.groupby('登録番号')['チルト調整'].agg('mean').reset_index()
player_tilt_tendency.columns = ['登録番号', 'プレイヤーチルト平均']
X_tilt_temp = test_programs[['登録番号']].copy()
X_tilt_temp = X_tilt_temp.merge(player_tilt_tendency, on='登録番号', how='left')
X_test_tilt['プレイヤーチルト傾向'] = X_tilt_temp['プレイヤーチルト平均']

# Add stadium-frame average tilt
stadium_frame_tilt = final_data.groupby(['レース場', '艇番'])['チルト調整'].mean().reset_index()
stadium_frame_tilt.columns = ['レース場', '艇番', 'スタジアム枠別平均チルト']
X_tilt_temp = test_programs[['レース場', '艇番']].copy()
X_tilt_temp = X_tilt_temp.merge(stadium_frame_tilt, on=['レース場', '艇番'], how='left')
X_test_tilt['スタジアム枠別平均'] = X_tilt_temp['スタジアム枠別平均チルト']

# Add performance interactions
X_test_tilt['全国勝率×枠'] = test_programs['全国勝率'].fillna(0) * test_programs['艇番']
X_test_tilt['当地勝率×枠'] = test_programs['当地勝率'].fillna(0) * test_programs['艇番']

# Convert to numeric and fill NaN
for col in X_test_tilt.columns:
    X_test_tilt[col] = pd.to_numeric(X_test_tilt[col], errors='coerce')

for col in X_test_tilt.columns:
    if col in X_tilt.columns:
        median_val = X_tilt[col].median()
    else:
        median_val = np.nan
    
    if pd.isna(median_val):
        X_test_tilt[col].fillna(0, inplace=True)
    else:
        X_test_tilt[col].fillna(median_val, inplace=True)

# Prepare for exhibition time and course prediction (use original features)
X_test_exhibition = test_programs[feature_cols].copy()
for col in X_test_exhibition.columns:
    X_test_exhibition[col] = pd.to_numeric(X_test_exhibition[col], errors='coerce')

for col in X_test_exhibition.columns:
    if col in X.columns:
        median_val = X[col].median()
    else:
        median_val = np.nan
    
    if pd.isna(median_val):
        X_test_exhibition[col].fillna(0, inplace=True)
    else:
        X_test_exhibition[col].fillna(median_val, inplace=True)

# Prepare for course prediction (uses enhanced features)
X_test_course = test_programs[feature_cols].copy()
X_test_course['枠番'] = test_programs['艇番']

# Add player's course entry tendency
player_course_tendency = final_data.groupby('登録番号')['コース'].agg(lambda x: x.mode()[0] if len(x.mode()) > 0 else x.median()).reset_index()
player_course_tendency.columns = ['登録番号', 'プレイヤー進入傾向コース']
X_course_temp = test_programs[['登録番号']].copy()
X_course_temp = X_course_temp.merge(player_course_tendency, on='登録番号', how='left')
X_test_course['プレイヤー進入傾向'] = X_course_temp['プレイヤー進入傾向コース']

# Add stadium-frame average course
stadium_frame_course = final_data.groupby(['レース場', '艇番'])['コース'].mean().reset_index()
stadium_frame_course.columns = ['レース場', '艇番', 'スタジアム枠別平均コース']
X_course_temp = test_programs[['レース場', '艇番']].copy()
X_course_temp = X_course_temp.merge(stadium_frame_course, on=['レース場', '艇番'], how='left')
X_test_course['スタジアム枠別平均'] = X_course_temp['スタジアム枠別平均コース']

# Add player win rate interactions
X_test_course['全国勝率×枠'] = test_programs['全国勝率'].fillna(0) * test_programs['艇番']
X_test_course['当地勝率×枠'] = test_programs['当地勝率'].fillna(0) * test_programs['艇番']

# Convert to numeric and fill NaN for course
for col in X_test_course.columns:
    X_test_course[col] = pd.to_numeric(X_test_course[col], errors='coerce')

for col in X_test_course.columns:
    if col in X_course.columns:
        median_val = X_course[col].median()
    else:
        median_val = np.nan
    
    if pd.isna(median_val):
        X_test_course[col].fillna(0, inplace=True)
    else:
        X_test_course[col].fillna(median_val, inplace=True)

print(f'✓ Test features prepared for exhibition time: {X_test_exhibition.shape}')
print(f'✓ Test features prepared for course entry: {X_test_course.shape}')
print(f'✓ Test features prepared for start timing: {X_test_start.shape}')
print(f'✓ Test features prepared for tilt adjustment: {X_test_tilt.shape}')

### 4. 展示タイム予測

In [17]:
# Predict exhibition times
exhibition_predictions = []
success_count = 0
error_count = 0

for idx, row in test_programs.iterrows():
    stadium = row['レース場']
    
    if stadium not in exhibition_models:
        exhibition_predictions.append(np.nan)
        continue
    
    model_info = exhibition_models[stadium]
    model = model_info['model']
    scaler = model_info['scaler']
    
    try:
        X_row = X_test_exhibition.iloc[idx:idx+1]
        X_scaled = scaler.transform(X_row)
        pred = model.predict(X_scaled)[0]
        
        # Clamp to reasonable range (e.g., 5.0 to 8.0 seconds)
        pred = max(5.0, min(8.0, pred))
        
        exhibition_predictions.append(pred)
        success_count += 1
    except Exception as e:
        exhibition_predictions.append(np.nan)
        error_count += 1

test_programs['展示タイム'] = exhibition_predictions

print(f'✓ Exhibition time predictions: {success_count} successful, {error_count} errors')
print(f'  Valid predictions: {test_programs["展示タイム"].notna().sum()}/{len(test_programs)}')
if test_programs["展示タイム"].notna().sum() > 0:
    print(f'  Mean: {test_programs["展示タイム"].mean():.3f}s, Std: {test_programs["展示タイム"].std():.3f}s')

✓ Exhibition time predictions: 31350 successful, 0 errors
  Valid predictions: 31350/31350
  Mean: 6.794s, Std: 0.050s


### 5. 進入コース予測

In [18]:
# Predict course entries
course_predictions = []
success_count = 0
error_count = 0

for idx, row in test_programs.iterrows():
    stadium = row['レース場']
    
    if stadium not in course_models:
        course_predictions.append(np.nan)
        continue
    
    model_info = course_models[stadium]
    model = model_info['model']
    scaler = model_info['scaler']
    
    try:
        X_row = X_test_course.iloc[idx:idx+1]
        X_scaled = scaler.transform(X_row)
        pred = model.predict(X_scaled)[0]
        
        # Ensure it's in valid range (1-6)
        pred = max(1, min(6, int(pred)))
        
        course_predictions.append(pred)
        success_count += 1
    except Exception as e:
        course_predictions.append(np.nan)
        error_count += 1

test_programs['コース'] = course_predictions

print(f'✓ Course entry predictions: {success_count} successful, {error_count} errors')
print(f'  Valid predictions: {test_programs["コース"].notna().sum()}/{len(test_programs)}')
if test_programs["コース"].notna().sum() > 0:
    print(f'  Distribution: {test_programs["コース"].value_counts().sort_index().to_dict()}')

✓ Course entry predictions: 31350 successful, 0 errors
  Valid predictions: 31350/31350
  Distribution: {1: 6189, 2: 5031, 3: 5088, 4: 5398, 5: 5372, 6: 4272}


### 6. スタート展示予測

In [None]:
# Predict start timings
start_timing_predictions = []
success_count = 0
error_count = 0

for idx, row in test_programs.iterrows():
    stadium = row['レース場']
    
    if stadium not in start_timing_models:
        start_timing_predictions.append(np.nan)
        continue
    
    model_info = start_timing_models[stadium]
    model = model_info['model']
    scaler = model_info['scaler']
    
    try:
        X_row = X_test_start.iloc[idx:idx+1]
        X_scaled = scaler.transform(X_row)
        pred = model.predict(X_scaled)[0]
        
        # Clamp to reasonable range (e.g., -0.5 to 1.0 seconds)
        pred = max(-0.5, min(1.0, pred))
        
        start_timing_predictions.append(pred)
        success_count += 1
    except Exception as e:
        start_timing_predictions.append(np.nan)
        error_count += 1

test_programs['スタート展示'] = start_timing_predictions

print(f'✓ Start timing predictions: {success_count} successful, {error_count} errors')
print(f'  Valid predictions: {test_programs["スタート展示"].notna().sum()}/{len(test_programs)}')
if test_programs["スタート展示"].notna().sum() > 0:
    print(f'  Mean: {test_programs["スタート展示"].mean():.3f}s, Std: {test_programs["スタート展示"].std():.3f}s')

### 7. チルト調整予測

In [None]:
# Predict tilt adjustments
tilt_adjustment_predictions = []
success_count = 0
error_count = 0

for idx, row in test_programs.iterrows():
    stadium = row['レース場']
    
    if stadium not in tilt_adjustment_models:
        tilt_adjustment_predictions.append(np.nan)
        continue
    
    model_info = tilt_adjustment_models[stadium]
    model = model_info['model']
    scaler = model_info['scaler']
    
    try:
        X_row = X_test_tilt.iloc[idx:idx+1]
        X_scaled = scaler.transform(X_row)
        pred = model.predict(X_scaled)[0]
        
        # Pred is already rounded to nearest 0.5 during training
        # Clamp to reasonable range (e.g., -1.0 to 2.5)
        pred = max(-1.0, min(2.5, float(pred)))
        
        tilt_adjustment_predictions.append(pred)
        success_count += 1
    except Exception as e:
        tilt_adjustment_predictions.append(np.nan)
        error_count += 1

test_programs['チルト調整'] = tilt_adjustment_predictions

print(f'✓ Tilt adjustment predictions: {success_count} successful, {error_count} errors')
print(f'  Valid predictions: {test_programs["チルト調整"].notna().sum()}/{len(test_programs)}')
if test_programs["チルト調整"].notna().sum() > 0:
    print(f'  Distribution:')
    dist = test_programs["チルト調整"][test_programs["チルト調整"].notna()].round(1).value_counts().sort_index()
    for val, count in dist.items():
        print(f'    {val:+.1f}: {count}')

## 予測Previews形式での出力

In [19]:
# Prepare predicted previews output
# Output format: one row per race, with columns for each boat

output_data = {}

for race_code in test_programs['レースコード'].unique():
    race_programs = test_programs[test_programs['レースコード'] == race_code]
    
    if len(race_programs) == 0:
        continue
    
    # Get race-level info from first boat
    first_row = race_programs.iloc[0]
    race_info = {
        'レースコード': race_code,
        'レース日': first_row['レース日'],
        'レース場': int(first_row['レース場']),
        'レース回': first_row['レース回']
    }
    
    # Add boat-specific predictions for all 4 models
    for _, row in race_programs.iterrows():
        boat_num = int(row['艇番'])
        race_info[f'艇{boat_num}_展示タイム'] = row['展示タイム']
        race_info[f'艇{boat_num}_コース'] = row['コース']
        race_info[f'艇{boat_num}_スタート展示'] = row['スタート展示']
        race_info[f'艇{boat_num}_チルト調整'] = row['チルト調整']
    
    output_data[race_code] = race_info

# Convert to DataFrame
output_df = pd.DataFrame(list(output_data.values()))

# Sort columns: race info first, then by boat and prediction type
cols_first = ['レースコード', 'レース日', 'レース場', 'レース回']
cols_other = sorted([c for c in output_df.columns if c not in cols_first])
output_df = output_df[cols_first + cols_other]

print(f'✓ Prepared output: {output_df.shape}')
print(f'  Sample:')
print(output_df.head(3))

✓ Prepared output: (5128, 16)
  Sample:
         レースコード        レース日  レース場 レース回  艇1_コース  艇1_展示タイム  艇2_コース  艇2_展示タイム  \
0  202601012301  2026-01-01    23   1R       6  6.864569       3  6.810665   
1  202601012302  2026-01-01    23   2R       4  6.855010       3  6.870603   
2  202601012303  2026-01-01    23   3R       2  6.884915       2  6.904967   

   艇3_コース  艇3_展示タイム  艇4_コース  艇4_展示タイム  艇5_コース  艇5_展示タイム  艇6_コース  艇6_展示タイム  
0       6  6.811533       5  6.934221       5  6.890248       5  6.859225  
1       6  6.887283       6  6.853288       5  6.874995       2  6.861927  
2       1  6.791930       5  6.854584       2  6.859013       1  6.857827  


## CSV出力（日別）

In [20]:
# Save predictions by date
output_dir = repo_root / 'data' / 'prediction-preview' / '2026' / '01'
output_dir.mkdir(parents=True, exist_ok=True)

# Group by date
for date_str in test_programs['レース日'].unique():
    if pd.isna(date_str):
        continue
    
    date_programs = test_programs[test_programs['レース日'] == date_str]
    
    # Create date key from race code
    race_codes = date_programs['レースコード'].unique()
    if len(race_codes) > 0:
        first_race_code = str(race_codes[0])
        if len(first_race_code) >= 8:
            year = first_race_code[:4]
            month = first_race_code[4:6]
            day = first_race_code[6:8]
            
            # Filter output for this date
            date_output = output_df[output_df['レース日'] == date_str]
            
            # Save
            output_path = output_dir / f'{day}.csv'
            date_output.to_csv(output_path, index=False)
            print(f'✓ Saved {output_path} ({len(date_output)} races)')

print('\nAll done!')

✓ Saved /Users/mahiguch/dev/boatrace/data/data/prediction-preview/2026/01/01.csv (154 races)
✓ Saved /Users/mahiguch/dev/boatrace/data/data/prediction-preview/2026/01/02.csv (189 races)
✓ Saved /Users/mahiguch/dev/boatrace/data/data/prediction-preview/2026/01/03.csv (213 races)
✓ Saved /Users/mahiguch/dev/boatrace/data/data/prediction-preview/2026/01/04.csv (228 races)
✓ Saved /Users/mahiguch/dev/boatrace/data/data/prediction-preview/2026/01/05.csv (180 races)
✓ Saved /Users/mahiguch/dev/boatrace/data/data/prediction-preview/2026/01/06.csv (180 races)
✓ Saved /Users/mahiguch/dev/boatrace/data/data/prediction-preview/2026/01/07.csv (180 races)
✓ Saved /Users/mahiguch/dev/boatrace/data/data/prediction-preview/2026/01/08.csv (165 races)
✓ Saved /Users/mahiguch/dev/boatrace/data/data/prediction-preview/2026/01/09.csv (144 races)
✓ Saved /Users/mahiguch/dev/boatrace/data/data/prediction-preview/2026/01/10.csv (144 races)
✓ Saved /Users/mahiguch/dev/boatrace/data/data/prediction-preview/2026