# インポート

In [3]:
import gc

import pandas as pd

import create_population
import preprocessing
from feature_producing import FeatureCreator
%load_ext autoreload

In [32]:
# If a module has been edited, run this cell to reload it and pick up the changes
%autoreload

# 学習母集団の作成

In [4]:
# Build the training population: call create_population.create with the
# 2019-01-01 .. 2024-12-31 date arguments and the file name "population.csv".
population = create_population.create(
    from_="2019-01-01",
    to_="2024-12-31",
    output_filename="population.csv",
)

# データ前処理

## レース結果テーブル

In [5]:
# Preprocess the race-results table (file name passed on: results_19-24.csv)
results_preprocessed = preprocessing.process_results(
    output_filename="results_19-24.csv"
)
# Key uniqueness: a (race_id, horse_id) pair must never appear twice
assert not results_preprocessed.duplicated(subset=["race_id", "horse_id"]).any()
# Missing values per column (displayed as the cell output)
results_preprocessed.isna().sum()

race_id        0
horse_id       0
jockey_id      0
trainer_id     0
owner_id       0
rank           0
umaban         0
wakuban        0
tansyo_odds    0
popularity     0
impost         0
sex            0
age            0
weight         0
weight_diff    0
dtype: int64

In [6]:
# Free memory: drop the reference to the table, then force a garbage-collection pass
del results_preprocessed
gc.collect()

26

## 馬の過去成績テーブル

In [7]:
# `place` has many missing values because the venues of local (regional) races
# are not mapped.
horse_results_preprocessed = preprocessing.process_horse_results(
    output_filename="horse_results_19-24.csv"
)
# Key uniqueness: a (horse_id, date) pair must never appear twice
assert not horse_results_preprocessed.duplicated(subset=["horse_id", "date"]).any()
# Missing values per column (displayed as the cell output)
horse_results_preprocessed.isna().sum()

horse_id             0
date                 0
rank                 0
prize                0
rank_diff          558
weather            485
race_type            0
course_len           0
ground_state         5
race_class      307630
n_horses             1
time               337
win                  0
rentai               0
show                 0
place           331666
dtype: int64

In [8]:
# Free memory: drop the reference to the table, then force a garbage-collection pass
del horse_results_preprocessed
gc.collect()

226

## レース情報テーブル

In [33]:
# Preprocess the race-info table (file name passed on: race_info_19-24.csv)
race_info_preprocessed = preprocessing.process_race_info(
    output_filename="race_info_19-24.csv"
)
# Key uniqueness: every race_id must appear exactly once
assert not race_info_preprocessed.duplicated(subset=["race_id"]).any()
# Missing values per column (displayed as the cell output)
race_info_preprocessed.isna().sum()

race_id            0
date               0
race_type          0
around           883
course_len       158
weather            0
ground_state    4485
race_class       588
place              0
month              0
sin_date           0
cos_date           0
dtype: int64

In [34]:
# Number of races per year: the first four characters of race_id are used as the year
(
    race_info_preprocessed["race_id"]
    .astype(str)
    .str.slice(stop=4)
    .value_counts()
    .sort_index()
)

race_id
2019    3452
2020    3456
2021    3456
2022    3456
2023    3456
2024    3403
Name: count, dtype: int64

In [35]:
# Free memory: drop the reference to the table, then force a garbage-collection pass
del race_info_preprocessed
gc.collect()

126

## 騎手リーディングテーブル

In [36]:
# Preprocess the jockey-leading table
jockey_leading_preprocessed = preprocessing.process_jockey_leading()
# NOTE(review): the (jockey_id, year) uniqueness check is disabled and the reason
# is not recorded here — confirm whether duplicates are expected before re-enabling.
# assert jockey_leading_preprocessed.duplicated(subset=["jockey_id", "year"]).sum() == 0
# Missing values per column (displayed as the cell output)
jockey_leading_preprocessed.isna().sum()

jockey_id            0
year                 0
rank                 0
n_races              0
n_races_graded       0
winrate_graded     416
n_races_special      0
winrate_special     75
n_races_ordinal      0
winrate_ordinal    152
n_races_turf         0
winrate_turf       140
n_races_dirt         0
winrate_dirt       180
winrate              0
placerate            0
showrate             0
prize                0
dtype: int64

In [37]:
# Number of rows per year, ordered by year
jockey_leading_preprocessed["year"].value_counts().sort_index()

year
2017    153
2018    197
2019    194
2020    185
2021    155
2022    196
2023    198
2024    200
Name: count, dtype: int64

In [38]:
# Free memory: drop the reference to the table, then force a garbage-collection pass
del jockey_leading_preprocessed
gc.collect()

19

## 調教師リーディングテーブル

In [39]:
# Preprocess the trainer-leading table
trainer_leading_preprocessed = preprocessing.process_trainer_leading()
# NOTE(review): the (trainer_id, year) uniqueness check is disabled and the reason
# is not recorded here — confirm whether duplicates are expected before re-enabling.
# assert trainer_leading_preprocessed.duplicated(subset=["trainer_id", "year"]).sum() == 0
# Missing values per column (displayed as the cell output)
trainer_leading_preprocessed.isna().sum()

trainer_id           0
year                 0
rank                 0
n_races              0
n_races_graded       0
winrate_graded     372
n_races_special      0
winrate_special    130
n_races_ordinal      0
winrate_ordinal    287
n_races_turf         0
winrate_turf        46
n_races_dirt         0
winrate_dirt       287
winrate              0
placerate            0
showrate             0
prize                0
dtype: int64

In [40]:
# Number of rows per year, ordered by year
# NOTE(review): the saved output lists 2014 but no 2015-2016 — verify the source data.
trainer_leading_preprocessed["year"].value_counts().sort_index()

year
2014    250
2017    250
2018    202
2019    231
2020    229
2021    227
2022    230
2023    234
2024    227
Name: count, dtype: int64

In [41]:
# Free memory: drop the reference to the table, then force a garbage-collection pass
del trainer_leading_preprocessed
gc.collect()

19

## 種牡馬リーディングテーブル

In [42]:
# Preprocess the sire-leading table
sire_leading_preprocessed = preprocessing.process_sire_leading()
# NOTE(review): the (sire_id, year, race_type) uniqueness check is disabled and the
# reason is not recorded here — confirm whether duplicates are expected before re-enabling.
# assert sire_leading_preprocessed.duplicated(
#     subset=["sire_id", "year", "race_type"]
# ).sum() == 0
# Missing values per column (displayed as the cell output)
sire_leading_preprocessed.isna().sum()

category
page_id          0
sire_id          0
year             0
race_type        0
n_races          0
n_wins           0
course_len       0
winrate       1274
dtype: int64

In [43]:
# Number of rows per year, ordered by year
sire_leading_preprocessed["year"].value_counts().sort_index()

year
2017     688
2018     800
2019     800
2020     894
2021     800
2022     874
2023     888
2024    1000
Name: count, dtype: int64

In [44]:
# Free memory: drop the reference to the table, then force a garbage-collection pass
del sire_leading_preprocessed
gc.collect()

54

## 血統テーブル

In [47]:
# Preprocess the pedigree table
peds_preprocessed = preprocessing.process_peds()
# NOTE(review): the horse_id uniqueness check is disabled and the reason is not
# recorded here — confirm whether duplicates are expected before re-enabling.
# assert peds_preprocessed.duplicated(subset=["horse_id"]).sum() == 0
# Missing values per column (displayed as the cell output)
peds_preprocessed.isna().sum()

horse_id    0
sire_id     0
bms_id      0
dtype: int64

In [49]:
# Free memory: drop the reference to the table, then force a garbage-collection pass
del peds_preprocessed
gc.collect()

1851

## 払い戻しテーブル

In [33]:
# Preprocess the payout (return) tables.
# NOTE(review): the saved output of this cell is
# "AttributeError: Can only use .str accessor with string values!" — presumably
# raised inside preprocessing.process_return_tables; confirm and fix it there.
return_tables_preprocessed = preprocessing.process_return_tables()
# Missing values per column (displayed as the cell output)
return_tables_preprocessed.isnull().sum()

AttributeError: Can only use .str accessor with string values!

# 特徴量作成

In [50]:
# Instantiate FeatureCreator with its default arguments; the following cells
# inspect the tables it exposes (fc.peds, fc.sire_leading).
fc = FeatureCreator()

In [None]:
# The sire_id formats differ, so inspect sire_id in peds and in sire_leading
fc.peds

Unnamed: 0,horse_id,sire_id,bms_id
0,2006106754,1999106689,000a000013
1,2007100107,1999107004,1989109110
2,2007100828,000a00fa34,1977103827
3,2007106312,1990109700,000a00035e
4,2008100175,1999106756,000a001bda
...,...,...,...
35421,2021110155,000a01401e,000a010545
35422,2021110156,000a01316b,000a011248
35423,2021110157,000a015fd6,000a002309
35424,2021110160,000a013801,000a01056f


In [84]:
# Display sire_leading so that its sire_id values can be compared with those in fc.peds
fc.sire_leading

Unnamed: 0,page_id,sire_id,year,race_type,n_races,n_wins,course_len,winrate
0,2017_01,000a00fa34,2017,0,208.0,10.0,1165.0,0.048077
1,2017_01,000a00fa34,2017,1,124.0,9.0,1511.1,0.072581
2,2017_01,000a00fe9b,2017,0,339.0,22.0,1479.6,0.064897
3,2017_01,000a00fe9b,2017,1,56.0,2.0,1500.0,0.035714
4,2017_01,000a0103b9,2017,0,170.0,12.0,1400.0,0.070588
...,...,...,...,...,...,...,...,...
6191,2023_09,2013109168,2023,1,0.0,0.0,0.0,
6192,2023_09,2014105108,2023,0,2.0,0.0,0.0,0.000000
6193,2023_09,2014105108,2023,1,1.0,0.0,0.0,0.000000
6194,2023_09,2014106077,2023,0,3.0,0.0,0.0,0.000000


In [None]:
# Free memory: drop the reference to the FeatureCreator, then force a garbage-collection pass
del fc
gc.collect()

5946

In [7]:
# Build the feature table from the 2019-2024 preprocessed files
fc = FeatureCreator(
    results_filename="results_19-24.csv",
    race_info_filename="race_info_19-24.csv",
    horse_results_filename="horse_results_19-24.csv",
)
features = fc.create_features()
# Key uniqueness: a (race_id, horse_id) pair must never appear twice
assert not features.duplicated(subset=["race_id", "horse_id"]).any()

agg_horse_n_races_relative:   0%|          | 0/5 [00:00<?, ?it/s]

running agg_interval()...
running agg_jockey()...
running agg_trainer()...


agg_horse_per_course_len:   0%|          | 0/6 [00:00<?, ?it/s]

agg_horse_per_ground_state_race_type:   0%|          | 0/6 [00:00<?, ?it/s]

agg_horse_per_race_class:   0%|          | 0/6 [00:00<?, ?it/s]

agg_horse_per_race_type:   0%|          | 0/6 [00:00<?, ?it/s]

running agg_sire()...
running cross_feature()...
merging all features...


In [8]:
# Per-year count of missing values for the key features.
# The year is derived into a separate Series (named "year" so the result index
# keeps that label) instead of being written into `features`, so this
# diagnostic cell does not add a column to the feature table.
race_year = pd.to_datetime(features["date"]).dt.year.rename("year")
key_features = [
    "weight",
    "race_class",
    "rank_3races",
    "jockey_rank_relative",
    "trainer_rank_relative",
    "time_mean_10races_per_course_len_relative",
    "sire_n_races_relative",
]
# Summing the boolean null mask within each year gives the missing count per column.
features[key_features].isna().groupby(race_year).sum()

Unnamed: 0_level_0,weight,race_class,rank_3races,jockey_rank_relative,trainer_rank_relative,time_mean_10races_per_course_len_relative,sire_n_races_relative
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019,0,7798,16152,4023,13846,27723,3494
2020,0,0,10573,1662,690,23907,3373
2021,0,0,6845,3076,1444,20559,2965
2022,0,0,5175,3050,807,18961,2921
2023,0,0,4708,2286,651,18636,2755
2024,0,0,14027,3092,1391,22667,2926


In [9]:
# Display the impost column for a visual spot check
features["impost"]

0         54.0
1         54.0
2         51.0
3         51.0
4         54.0
          ... 
282705    58.0
282706    54.0
282707    53.0
282708    50.0
282709    53.0
Name: impost, Length: 282710, dtype: float64

In [10]:
# Free memory: drop the reference to the feature table, then force a garbage-collection pass
del features
gc.collect()

0

続いて、以下のnotebookを実行する
- オフライン検証：`offline_evaluation.ipynb`
- 本番運用時のモデル学習：`train.ipynb`
- 本番運用時の予測：`prediction.ipynb`