# データ加工以降の流れはこのファイルで完了する

# インポート

In [1]:
import gc

import pandas as pd
from pathlib import Path
import create_population
import preprocessing
from feature_producing import FeatureCreator
%load_ext autoreload

In [8]:
# After editing one of the imported modules, run this cell to reload it so the changes take effect.
%autoreload

# 学習母集団の作成

In [2]:
# Create the training population for the period 2008-01-01 through 2024-12-31.
# NOTE(review): judging by the argument names, this reads the race-info and results CSVs and
# writes population.csv — confirm against create_population.create.
population = create_population.create(
    from_="2008-01-01",
    to_="2024-12-31",
    race_info_filename="race_info08-24.csv",
    results_filename="results08-24.csv",
    output_filename="population.csv",
)

# データ前処理

## レース結果テーブル

In [3]:
# Preprocess the race-results table.
# NOTE(review): input and output share the same file name; presumably they live in different
# directories — confirm that the raw file is not overwritten.
results_preprocessed = preprocessing.process_results(
    input_filename="results08-24.csv", output_filename="results08-24.csv"
)
# Sanity check: every (race_id, horse_id) pair must appear at most once.
assert not results_preprocessed.duplicated(subset=["race_id", "horse_id"]).any()
# Number of missing values per column (shown as the cell output).
results_preprocessed.isna().sum()

race_id        0
horse_id       0
jockey_id      0
trainer_id     0
rank           0
umaban         0
wakuban        0
tansho_odds    0
popularity     0
impost         0
sex            0
age            0
weight         1
weight_diff    0
n_horses       0
time           0
time_rank      0
dtype: int64

In [4]:
# Rows per year, taking the first four characters of race_id as the year.
# The results table holds one row per (race_id, horse_id) pair, so this counts
# race entries rather than distinct races.
(
    results_preprocessed["race_id"]
    .astype(str)
    .str.slice(stop=4)
    .value_counts()
    .sort_index()
)

race_id
2008    49872
2009    50032
2010    49568
2011    48726
2012    49472
2013    49638
2014    49877
2015    49609
2016    49697
2017    48922
2018    48185
2019    47118
2020    47876
2021    47476
2022    46840
2023    47273
2024    46643
Name: count, dtype: int64

In [5]:
# Free memory: drop the reference to the results table and run a garbage-collection pass.
del results_preprocessed
gc.collect()

340

## 馬の過去成績テーブル

In [9]:
# Preprocess the table of each horse's past results.
# `place` has many missing values because venues of regional (non-central) racing are not mapped.
# NOTE(review): input and output share the same file name; presumably they live in different
# directories — confirm that the raw file is not overwritten.
horse_results_preprocessed = preprocessing.process_horse_results(
    input_filename="horse_results08-24.csv", output_filename="horse_results08-24.csv"
)
# Sanity check: every (horse_id, date) pair must appear at most once.
assert not horse_results_preprocessed.duplicated(subset=["horse_id", "date"]).any()
# Number of missing values per column (shown as the cell output).
horse_results_preprocessed.isna().sum()

horse_id              0
date                  0
rank                  0
prize                 0
rank_diff          2358
weather            1919
race_type            59
course_len           59
ground_state         72
race_class       966820
n_horses            522
time                942
win                   0
rentai                0
show                  0
place           1150512
dtype: int64

In [10]:
# Free memory: drop the reference to the horse-results table and run a garbage-collection pass.
del horse_results_preprocessed
gc.collect()

182

## レース情報テーブル

In [11]:
# Preprocess the race-information table.
# NOTE(review): input and output share the same file name; presumably they live in different
# directories — confirm that the raw file is not overwritten.
race_info_preprocessed = preprocessing.process_race_info(
    input_filename="race_info08-24.csv", output_filename="race_info08-24.csv"
)
# Sanity check: every race_id must appear at most once.
assert not race_info_preprocessed.duplicated(subset=["race_id"]).any()
# Number of missing values per column (shown as the cell output).
race_info_preprocessed.isna().sum()

race_id            0
date               0
race_type          0
around          2164
course_len       358
weather            0
ground_state       0
race_class         0
place              0
month              0
sin_date           0
cos_date           0
dtype: int64

In [12]:
# Races per year, taking the first four characters of race_id as the year
# (race_id is checked to be unique in this table, so each row is one race).
(
    race_info_preprocessed["race_id"]
    .astype(str)
    .str.slice(stop=4)
    .value_counts()
    .sort_index()
)

race_id
2008    3452
2009    3453
2010    3454
2011    3453
2012    3454
2013    3454
2014    3451
2015    3454
2016    3454
2017    3455
2018    3453
2019    3452
2020    3456
2021    3456
2022    3456
2023    3456
2024    3442
Name: count, dtype: int64

In [13]:
# Free memory: drop the reference to the race-info table and run a garbage-collection pass.
del race_info_preprocessed
gc.collect()

136

## 騎手リーディングテーブル

In [25]:
# Preprocess the jockey leading (ranking) table.
jockey_leading_preprocessed = preprocessing.process_jockey_leading()
# Duplicate check — currently disabled.
# NOTE(review): confirm whether (jockey_id, year) is expected to be unique, then re-enable or remove.
# assert jockey_leading_preprocessed.duplicated(subset=["jockey_id", "year"]).sum() == 0
# Number of missing values per column (shown as the cell output).
jockey_leading_preprocessed.isna().sum()

jockey_id             0
year                  0
rank                  0
n_races               0
n_races_graded        0
winrate_graded     1038
n_races_special       0
winrate_special     234
n_races_ordinal       0
winrate_ordinal     314
n_races_turf          0
winrate_turf        292
n_races_dirt          0
winrate_dirt        433
winrate               0
placerate             0
showrate              0
prize                 0
dtype: int64

In [26]:
# Number of rows per year, ordered by year.
jockey_leading_preprocessed["year"].value_counts().sort_index()

year
2007    247
2008    247
2009    242
2010    248
2011    223
2012    154
2013    180
2014    144
2015    184
2016    137
2017    153
2018    147
2019    144
2020    185
2021    155
2022    196
2023    198
2024    200
Name: count, dtype: int64

In [27]:
# Free memory: drop the reference to the jockey leading table and run a garbage-collection pass.
del jockey_leading_preprocessed
gc.collect()

19

## 調教師リーディングテーブル

In [28]:
# Preprocess the trainer leading (ranking) table.
trainer_leading_preprocessed = preprocessing.process_trainer_leading()
# Duplicate check — currently disabled.
# NOTE(review): confirm whether (trainer_id, year) is expected to be unique, then re-enable or remove.
# assert trainer_leading_preprocessed.duplicated(subset=["trainer_id", "year"]).sum() == 0
# Number of missing values per column (shown as the cell output).
trainer_leading_preprocessed.isna().sum()

trainer_id           0
year                 0
rank                 0
n_races              0
n_races_graded       0
winrate_graded     938
n_races_special      0
winrate_special    375
n_races_ordinal      0
winrate_ordinal    580
n_races_turf         0
winrate_turf       157
n_races_dirt         0
winrate_dirt       645
winrate              0
placerate            0
showrate             0
prize                0
dtype: int64

In [29]:
# Number of rows per year, ordered by year.
# NOTE(review): the output saved with this notebook lists no rows for 2015 and 2016 —
# confirm that the source data for those years is complete.
trainer_leading_preprocessed["year"].value_counts().sort_index()

year
2007    330
2008    283
2009    319
2010    275
2011    308
2012    231
2013    252
2014    250
2017    250
2018    152
2019    231
2020    229
2021    227
2022    230
2023    234
2024    227
Name: count, dtype: int64

In [30]:
# Free memory: drop the reference to the trainer leading table and run a garbage-collection pass.
del trainer_leading_preprocessed
gc.collect()

19

## 種牡馬リーディングテーブル

In [31]:
# Preprocess the sire leading (ranking) table.
sire_leading_preprocessed = preprocessing.process_sire_leading()
# Duplicate check — currently disabled.
# NOTE(review): confirm whether (sire_id, year, race_type) is expected to be unique, then
# re-enable or remove.
# assert sire_leading_preprocessed.duplicated(
#     subset=["sire_id", "year", "race_type"]
# ).sum() == 0
# Number of missing values per column (shown as the cell output).
sire_leading_preprocessed.isna().sum()

category
page_id          0
sire_id          0
year             0
race_type        0
n_races          0
n_wins           0
course_len       0
winrate       2180
dtype: int64

In [32]:
# Number of rows per year, ordered by year.
# NOTE(review): the output saved with this notebook shows only 14, 48 and 26 rows for
# 2014, 2015 and 2016 respectively — confirm that the source data for those years is complete.
sire_leading_preprocessed["year"].value_counts().sort_index()

year
2007    1000
2008     900
2009    1000
2010    1000
2011    1000
2012     976
2013     800
2014      14
2015      48
2016      26
2017     788
2018     800
2019     800
2020     894
2021     774
2022     802
2023     888
2024    1000
Name: count, dtype: int64

In [33]:
# Free memory: drop the reference to the sire leading table and run a garbage-collection pass.
del sire_leading_preprocessed
gc.collect()

54

## 血統テーブル

In [34]:
# Preprocess the pedigree table.
# NOTE(review): input and output share the same file name; presumably they live in different
# directories — confirm that the raw file is not overwritten.
peds_preprocessed = preprocessing.process_peds(
    input_filename="peds08-24.csv", output_filename="peds08-24.csv"
)
# Duplicate check — currently disabled.
# NOTE(review): confirm whether horse_id is expected to be unique, then re-enable or remove.
# assert peds_preprocessed.duplicated(subset=["horse_id"]).sum() == 0
# Number of missing values per column (shown as the cell output).
peds_preprocessed.isna().sum()

horse_id    0
sire_id     0
bms_id      0
dtype: int64

In [35]:
# Free memory: drop the reference to the pedigree table and run a garbage-collection pass.
del peds_preprocessed
gc.collect()

0

## 払い戻しテーブル

In [None]:
# The payout (return) table is not used by the training process; it is needed when simulating
# how to bet on the top-n horses.
# NOTE(review): the output saved with this cell is
# "AttributeError: Can only use .str accessor with string values!", so the cell currently fails —
# presumably inside process_return_tables; fix the preprocessing or guard this cell.
return_tables_preprocessed = preprocessing.process_return_tables()
# Number of missing values per column.
return_tables_preprocessed.isnull().sum()

AttributeError: Can only use .str accessor with string values!

# 特徴量作成

In [3]:
# Reload the imported modules so that recent edits are picked up before building features.
%autoreload

In [2]:
# Configure the feature builder with the results, race-info, horse-results and pedigree
# file names, plus the name of the output file.
fc = FeatureCreator(
    results_filename="results08-24.csv",
    race_info_filename="race_info08-24.csv",
    horse_results_filename="horse_results08-24.csv",
    peds_filename="peds08-24.csv",
    output_filename="features08-24.csv",
)

# Build the feature table.
features = fc.create_features()
# Sanity check: every (race_id, horse_id) pair must appear at most once.
assert not features.duplicated(subset=["race_id", "horse_id"]).any()
# Display the resulting table.
features

Processing population chunks:   0%|          | 0/84 [00:00<?, ?it/s]

running create_baselog()...


agg_horse_n_races_relative:   0%|          | 0/5 [00:00<?, ?it/s]

running agg_interval()...
running agg_jockey()...
running agg_trainer()...


agg_horse_per_course_len:   0%|          | 0/6 [00:00<?, ?it/s]

agg_horse_per_ground_state_race_type:   0%|          | 0/6 [00:00<?, ?it/s]

agg_horse_per_race_class:   0%|          | 0/6 [00:00<?, ?it/s]

: 

In [11]:
# Load the saved, tab-separated feature table in 10,000-row chunks and concatenate the
# chunks into a single DataFrame.
# NOTE(review): this reads features12-24.csv, whereas the FeatureCreator cell in this notebook
# is configured with output_filename="features08-24.csv" — confirm which file is intended.
features_path = Path("..", "data", "02_features", "features12-24.csv")
# The context manager closes the underlying file even if concatenation fails part-way.
with pd.read_csv(features_path, sep="\t", chunksize=10000) as chunks:
    features = pd.concat(chunks)
features

Unnamed: 0,race_id,date,horse_id,jockey_id,trainer_id,rank,umaban,wakuban,tansho_odds,popularity,...,sire_course_len_diff_relative,date_horse,interval,wakuban_race_type,umaban_race_type,wakuban_around,umaban_around,month_sex,sin_date_sex,cos_date_sex
0,201801010101,2018-07-28,2016104880,5339,386,1,1,1,1.1,1,...,,,,-1.0,-1.0,,,7.0,0.561693,0.101175
1,201801010101,2018-07-28,2016101840,5203,1148,2,6,6,7.6,2,...,,,,-6.0,-6.0,,,-7.0,-0.561693,-0.101175
2,201801010101,2018-07-28,2016105057,1127,1132,3,3,3,8.3,3,...,,,,-3.0,-3.0,,,-7.0,-0.561693,-0.101175
3,201801010101,2018-07-28,2016103985,1085,434,4,4,4,46.7,5,...,,,,-4.0,-4.0,,,-7.0,-0.561693,-0.101175
4,201801010101,2018-07-28,2016103821,1170,417,5,5,5,39.8,4,...,,,,-5.0,-5.0,,,-7.0,-0.561693,-0.101175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628105,201409050812,2014-12-28,2007102676,1018,1071,12,12,6,22.8,8,...,,2014-11-15,43.0,6.0,12.0,,,-12.0,-0.944121,-1.998438
628106,201409050812,2014-12-28,2010101256,5383,427,13,8,4,7.5,4,...,,2014-11-16,42.0,4.0,8.0,,,-12.0,-0.944121,-1.998438
628107,201409050812,2014-12-28,2008101935,1093,378,14,5,3,49.9,12,...,,2014-11-23,35.0,3.0,5.0,,,-12.0,-0.944121,-1.998438
628108,201409050812,2014-12-28,2009100970,1037,1032,15,14,7,7.3,3,...,,2014-11-30,28.0,7.0,14.0,,,-12.0,-0.944121,-1.998438


In [None]:
# For the key features, check how many values are missing in each year.
# The result holds counts of missing rows (isnull + sum), not ratios.
features["year"] = pd.to_datetime(features["date"]).dt.year
key_feature_columns = [
    "place",
    "weight",
    "race_class",
    "rank_3races",
    "rank_1000races",
    "jockey_rank_relative",
    "trainer_rank_relative",
    "time_mean_10races_per_course_len_relative",
    "sire_n_races_relative",
    "rank_diff_mean_2races_relative",
    "rank_diff_mean_3races_relative",
    "rank_diff_mean_1000races_relative",
]
# Flag missing cells first, then total the flags within each year.
features[key_feature_columns].isnull().groupby(features["year"]).sum()

Unnamed: 0_level_0,place,weight,race_class,rank_3races,rank_1000races,jockey_rank_relative,trainer_rank_relative,time_mean_10races_per_course_len_relative,sire_n_races_relative,rank_diff_mean_2races_relative,rank_diff_mean_3races_relative,rank_diff_mean_1000races_relative
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2012,0,0,0,4572,4572,4903,773,19632,49472,4582,4580,4579
2013,0,1,0,4478,4478,17892,15241,19276,49638,4490,4485,4483
2014,0,0,0,4522,4522,3554,1200,19477,49877,4531,4528,4525
2015,0,0,0,4446,4446,46992,1253,19072,2786,4458,4454,4453
2016,0,0,0,4524,4524,3327,49697,19398,3050,4534,4530,4526
2017,0,0,0,4538,4538,32321,48922,19004,12115,4548,4546,4543
2018,0,0,0,8766,8766,18464,1159,20891,34703,8803,8796,8796
2019,0,0,0,8400,8400,4023,13846,21265,3494,8406,8402,8400
2020,0,0,0,6556,6556,5487,690,20538,3373,6559,6558,6557
2021,0,0,0,5413,5413,3076,1444,19387,2965,5418,5416,5416


: 

In [4]:
# Free memory: drop the reference to the feature table and run a garbage-collection pass.
del features
gc.collect()

0

続いて、以下のnotebookを実行する
- オフライン検証：`offline_evaluation.ipynb`
- 本番運用時のモデル学習：`train.ipynb`
- 本番運用時の予測：`prediction.ipynb`