# インポート

In [36]:
# import numpy as np
# import lightgbm as lgb
import pandas as pd
from pathlib import Path
import create_population
import preprocessing
from feature_producing import FeatureCreator
from training import Trainer
from evaluationing import Evaluator
%load_ext autoreload


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
%autoreload

# 学習母集団の作成
    ・検証母集団:2023年の全レース
    ・学習母集団:以下を比較
        ・2022年の1年間
        ・2020~2022年の3年間
        ・2018~2022年の5年間

In [9]:
population = create_population.create(
    from_ ="2018-01-01",
    to_ = "2023-12-31",
    output_filename="population_5year.csv"
)

In [None]:
# 各年の数を確認
population["date"].dt.year.value_counts().sort_index()

date
2018    48604
2019    47574
2020    48282
2021    47821
2022    47220
2023    47672
Name: count, dtype: int64

In [67]:
population["race_id"]

0        202301010101
1        202301010101
2        202301010101
3        202301010101
4        202301010101
             ...     
47667    202310030812
47668    202310030812
47669    202310030812
47670    202310030812
47671    202310030812
Name: race_id, Length: 47672, dtype: int64

In [68]:
population

Unnamed: 0,race_id,date,horse_id
0,202301010101,2023-07-22,2021101429
1,202301010101,2023-07-22,2021105872
2,202301010101,2023-07-22,2021106854
3,202301010101,2023-07-22,2021105553
4,202301010101,2023-07-22,2021100648
...,...,...,...
47667,202310030812,2023-09-03,2020105644
47668,202310030812,2023-09-03,2018106584
47669,202310030812,2023-09-03,2020101781
47670,202310030812,2023-09-03,2019106647


# データ加工

In [14]:
import preprocessing
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
%autoreload

In [None]:
# レース結果テーブルの前処理

In [70]:
results = pd.read_csv(RAWDF_DIR / "results.csv", sep = "\t")
results["race_id"]

0        202301010101
1        202301010101
2        202301010101
3        202301010101
4        202301010101
             ...     
47746    202205010305
47747    202205010305
47748    202205010305
47749    202205010305
47750    202205010305
Name: race_id, Length: 47751, dtype: int64

In [48]:
# Ensure the data types are correctly set
results_preprocessd = results_preprocessd.astype({
	'race_id': 'int64',
	'horse_id': 'int64',
	'jockey_id': 'int64',
	'trainer_id': 'int64',
	'owner_id': 'int64',
	'rank': 'int32',
	'umaban': 'int32',
	'wakuban': 'int32',
	'tansyo_odds': 'float64',
	'popularity': 'int32',
	'kinryou': 'float64',
	'sex': 'int64',
	'age': 'int32',
	'weight': 'int32',
	'weight_diff': 'int32'
})

In [58]:
COMMON_DATA_DIR = Path("..","..","common","data")
RAWDF_DIR = COMMON_DATA_DIR/"rawdf"
input_dir: Path = RAWDF_DIR
results = pd.read_csv(input_dir / "results.csv", sep="\t")
# df = pd.read_csv(input_dir / "results.csv", sep="\t").query("race_id in @population['race_id']")
results["race_id"]

0        202301010101
1        202301010101
2        202301010101
3        202301010101
4        202301010101
             ...     
47746    202205010305
47747    202205010305
47748    202205010305
47749    202205010305
47750    202205010305
Name: race_id, Length: 47751, dtype: int64

In [None]:
n_years_list = [1, 3, 5]

In [None]:
#レース結果テーブルの前処理
for n_years in n_years_list:
    results_preprocessed = preprocessing.process_results(
        population_filename=f"population_{n_years}year.csv", 
        output_filename=f"results_preprocessed_{n_years}year.csv",
)

In [13]:
# 重複チェック
results_preprocessd.duplicated(subset=["race_id", "horse_id"]).sum()

np.int64(0)

In [76]:
# 欠損チェック
results_preprocessd.isnull().sum()

race_id        0
horse_id       0
jockey_id      0
trainer_id     0
owner_id       0
rank           0
umaban         0
wakuban        0
tansyo_odds    0
popularity     0
kinryou        0
sex            0
age            0
weight         0
weight_diff    0
dtype: int64

In [None]:
# 馬の過去成績テーブルの前処理
n_years_list = [1, 3, 5]
for n in n_years_list:
    print(n)
    horse_results_process = preprocessing.process_horse_results(
        POPULATION_FILENAME=f"population_{n}year.csv",
        output_filename=f"horse_results_{n}year.csv"
    )


In [81]:
horse_results_preprocessd

Unnamed: 0,horse_id,date,rank,prize,rank_diff,weather,race_type,course_len,ground_condition,race_class,n_horses
0,2021101429,2024-11-09,9.0,0.0,0.2,0.0,1,1400,0.0,4.0,14
1,2021101429,2024-10-06,1.0,1140.0,0.0,1.0,1,1400,0.0,3.0,13
2,2021101429,2024-09-01,2.0,625.2,0.4,0.0,1,1400,0.0,5.0,7
3,2021101429,2024-08-10,2.0,629.2,0.0,0.0,1,1400,0.0,5.0,14
4,2021101429,2024-04-06,13.0,0.0,0.9,1.0,1,1600,1.0,7.0,16
...,...,...,...,...,...,...,...,...,...,...,...
213811,2019103076,2022-01-10,8.0,0.0,0.7,1.0,1,2000,0.0,1.0,17
213812,2019103076,2021-12-12,5.0,51.0,0.2,1.0,1,2000,0.0,1.0,17
213813,2019103076,2021-11-20,12.0,0.0,2.0,0.0,1,1200,0.0,1.0,16
213814,2019103076,2021-07-18,7.0,0.0,1.1,0.0,1,1200,0.0,1.0,12


In [82]:
horse_results_preprocessd.info()

<class 'pandas.core.frame.DataFrame'>
Index: 210604 entries, 0 to 213815
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   horse_id          210604 non-null  int64         
 1   date              210604 non-null  datetime64[ns]
 2   rank              210604 non-null  float64       
 3   prize             210604 non-null  float64       
 4   rank_diff         210344 non-null  float64       
 5   weather           210385 non-null  float64       
 6   race_type         210604 non-null  int64         
 7   course_len        210604 non-null  int32         
 8   ground_condition  210600 non-null  float64       
 9   race_class        150061 non-null  float64       
 10  n_horses          210604 non-null  int64         
dtypes: datetime64[ns](1), float64(6), int32(1), int64(3)
memory usage: 18.5 MB


In [83]:
# 欠損チェック
horse_results_preprocessd.isnull().sum()

horse_id                0
date                    0
rank                    0
prize                   0
rank_diff             260
weather               219
race_type               0
course_len              0
ground_condition        4
race_class          60543
n_horses                0
dtype: int64

In [10]:
%autoreload

In [27]:
n_years_list = [1, 3, 5]
for n in n_years_list:
    race_info_preprocess  = preprocessing.process_race_info(
        POPULATION_FILENAME=f"population_{n}year.csv",
        output_filename=f"race_info_{n}year.csv"
    )


In [28]:
race_info_preprocess["date"].dt.year.value_counts().sort_index()

date
2018    3453
2019    3452
2020    3456
2021    3456
2022    3456
2023    3456
Name: count, dtype: int64

# 特徴量作成

In [21]:
from feature_producing import FeatureCreator
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
%autoreload

In [13]:
n_years_list = [1, 3, 5]

In [24]:
for n in n_years_list:
    feature_creator = FeatureCreator(
        poplation_filename=f"population_{n}year.csv",
        results_filename=f"results_{n}year.csv",
        horse_results_filename=f"horse_results_{n}year.csv",
        race_info_filename=f"race_info_{n}year.csv",
        output_filename=f"features_{n}year.csv"
    )
    features = feature_creator.create_features()
        

In [17]:
fp = FeatureCreator()
features = fp.create_features()

In [25]:
# 重複チェック
features.duplicated(subset=["race_id", "horse_id"]).sum()

0

# 学習

In [34]:
%autoreload

In [94]:
trainer = Trainer()

In [35]:
for n in n_years_list:
    print(n)
    trainer = Trainer(features_filename=f"features_{n}year.csv",)
    evaluation_df = trainer.run(
        
        test_start_date="2023-01-01",
        model_filename=f"model_{n}year.pkl",
        evaluation_filename=f"evaluation_{n}year.csv"
    )


1
[100]	training's binary_logloss: 0.161505	valid_1's binary_logloss: 0.210218
3
[100]	training's binary_logloss: 0.189099	valid_1's binary_logloss: 0.206979
5
[100]	training's binary_logloss: 0.194671	valid_1's binary_logloss: 0.206691


# 精度評価

In [41]:
from evaluationing import Evaluator
import pandas as pd
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
%autoreload

In [None]:
evaluator = Evaluator()

In [None]:
# 1年間の学習結果の評価
evaluator = Evaluator(evaluation_filename="evaluation_1year.csv")
evals_1year = pd.concat([
    evaluator.summarize_box_top_n(n=1, exp_name="1year"),
    evaluator.summarize_box_top_n(n=2, exp_name="1year"),
    evaluator.summarize_box_top_n(n=3, exp_name="1year"),
]).set_index(["topn","bet_type"])
evals_1year

Unnamed: 0_level_0,Unnamed: 1_level_0,hitrate_pop,returnrate_pop,hitrate_1year,returnrate_1year
topn,bet_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,単勝,0.331597,0.795833,0.337095,0.825145
1,複勝,0.639468,0.843605,0.64265,0.856655
2,ワイド,0.307002,0.778096,0.300637,0.787095
2,単勝,0.528646,0.799552,0.532986,0.828545
2,複勝,0.845775,0.835446,0.849537,0.846499
2,馬単,0.150752,0.731091,0.146412,0.743417
2,馬連,0.150752,0.761227,0.146412,0.767882
3,ワイド,0.548611,0.792438,0.541377,0.80135
3,三連単,0.088542,0.658415,0.086806,0.666387
3,三連複,0.088542,0.778414,0.086806,0.777894


In [None]:
# 3年間の学習結果の評価
evaluator = Evaluator(evaluation_filename="evaluation_3year.csv")
evals_3year = pd.concat([
    evaluator.summarize_box_top_n(n=1, exp_name="3year"),
    evaluator.summarize_box_top_n(n=2, exp_name="3year"),
    evaluator.summarize_box_top_n(n=3, exp_name="3year"),
]).set_index(["topn","bet_type"])
evals_3year

Unnamed: 0_level_0,Unnamed: 1_level_0,hitrate_pop,returnrate_pop,hitrate_3year,returnrate_3year
topn,bet_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,単勝,0.331597,0.795833,0.328993,0.81441
1,複勝,0.639468,0.843605,0.636285,0.851447
2,ワイド,0.307002,0.778096,0.296296,0.797106
2,単勝,0.528646,0.799552,0.519965,0.802025
2,複勝,0.845775,0.835446,0.840856,0.838773
2,馬単,0.150752,0.731091,0.139178,0.727214
2,馬連,0.150752,0.761227,0.139178,0.755874
3,ワイド,0.548611,0.792438,0.541088,0.822917
3,三連単,0.088542,0.658415,0.08941,0.74484
3,三連複,0.088542,0.778414,0.08941,0.864381


In [None]:
# 5年間の学習結果の評価
evaluator = Evaluator(evaluation_filename="evaluation_5year.csv")
evals_5year = pd.concat([
    evaluator.summarize_box_top_n(n=1, exp_name="5year"),
    evaluator.summarize_box_top_n(n=2, exp_name="5year"),
    evaluator.summarize_box_top_n(n=3, exp_name="5year"),
]).set_index(["topn","bet_type"])
evals_5year

Unnamed: 0_level_0,Unnamed: 1_level_0,hitrate_pop,returnrate_pop,hitrate_5year,returnrate_5year
topn,bet_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,単勝,0.331597,0.795833,0.337095,0.825145
1,複勝,0.639468,0.843605,0.64265,0.856655
2,ワイド,0.307002,0.778096,0.300637,0.787095
2,単勝,0.528646,0.799552,0.532986,0.828545
2,複勝,0.845775,0.835446,0.849537,0.846499
2,馬単,0.150752,0.731091,0.146412,0.743417
2,馬連,0.150752,0.761227,0.146412,0.767882
3,ワイド,0.548611,0.792438,0.541377,0.80135
3,三連単,0.088542,0.658415,0.086806,0.666387
3,三連複,0.088542,0.778414,0.086806,0.777894


In [77]:
evals = pd.concat([evals_1year, evals_3year, evals_5year],axis=1).sort_index(axis=1)
evals

Unnamed: 0_level_0,Unnamed: 1_level_0,hitrate_1year,hitrate_3year,hitrate_5year,hitrate_pop,hitrate_pop,hitrate_pop,returnrate_1year,returnrate_3year,returnrate_5year,returnrate_pop,returnrate_pop,returnrate_pop
topn,bet_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,単勝,0.337095,0.328993,0.337095,0.331597,0.331597,0.331597,0.825145,0.81441,0.825145,0.795833,0.795833,0.795833
1,複勝,0.64265,0.636285,0.64265,0.639468,0.639468,0.639468,0.856655,0.851447,0.856655,0.843605,0.843605,0.843605
2,ワイド,0.300637,0.296296,0.300637,0.307002,0.307002,0.307002,0.787095,0.797106,0.787095,0.778096,0.778096,0.778096
2,単勝,0.532986,0.519965,0.532986,0.528646,0.528646,0.528646,0.828545,0.802025,0.828545,0.799552,0.799552,0.799552
2,複勝,0.849537,0.840856,0.849537,0.845775,0.845775,0.845775,0.846499,0.838773,0.846499,0.835446,0.835446,0.835446
2,馬単,0.146412,0.139178,0.146412,0.150752,0.150752,0.150752,0.743417,0.727214,0.743417,0.731091,0.731091,0.731091
2,馬連,0.146412,0.139178,0.146412,0.150752,0.150752,0.150752,0.767882,0.755874,0.767882,0.761227,0.761227,0.761227
3,ワイド,0.541377,0.541088,0.541377,0.548611,0.548611,0.548611,0.80135,0.822917,0.80135,0.792438,0.792438,0.792438
3,三連単,0.086806,0.08941,0.086806,0.088542,0.088542,0.088542,0.666387,0.74484,0.666387,0.658415,0.658415,0.658415
3,三連複,0.086806,0.08941,0.086806,0.088542,0.088542,0.088542,0.777894,0.864381,0.777894,0.778414,0.778414,0.778414
