# 1: 事前準備

## 1.1: インポート

In [2]:
from tqdm import tqdm
import sys
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import datetime
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from itertools import combinations
from itertools import permutations
from urllib.request import urlopen

## 1.2: pickleファイルの読み込み

In [3]:
# Load the combined ("overall") tables: race results, per-horse past results,
# pedigree data and payout tables, plus the distinct horse ids that appear in
# the race results.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading; these
# files are presumably generated locally by the scrapers below — confirm they
# never come from an untrusted source.
race_results=pd.read_pickle('pickle/overall/race_results.pickle')
horse_results=pd.read_pickle('pickle/overall/horse_results.pickle')
ped_results=pd.read_pickle('pickle/overall/ped_results.pickle')
return_tables=pd.read_pickle('pickle/overall/return_tables.pickle')
horse_id_list = race_results['horse_id'].unique()
# Load the 2021 tables.
race_results_2021 = pd.read_pickle('pickle/2021/race_results.pickle')
horse_results_2021=pd.read_pickle('pickle/2021/horse_results.pickle')
return_tables_2021=pd.read_pickle('pickle/2021/return_tables.pickle')
ped_results_2021=pd.read_pickle('pickle/2021/ped_results.pickle')
horse_id_list_2021 = race_results_2021['horse_id'].unique()
# Load the 2020 tables.
race_results_2020 = pd.read_pickle('pickle/2020/race_results.pickle')
horse_results_2020=pd.read_pickle('pickle/2020/horse_results.pickle')
return_tables_2020=pd.read_pickle('pickle/2020/return_tables.pickle')
ped_results_2020=pd.read_pickle('pickle/2020/ped_results.pickle')
horse_id_list_2020 = race_results_2020['horse_id'].unique()
# Load the 2019 tables.
race_results_2019=pd.read_pickle('pickle/2019/race_results.pickle')
horse_results_2019=pd.read_pickle('pickle/2019/horse_results.pickle')
return_tables_2019=pd.read_pickle('pickle/2019/return_tables.pickle')
ped_results_2019=pd.read_pickle('pickle/2019/ped_results.pickle')
horse_id_list_2019 = race_results_2019['horse_id'].unique()
# Load the 2018 tables.
race_results_2018=pd.read_pickle('pickle/2018/race_results.pickle')
horse_results_2018=pd.read_pickle('pickle/2018/horse_results.pickle')
return_tables_2018=pd.read_pickle('pickle/2018/return_tables.pickle')
ped_results_2018=pd.read_pickle('pickle/2018/ped_results.pickle')
horse_id_list_2018 = race_results_2018['horse_id'].unique()
# Load the 2017 tables.
race_results_2017=pd.read_pickle('pickle/2017/race_results.pickle')
horse_results_2017=pd.read_pickle('pickle/2017/horse_results.pickle')
return_tables_2017=pd.read_pickle('pickle/2017/return_tables.pickle')
ped_results_2017=pd.read_pickle('pickle/2017/ped_results.pickle')
horse_id_list_2017 = race_results_2017['horse_id'].unique()

## 1.3: 必要リスト

In [4]:
# Maps a racecourse name to the two-character venue id string.
place_dict = dict(zip(
    ('札幌', '函館', '福島', '新潟', '東京', '中山', '中京', '京都', '阪神', '小倉'),
    ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10'),
))

# Maps the abbreviated race-type prefix found in the horse-results distance
# column to the race-type label used in the race-results data.
race_type_dict = {
    '芝': '芝',
    'ダ': 'ダート',
    '障': '障害',
}

# 2: クラス定義

In [5]:
# 基底クラス
class DataProcessor:
    """Base class that carries a table through the feature-building stages.

    Attributes
    ----------
    data : pd.DataFrame
        Raw data.
    data_p : pd.DataFrame
        Data after preprocessing (filled in by subclasses).
    data_h : pd.DataFrame
        Data after ``merge_horse_results``.
    data_pe : pd.DataFrame
        Data after ``merge_ped_results``.
    data_c : pd.DataFrame
        Data after ``process_categorical``.
    no_peds : numpy.ndarray
        horse_id values for which no pedigree row was found the last time
        ``merge_ped_results`` ran (empty until then).
    """

    def __init__(self):
        self.data = pd.DataFrame()
        self.data_p = pd.DataFrame()
        self.data_h = pd.DataFrame()
        self.data_pe = pd.DataFrame()
        self.data_c = pd.DataFrame()
        # Initialised here so the attribute exists before merge_ped_results runs.
        self.no_peds = np.array([], dtype=object)

    def merge_horse_results(self, hr, n_samples_list=(5, 9, 'all')):
        """Append past-performance aggregates to ``data_p`` and store in ``data_h``.

        Parameters
        ----------
        hr : HorseResults
            Object exposing ``merge_all(results, n_samples=...)``.
        n_samples_list : iterable, default (5, 9, 'all')
            How many past races to aggregate over; ``merge_all`` is called
            once per entry.

        Raises
        ------
        KeyError
            If no ``latest`` column was produced by ``hr.merge_all``; the
            interval computation below requires it.
        """
        self.data_h = self.data_p.copy()
        for n_samples in n_samples_list:
            self.data_h = hr.merge_all(self.data_h, n_samples=n_samples)

        # Days between this race and the horse's most recent earlier race.
        self.data_h['interval'] = (self.data_h['date'] - self.data_h['latest']).dt.days
        self.data_h.drop(['開催', 'latest'], axis=1, inplace=True)

    def merge_ped_results(self, ped_results):
        """Left-join pedigree columns onto ``data_h`` and store in ``data_pe``.

        Parameters
        ----------
        ped_results : pd.DataFrame
            Pedigree table indexed by horse_id; must contain a ``peds_0``
            column, which is used to detect horses with no pedigree row.
        """
        self.data_pe = self.data_h.merge(
            ped_results, left_on='horse_id', right_index=True, how='left'
        )

        # Horses whose first pedigree column is null had no match in ped_results.
        missing = self.data_pe['peds_0'].isnull()
        self.no_peds = self.data_pe[missing]['horse_id'].unique()
        if len(self.no_peds) > 0:
            print('please scrape ped_results at horse_id_list "no_peds"')

    @staticmethod
    def _encode_ids(encoder, ids):
        """Extend ``encoder.classes_`` with unseen ids, then transform ``ids``.

        New ids are appended after the existing classes so previously assigned
        integer codes stay unchanged.
        """
        unseen = ids[~ids.isin(encoder.classes_)].dropna().unique()
        encoder.classes_ = np.concatenate([encoder.classes_, unseen])
        return encoder.transform(ids)

    def process_categorical(self, le_horse, le_jockey, results_m):
        """Encode categorical columns of ``data_pe`` and store in ``data_c``.

        Parameters
        ----------
        le_horse : sklearn.preprocessing.LabelEncoder
            Fitted encoder for horse_id; its ``classes_`` is extended in place
            with ids it has not seen.
        le_jockey : sklearn.preprocessing.LabelEncoder
            Fitted encoder for jockey_id; extended in place the same way.
        results_m : pd.DataFrame
            Reference table whose distinct ``weather``, ``race_type``,
            ``ground_state`` and ``性`` values fix the set of dummy columns,
            so different tables end up with identical columns.
        """
        df = self.data_pe.copy()

        # Label-encode the ids, then mark them as pandas categoricals.
        df['horse_id'] = self._encode_ids(le_horse, df['horse_id'])
        df['jockey_id'] = self._encode_ids(le_jockey, df['jockey_id'])
        df['horse_id'] = df['horse_id'].astype('category')
        df['jockey_id'] = df['jockey_id'].astype('category')

        # Declaring the categories up front makes get_dummies emit a column
        # for every reference value, including all-zero columns.
        dummy_columns = ['weather', 'race_type', 'ground_state', '性']
        for column in dummy_columns:
            df[column] = pd.Categorical(df[column], results_m[column].unique())
        df = pd.get_dummies(df, columns=dummy_columns)

        self.data_c = df

# レースに関するクラス
class RaceResults(DataProcessor):
    """Race-result table (rows indexed by race_id) and its preprocessing steps."""

    def __init__(self, race_results):
        super().__init__()
        self.data = race_results

    @classmethod
    def read_pickle(cls, path_list):
        """Build an instance from one or more pickled race-result tables.

        The first pickle is the base table; each later one is folded in with
        ``update_data``, which is defined outside this class.
        """
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)

    @staticmethod
    def scrape(race_id_list, pre_race_results=pd.DataFrame()):
        """Scrape race results from db.netkeiba.com.

        Parameters
        ----------
        race_id_list : list
            Race ids as strings.
        pre_race_results : pandas.DataFrame, optional
            Previously scraped results. Ids numerically less than or equal to
            its last index value are skipped, and the new rows are appended
            to it in the returned table.

        Returns
        -------
        pandas.DataFrame
            All race results concatenated. When nothing new was scraped, a
            copy of ``pre_race_results`` is returned instead of raising.
        """
        # One DataFrame per race, keyed by race_id.
        race_results = {}
        last_done = int(pre_race_results.index[-1]) if len(pre_race_results) else None
        for race_id in tqdm(race_id_list):
            if last_done is not None and int(race_id) <= last_done:
                continue
            try:
                time.sleep(1)
                url = "https://db.netkeiba.com/race/" + race_id
                # Main result table.
                df = pd.read_html(url)[0]
                html = requests.get(url)
                html.encoding = "EUC-JP"
                soup = BeautifulSoup(html.text, "html.parser")
                # Weather, race type, course length, ground state and date are
                # read from the first two <p> tags of the intro block.
                intro = soup.find("div", attrs={"class": "data_intro"}).find_all("p")
                texts = intro[0].text + intro[1].text
                for text in re.findall(r'\w+', texts):
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text] * len(df)
                    if "障" in text:
                        df["race_type"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[0])] * len(df)
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)
                    if "年" in text:
                        df["date"] = [text] * len(df)
                # Horse and jockey ids are taken from the links in the result table.
                result_table = soup.find("table", attrs={"summary": "レース結果"})
                df["horse_id"] = [
                    re.findall(r"\d+", a["href"])[0]
                    for a in result_table.find_all("a", attrs={"href": re.compile("^/horse")})
                ]
                df["jockey_id"] = [
                    re.findall(r"\d+", a["href"])[0]
                    for a in result_table.find_all("a", attrs={"href": re.compile("^/jockey")})
                ]
                # Index every row by its race_id.
                df.index = [race_id] * len(df)
                race_results[race_id] = df
            # An IndexError is treated as a race_id that does not exist: skip it.
            except IndexError:
                continue
            # Any other failure (e.g. a dropped connection) stops scraping but
            # keeps what has been collected so far.
            except Exception as e:
                print(e)
                break
            # Pressing the Jupyter stop button also keeps the partial results.
            except KeyboardInterrupt:
                break

        # pd.concat raises on an empty list, so handle "nothing new" explicitly.
        if not race_results:
            return pre_race_results.copy()
        race_results_df = pd.concat(list(race_results.values()))
        if len(pre_race_results.index):
            return pd.concat([pre_race_results, race_results_df])
        return race_results_df

    def preprocessing(self):
        """Clean ``data`` and store the result in ``data_p``."""
        df = self.data.copy()
        # Drop rows whose finishing position is not a number.
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)
        # Target: 1 for a top-three finish, else 0.
        df['rank'] = df['着順'].map(lambda x: 1 if x < 4 else 0)
        # Split the sex/age field: first character is sex, the rest is age.
        # Taking everything after the first character keeps two-digit ages intact.
        df['性'] = df['性齢'].map(lambda x: str(x)[0])
        df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int)
        # Split "weight(change)" into body weight and weight change.
        weight = df['馬体重'].str.split("(", expand=True)
        df['体重'] = weight[0].astype(int)
        df['体重変化'] = weight[1].str[:-1].astype(int)
        df['単勝'] = df['単勝'].astype(float)
        # Course length in units of 100 m (floored).
        df["course_len"] = df["course_len"].astype(float) // 100
        df['date'] = pd.to_datetime(df['date'], format='%Y年%m月%d日')
        # Venue code: characters 4-5 of the race_id.
        df['開催'] = df.index.map(lambda x: str(x)[4:6])
        # Number of rows sharing the race_id, i.e. runners left after the filter above.
        df['n_horses'] = df.index.map(df.index.value_counts())
        # Drop columns that are not used as features.
        df.drop(
            ['タイム', '着差', '調教師', '性齢', '馬体重', '馬名', '騎手', '人気', '着順'],
            axis='columns',
            inplace=True,
        )
        self.data_p = df

    def to_rank(self):
        """Replace ``data``'s 着順 column with ``rank`` (positions above 3 become 4).

        NOTE(review): compares 着順 with an int, so it presumably expects the
        column to be numeric already — confirm against callers.
        """
        df = self.data.copy()
        df['rank'] = df['着順'].map(lambda x: x if x < 4 else 4)
        df.drop(['着順'], axis='columns', inplace=True)
        self.data = df

    def process_categorical(self):
        """Fit id encoders on ``data_pe`` and build ``data_c`` from it."""
        self.le_horse = LabelEncoder().fit(self.data_pe['horse_id'])
        self.le_jockey = LabelEncoder().fit(self.data_pe['jockey_id'])
        super().process_categorical(self.le_horse, self.le_jockey, self.data_pe)
# 馬に関するクラス
class HorseResults:
    """Per-horse past race results and the aggregates derived from them."""

    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付', '着順', '賞金', '着差', '通過', '開催', '距離']]
        self.preprocessing()

    @classmethod
    def read_pickle(cls, path_list):
        """Build an instance from one or more pickled horse-result tables.

        The first pickle is the base table; each later one is folded in with
        ``update_data``, which is defined outside this class.
        """
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)

    @staticmethod
    def scrape(horse_id_list, pre_horse_results=pd.DataFrame()):
        """Scrape past results for each horse from db.netkeiba.com.

        Parameters
        ----------
        horse_id_list : list
            Horse ids as strings.
        pre_horse_results : pandas.DataFrame, optional
            Previously scraped results indexed by horse_id; horses already in
            its index are skipped and new rows are appended to it.

        Returns
        -------
        pandas.DataFrame
            All horse results concatenated. When nothing new was scraped, a
            copy of ``pre_horse_results`` is returned instead of raising.
        """
        # One DataFrame per horse, keyed by horse_id.
        horse_results = {}
        already_scraped = set(pre_horse_results.index.unique())

        for horse_id in tqdm(horse_id_list):
            if horse_id in already_scraped:
                continue
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                tables = pd.read_html(url)
                df = tables[3]
                # When the horse has an awards table, it takes this slot and
                # the results table is the next one.
                if df.columns[0] == '受賞歴':
                    df = tables[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
                time.sleep(1)
            except IndexError:
                continue
            # Any other failure stops scraping but keeps what was collected.
            except Exception as e:
                print(e)
                break
            # Interrupting the run also keeps the partial results.
            except KeyboardInterrupt:
                break

        # pd.concat raises on an empty list, so handle "nothing new" explicitly.
        if not horse_results:
            return pre_horse_results.copy()
        horse_results_df = pd.concat(list(horse_results.values()))
        if len(pre_horse_results):
            return pd.concat([pre_horse_results, horse_results_df])
        return horse_results_df

    def preprocessing(self):
        """Clean ``horse_results`` in place and set ``target_list``."""
        df = self.horse_results.copy()

        # Drop rows whose finishing position is not a number.
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df['date'] = pd.to_datetime(df['日付'])
        df.drop(['日付'], axis='columns', inplace=True)

        # Missing prize money counts as 0. Assigning the result back (rather
        # than an in-place fillna on the column) is what actually updates df.
        df['賞金'] = df['賞金'].fillna(0)

        # Negative margins are clamped to 0.
        df['着差'] = df['着差'].map(lambda x: 0 if x < 0 else x)

        # Corner-passage data: n=1 is the first recorded position, n=4 the last.
        def corner(x, n):
            if not isinstance(x, str):
                return x
            positions = re.findall(r'\d+', x)
            if n == 4:
                return int(positions[-1])
            if n == 1:
                return int(positions[0])

        df['first_corner'] = df['通過'].map(lambda x: corner(x, 1))
        df['final_corner'] = df['通過'].map(lambda x: corner(x, 4))

        df['final_to_rank'] = df['final_corner'] - df['着順']
        df['first_to_rank'] = df['first_corner'] - df['着順']
        df['first_to_final'] = df['first_corner'] - df['final_corner']

        # Venue: non-digit part mapped through place_dict; unknown venues become '11'.
        df['開催'] = df['開催'].str.extract(r'(\D+)')[0].map(place_dict).fillna('11')

        # Race type: non-digit prefix of the distance column.
        df['race_type'] = df['距離'].str.extract(r'(\D+)')[0].map(race_type_dict)

        # Distance in units of 100 m (floored).
        df['course_len'] = df['距離'].str.extract(r'(\d+)')[0].astype(int) // 100
        df.drop(['距離'], axis=1, inplace=True)

        df.index.name = 'horse_id'

        self.horse_results = df
        self.target_list = [
            '着順', '賞金', '着差', 'first_corner', 'final_corner',
            'first_to_rank', 'first_to_final', 'final_to_rank',
        ]

    def average(self, horse_id_list, date, n_samples='all'):
        """Compute per-horse means of ``target_list`` over races before ``date``.

        Results are stored in ``self.average_dict``: key ``'non_category'``
        holds the overall means; keys ``'course_len'``, ``'race_type'`` and
        ``'開催'`` hold means grouped additionally by that column. When
        ``n_samples == 5`` the latest earlier race date per horse is also
        stored in ``self.latest``.

        Parameters
        ----------
        horse_id_list : iterable
            Horse ids to aggregate; ids absent from ``horse_results`` simply
            produce no rows.
        date : datetime-like
            Only races strictly before this date are used.
        n_samples : int or 'all'
            Number of most recent races per horse to use.

        Raises
        ------
        ValueError
            If ``n_samples`` is a number that is not positive.
        """
        is_target = self.horse_results.index.isin(horse_id_list)
        past_df = self.horse_results[is_target]
        past_df = past_df[past_df['date'] < date]

        if n_samples == 'all':
            filtered_df = past_df
        elif n_samples > 0:
            # Keep only the n_samples most recent earlier races of each horse.
            filtered_df = (
                past_df.sort_values('date', ascending=False)
                .groupby(level=0)
                .head(n_samples)
            )
        else:
            raise ValueError('n_sample must be "all" or plus number')

        self.average_dict = {}
        self.average_dict['non_category'] = (
            filtered_df.groupby(level=0)[self.target_list]
            .mean()
            .add_suffix('_{}R'.format(n_samples))
        )
        for column in ['course_len', 'race_type', '開催']:
            self.average_dict[column] = (
                filtered_df.groupby(['horse_id', column])[self.target_list]
                .mean()
                .add_suffix('_{}_{}R'.format(column, n_samples))
            )

        # Latest earlier race date per horse, used for the race-interval feature.
        if n_samples == 5:
            self.latest = filtered_df.groupby('horse_id')['date'].max().rename('latest')

    def merge(self, results, date, n_samples='all'):
        """Attach the aggregates from ``average`` to the rows of ``results`` on ``date``."""
        df = results[results['date'] == date]
        horse_id_list = df['horse_id']
        self.average(horse_id_list, date, n_samples)
        merged_df = df.merge(
            self.average_dict['non_category'],
            left_on='horse_id',
            right_index=True,
            how='left',
        )
        for column in ['course_len', 'race_type', '開催']:
            merged_df = merged_df.merge(
                self.average_dict[column],
                left_on=['horse_id', column],
                right_index=True,
                how='left',
            )
        # The 'latest' column is only attached on the n_samples == 5 pass.
        if n_samples == 5:
            merged_df = merged_df.merge(
                self.latest, left_on='horse_id', right_index=True, how='left'
            )
        return merged_df

    def merge_all(self, results, n_samples='all'):
        """Run ``merge`` for every distinct date in ``results`` and concatenate."""
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df
# 血統データを処理するクラス
class Peds:
    """Pedigree table (one row per horse) and its label-encoded form."""

    def __init__(self, ped_results):
        self.ped_results = ped_results
        # Filled by encode(): label-encoded columns cast to category dtype.
        self.ped_results_e = pd.DataFrame()

    @classmethod
    def read_pickle(cls, path_list):
        """Build an instance from one or more pickled pedigree tables.

        The first pickle is the base table; each later one is folded in with
        ``update_data``, which is defined outside this class.
        """
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)

    @staticmethod
    def _flatten_pedigree(table, horse_id):
        """Flatten a five-column pedigree table into one Series named ``horse_id``.

        Columns 0..4 hold one generation each, with ancestors repeated down
        the rows. Working from the last column backwards, each column is
        recorded and then removed, and the remaining rows are de-duplicated
        so the repeats collapse before the next column is read. The
        de-duplication has to happen inside the loop; otherwise every
        generation keeps all repeated rows.
        """
        remaining = table
        generations = {}
        for gen in reversed(range(5)):
            generations[gen] = remaining[gen]
            if gen:
                remaining = remaining.drop(columns=[gen]).drop_duplicates()
        ped = pd.concat([generations[gen] for gen in range(5)]).rename(horse_id)
        return ped.reset_index(drop=True)

    @staticmethod
    def scrape(horse_id_list, pre_ped_results=pd.DataFrame()):
        """Scrape pedigree data for each horse from db.netkeiba.com.

        Parameters
        ----------
        horse_id_list : list
            Horse ids as strings.
        pre_ped_results : pandas.DataFrame, optional
            Previously scraped pedigrees indexed by horse_id; horses already
            in its index are skipped and new rows are appended to it.

        Returns
        -------
        pandas.DataFrame
            One row per horse with columns prefixed ``peds_``. When nothing
            new was scraped, a copy of ``pre_ped_results`` is returned
            instead of raising.
        """
        peds_dict = {}
        already_scraped = set(pre_ped_results.index.unique())
        for horse_id in tqdm(horse_id_list):
            if horse_id in already_scraped:
                continue
            try:
                url = "https://db.netkeiba.com/horse/ped/" + horse_id
                df = pd.read_html(url)[0]
                peds_dict[horse_id] = Peds._flatten_pedigree(df, horse_id)
                time.sleep(1)
            except IndexError:
                continue
            # Any other failure stops scraping but keeps what was collected.
            except Exception as e:
                print(e)
                break
            # Interrupting the run also keeps the partial results.
            except KeyboardInterrupt:
                break

        # pd.concat raises on an empty list, so handle "nothing new" explicitly.
        if not peds_dict:
            return pre_ped_results.copy()

        # One row per horse; columns become peds_0, peds_1, ...
        peds_df = pd.concat(list(peds_dict.values()), axis=1).T.add_prefix('peds_')

        if len(pre_ped_results):
            return pd.concat([pre_ped_results, peds_df])
        return peds_df

    def encode(self):
        """Label-encode every column (missing values as 'Na') into ``ped_results_e``."""
        df = self.ped_results.copy()
        for column in df.columns:
            df[column] = LabelEncoder().fit_transform(df[column].fillna('Na'))
        self.ped_results_e = df.astype('category')

class Return:
    """Payout tables (rows indexed by race_id) with one accessor per bet type.

    Column 0 of ``return_tables`` holds the bet-type label, column 1 the
    winning horse numbers and column 2 the payouts. Cells that held several
    entries separated by ``<br />`` in the page have them joined by the
    literal text ``br`` (see ``scrape``).
    """

    def __init__(self, return_tables):
        self.return_tables = return_tables

    @classmethod
    def read_pickle(cls, path_list):
        """Build an instance from one or more pickled payout tables.

        The first pickle is the base table; each later one is folded in with
        ``update_data``, which is defined outside this class.
        """
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)

    @staticmethod
    def scrape(race_id_list, pre_race_tables=pd.DataFrame()):
        """Scrape payout tables from db.netkeiba.com.

        Parameters
        ----------
        race_id_list : list
            Race ids as strings.
        pre_race_tables : pandas.DataFrame, optional
            Previously scraped tables indexed by race_id; races already in
            its index are skipped and new rows are appended to it.

        Returns
        -------
        pandas.DataFrame
            All payout tables concatenated. When nothing new was scraped, a
            copy of ``pre_race_tables`` is returned instead of raising.
        """
        from io import BytesIO

        # One DataFrame per race, keyed by race_id.
        race_tables = {}
        already_scraped = set(pre_race_tables.index.unique())
        for race_id in tqdm(race_id_list):
            if race_id in already_scraped:
                continue
            try:
                time.sleep(1)
                url = "https://db.netkeiba.com/race/" + race_id
                # Close the connection as soon as the body has been read.
                with urlopen(url) as f:
                    html = f.read()
                # Replace line breaks with a text marker so multi-entry cells
                # can be split again later.
                html = html.replace(b'<br />', b'br')
                dfs = pd.read_html(BytesIO(html))
                dfs[1].index = [race_id] * len(dfs[1])
                dfs[2].index = [race_id] * len(dfs[2])
                race_tables[race_id] = pd.concat([dfs[1], dfs[2]])
            except IndexError:
                continue
            # Any other failure (e.g. a dropped connection) stops scraping but
            # keeps what has been collected so far.
            except Exception as e:
                print(e)
                break
            # Pressing the Jupyter stop button also keeps the partial results.
            except KeyboardInterrupt:
                break

        # pd.concat raises on an empty list, so handle "nothing new" explicitly.
        if not race_tables:
            return pre_race_tables.copy()
        race_tables_df = pd.concat(list(race_tables.values()))
        if len(pre_race_tables):
            return pd.concat([pre_race_tables, race_tables_df])
        return race_tables_df

    @staticmethod
    def _to_number(values):
        """Convert text such as '1,230' to numbers; unparsable entries become NaN."""
        cleaned = values.astype(str).str.replace(',', '', regex=False).str.strip()
        return pd.to_numeric(cleaned, errors='coerce')

    def _rows(self, bet_type):
        """Return columns 1 and 2 of the rows whose column 0 equals ``bet_type``."""
        return self.return_tables[self.return_tables[0] == bet_type][[1, 2]]

    def _single_payout(self, bet_type, separator, n_horses):
        """Build the table for a bet type that has one payout per race.

        The winning-numbers cell is split on ``separator`` into ``n_horses``
        ``win_*`` columns, and every column is converted to numbers.
        """
        rows = self._rows(bet_type)
        wins = rows[1].str.split(separator, expand=True)[list(range(n_horses))]
        wins = wins.add_prefix('win_')
        return_ = rows[2].rename('return')
        df = pd.concat([wins, return_], axis=1)
        return df.apply(self._to_number)

    @property
    def fukusho(self):
        """複勝 rows: ``win_0..2`` and ``return_0..2`` as ints (missing entries are 0)."""
        rows = self._rows('複勝')
        # reindex keeps three columns even when no cell held three entries.
        wins = rows[1].str.split('br', expand=True).reindex(columns=range(3))
        wins.columns = ['win_0', 'win_1', 'win_2']
        returns = rows[2].str.split('br', expand=True).reindex(columns=range(3))
        returns.columns = ['return_0', 'return_1', 'return_2']
        df = pd.concat([wins, returns], axis=1).apply(self._to_number)
        return df.fillna(0).astype(int)

    @property
    def tansho(self):
        """単勝 rows as numeric ``win`` and ``return`` columns."""
        tansho = self._rows('単勝').apply(self._to_number)
        tansho.columns = ['win', 'return']
        return tansho

    @property
    def umaren(self):
        """馬連 rows: ``win_0``, ``win_1`` and ``return``."""
        return self._single_payout('馬連', '-', 2)

    @property
    def umatan(self):
        """馬単 rows: ``win_0``, ``win_1`` and ``return``."""
        return self._single_payout('馬単', '→', 2)

    @property
    def wide(self):
        """ワイド rows: one row per winning pair with ``win_0``, ``win_1``, ``return``."""
        rows = self._rows('ワイド')
        wins = rows[1].str.split('br', expand=True)[[0, 1, 2]]
        # stack() turns the three pairs of a race into three rows.
        wins = wins.stack().str.split('-', expand=True).add_prefix('win_')
        return_ = rows[2].str.split('br', expand=True)[[0, 1, 2]]
        return_ = return_.stack().rename('return')
        df = pd.concat([wins, return_], axis=1)
        return df.apply(self._to_number)

    @property
    def sanrentan(self):
        """三連単 rows: ``win_0..2`` and ``return``."""
        return self._single_payout('三連単', '→', 3)

    @property
    def sanrenpuku(self):
        """三連複 rows: ``win_0..2`` and ``return``."""
        return self._single_payout('三連複', '-', 3)
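# Original note: "Automatically opens Chrome and runs the search. Apparently
# the JavaScript-rendered data cannot be extracted unless the page is actually
# opened in Chrome. This is the Google Colab way of doing it (normally it can
# be done more simply)."
# NOTE(review): the class below fetches pages with pd.read_html / requests and
# does not drive a browser, so this note presumably refers to an earlier
# implementation — confirm and remove if stale.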

class ShutubaTable(DataProcessor):
    """Race-card (entry list) table for upcoming races, indexed by race_id."""

    def __init__(self, shutuba_tables):
        super().__init__()
        self.data = shutuba_tables

    @classmethod
    def scrape(cls, race_id_list, date):
        """Scrape race cards from race.netkeiba.com and wrap them in an instance.

        Parameters
        ----------
        race_id_list : list
            Race ids as strings.
        date : str or datetime-like
            Race date written to every row's ``date`` column.

        Returns
        -------
        ShutubaTable
            Instance whose ``data`` holds all scraped rows (an empty
            DataFrame when ``race_id_list`` is empty).
        """
        frames = []
        for race_id in tqdm(race_id_list):
            url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
            df = pd.read_html(url)[0]
            # Drop the first level of the column header.
            df = df.T.reset_index(level=0, drop=True).T
            html = requests.get(url)
            html.encoding = 'EUC-JP'
            soup = BeautifulSoup(html.text, 'html.parser')
            texts = soup.find('div', attrs={'class': 'RaceData01'}).text
            for text in re.findall(r'\w+', texts):
                if 'm' in text:
                    df['course_len'] = [int(re.findall(r'\d+', text)[0])] * len(df)
                if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                    df["weather"] = [text] * len(df)
                if text in ["良", "稍重", "重"]:
                    df["ground_state"] = [text] * len(df)
                if '不' in text:
                    df["ground_state"] = ['不良'] * len(df)
                if '稍' in text:
                    df["ground_state"] = ['稍重'] * len(df)
                if '芝' in text:
                    df['race_type'] = ['芝'] * len(df)
                if '障' in text:
                    df['race_type'] = ['障害'] * len(df)
                if 'ダ' in text:
                    df['race_type'] = ['ダート'] * len(df)
            df['date'] = [date] * len(df)
            # horse_id: first number in the link of each HorseInfo cell.
            df['horse_id'] = [
                re.findall(r'\d+', td.find('a')['href'])[0]
                for td in soup.find_all("td", attrs={'class': 'HorseInfo'})
            ]
            # jockey_id: first number in the link of each Jockey cell.
            df['jockey_id'] = [
                re.findall(r'\d+', td.find('a')['href'])[0]
                for td in soup.find_all("td", attrs={'class': 'Jockey'})
            ]
            df.index = [race_id] * len(df)
            frames.append(df)
            time.sleep(1)
        # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
        data = pd.concat(frames) if frames else pd.DataFrame()
        return cls(data)

    def preprocessing(self):
        """Clean ``data`` and store the feature columns in ``data_p``."""
        df = self.data.copy()
        # Split the sex/age field: first character is sex, the rest is age.
        # Taking everything after the first character keeps two-digit ages intact.
        df['性'] = df['性齢'].map(lambda x: str(x)[0])
        df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int)
        # Drop rows whose weight is the '--' placeholder; copy so the
        # assignments below act on an independent frame.
        df = df[df['馬体重(増減)'] != '--'].copy()
        # Split "weight(change)" into body weight and weight change.
        weight = df['馬体重(増減)'].str.split("(", expand=True)
        df['体重'] = weight[0].astype(int)
        # Coerce directly so a non-numeric change becomes NaN instead of raising.
        df['体重変化'] = pd.to_numeric(weight[1].str[:-1], errors='coerce')
        df['date'] = pd.to_datetime(df['date'])
        df['枠'] = df['枠'].astype(int)
        df['馬番'] = df['馬番'].astype(int)
        df['斤量'] = df['斤量'].astype(int)
        # Venue code: characters 4-5 of the race_id.
        df['開催'] = df.index.map(lambda x: str(x)[4:6])
        # Number of rows sharing the race_id after the filter above.
        df['n_horses'] = df.index.map(df.index.value_counts())
        df = df[[
            '枠', '馬番', '斤量', 'course_len', 'weather', 'race_type',
            'ground_state', 'date', 'horse_id', 'jockey_id', '性', '年齢',
            '体重', '体重変化', '開催', 'n_horses',
        ]]
        self.data_p = df.rename(columns={'枠': '枠番'})
class ModelEvaluator:
    """Score horses with a fitted classifier and simulate betting returns.

    Payout tables are loaded through ``Return.read_pickle`` and exposed per
    bet type as ``fukusho``, ``tansho``, ``umaren``, ``umatan``, ``wide``,
    ``sanrentan`` and ``sanrenpuku``.  Feature frames passed to the methods
    are indexed by race id (index level 0 is grouped on and looked up in the
    payout tables) and need a '馬番' column; '単勝' is needed wherever noted.

    Every ``*_return`` / ``*_box`` method returns the tuple
    ``(n_bets, return_rate, n_hits, std)``.
    """

    def __init__(self, model, return_tables_path_list):
        """Store the model and load the payout tables.

        Args:
            model: Object exposing ``predict_proba`` (and
                ``feature_importances_`` for ``feature_importance``).
            return_tables_path_list: Passed unchanged to
                ``Return.read_pickle``.
        """
        self.model = model
        self.rt = Return.read_pickle(return_tables_path_list)
        self.fukusho = self.rt.fukusho
        self.tansho = self.rt.tansho
        self.umaren = self.rt.umaren
        self.umatan = self.rt.umatan
        self.wide = self.rt.wide
        self.sanrentan = self.rt.sanrentan
        self.sanrenpuku = self.rt.sanrenpuku

    def predict_proba(self, X, train=True, std=True, minmax=False):
        """Return the positive-class score for every row of ``X``.

        Args:
            X: Feature frame indexed by race id.
            train: When True, the '単勝' column is dropped from ``X`` before
                prediction; when False, ``X`` is passed to the model as is.
            std: Standardise the scores within each race (index level 0), so
                a score expresses how a horse ranks inside its own race.
            minmax: Rescale the resulting scores to the 0-1 range over the
                whole input.

        Returns:
            pandas.Series aligned with ``X.index``.
        """
        features = X.drop(['単勝'], axis=1) if train else X
        # The model is given the frame only; no extra keyword is forwarded.
        proba = pd.Series(
            self.model.predict_proba(features)[:, 1], index=X.index
        )
        if std:
            proba = proba.groupby(level=0).transform(
                lambda x: (x - x.mean()) / x.std()
            )
        if minmax:
            proba = (proba - proba.min()) / (proba.max() - proba.min())
        return proba

    def predict(self, X, threshold=0.5, train=True):
        """Return a 0/1 list: 1 where the score is not below ``threshold``.

        The scores are also stored in ``self.proba``.  ``train`` is forwarded
        to ``predict_proba``.
        """
        y_pred = self.predict_proba(X, train=train)
        self.proba = y_pred
        return [0 if p < threshold else 1 for p in y_pred]

    def score(self, y_true, X):
        """Return ``roc_auc_score`` of the scores from ``predict_proba(X)``."""
        return roc_auc_score(y_true, self.predict_proba(X))

    def feature_importance(self, X, n_display=20):
        """Return the ``n_display`` most important features as a DataFrame.

        ``X`` only supplies the column names; they are paired positionally
        with ``self.model.feature_importances_``.
        """
        importances = pd.DataFrame({'features': X.columns,
                                    'importance': self.model.feature_importances_})
        return importances.sort_values("importance", ascending=False)[:n_display]

    def pred_table(self, X, threshold=0.5, bet_only=True, tansho=True, train=True):
        """Build a per-horse table of horse number, score and bet decision.

        Args:
            X: Feature frame indexed by race id.
            threshold: Score at or above which a horse is marked ``pred == 1``.
            bet_only: When True, return only the rows with ``pred == 1`` and
                omit the 'pred' column.
            tansho: When True (default), include the '単勝' column of ``X``;
                set to False for frames that have no '単勝' column.
            train: Forwarded to ``predict`` / ``predict_proba``.

        Returns:
            DataFrame with '馬番', optionally '単勝', 'score' and, unless
            ``bet_only``, 'pred'.
        """
        columns = ['馬番', '単勝'] if tansho else ['馬番']
        pred_table = X[columns].copy()
        pred_table['pred'] = self.predict(X, threshold, train=train)
        pred_table['score'] = self.proba
        if bet_only:
            return pred_table[pred_table['pred'] == 1][columns + ['score']]
        return pred_table[columns + ['score', 'pred']]

    def bet(self, race_id, kind, umaban, amount):
        """Return the payout of one bet.

        Args:
            race_id: Index value looked up in the payout table.
            kind: One of 'fukusho', 'tansho', 'umaren', 'umatan', 'wide',
                'sanrentan', 'sanrenpuku'.
            umaban: A horse number for 'fukusho'/'tansho'; a sequence of
                horse numbers for the other kinds.
            amount: Stake.  Table payouts are divided by 100 before being
                multiplied by the stake.

        Returns:
            The payout (0 when the bet loses).  When the computed payout is
            not ``>= 0`` (for example NaN), ``amount`` is returned instead,
            i.e. the stake is handed back.

        Raises:
            ValueError: If ``kind`` is not one of the supported bet types.
        """
        if kind == 'fukusho':
            rt_1R = self.fukusho.loc[race_id]
            hits = (rt_1R[['win_0', 'win_1', 'win_2']] == umaban).values
            payouts = rt_1R[['return_0', 'return_1', 'return_2']].values
            return_ = np.sum(hits * payouts * amount / 100)
        elif kind == 'tansho':
            rt_1R = self.tansho.loc[race_id]
            return_ = (rt_1R['win'] == umaban) * rt_1R['return'] * amount / 100
        elif kind == 'umaren':
            rt_1R = self.umaren.loc[race_id]
            # Order-insensitive match of the two horses.
            return_ = (set(rt_1R[['win_0', 'win_1']]) == set(umaban)) * rt_1R['return'] / 100 * amount
        elif kind == 'umatan':
            rt_1R = self.umatan.loc[race_id]
            # Order-sensitive match of the two horses.
            return_ = (list(rt_1R[['win_0', 'win_1']]) == list(umaban)) * rt_1R['return'] / 100 * amount
        elif kind == 'wide':
            rt_1R = self.wide.loc[race_id]
            # The lookup is treated as several rows; every row whose pair
            # matches contributes its payout.
            matched = rt_1R[['win_0', 'win_1']].apply(
                lambda x: set(x) == set(umaban), axis=1
            )
            return_ = (matched * rt_1R['return'] / 100 * amount).sum()
        elif kind == 'sanrentan':
            rt_1R = self.sanrentan.loc[race_id]
            return_ = (list(rt_1R[['win_0', 'win_1', 'win_2']]) == list(umaban)) * rt_1R['return'] / 100 * amount
        elif kind == 'sanrenpuku':
            rt_1R = self.sanrenpuku.loc[race_id]
            return_ = (set(rt_1R[['win_0', 'win_1', 'win_2']]) == set(umaban)) * rt_1R['return'] / 100 * amount
        else:
            raise ValueError('unknown bet kind: {!r}'.format(kind))
        if not (return_ >= 0):
            return_ = amount
        return return_

    @staticmethod
    def _summarize(return_list, n_bets, stake=None):
        """Turn per-race returns into ``(n_bets, return_rate, n_hits, std)``.

        ``stake`` is the total amount wagered and defaults to ``n_bets``
        (one unit per bet).  With nothing wagered the rate and std are NaN.
        """
        stake = n_bets if stake is None else stake
        n_hits = np.sum([x > 0 for x in return_list])
        if not return_list or not stake:
            return n_bets, np.nan, n_hits, np.nan
        std = np.std(return_list) * np.sqrt(len(return_list)) / stake
        return_rate = np.sum(return_list) / stake
        return n_bets, return_rate, n_hits, std

    def _flat_bets(self, pred_table, kind):
        """Bet one unit of ``kind`` on every row; return the per-race totals."""
        return [
            np.sum([self.bet(race_id, kind, umaban, 1) for umaban in preds['馬番']])
            for race_id, preds in pred_table.groupby(level=0)
        ]

    def _pair_box(self, X, threshold, kind, arrange):
        """Box-bet every pair of selected horses in races with 2+ selections.

        ``arrange`` is ``combinations`` (unordered pairs) or ``permutations``
        (ordered pairs).  Races with fewer than two selected horses place no
        bet and contribute nothing to the result.
        """
        pred_table = self.pred_table(X, threshold, bet_only=False)
        n_bets = 0
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            picks = preds.query('pred == 1')['馬番']
            if len(picks) < 2:
                continue
            race_return = 0
            for umaban in arrange(picks, 2):
                race_return += self.bet(race_id, kind, umaban, 1)
                n_bets += 1
            # One entry per race, appended after all of its tickets are settled.
            return_list.append(race_return)
        return self._summarize(return_list, n_bets)

    def fukusho_return(self, X, threshold=0.5):
        """Bet one unit 'fukusho' on every selected horse."""
        pred_table = self.pred_table(X, threshold)
        return self._summarize(self._flat_bets(pred_table, 'fukusho'), len(pred_table))

    def tansho_return(self, X, threshold=0.5):
        """Bet one unit 'tansho' on every selected horse.

        The selection table is also stored in ``self.sample``.
        """
        pred_table = self.pred_table(X, threshold)
        self.sample = pred_table
        return self._summarize(self._flat_bets(pred_table, 'tansho'), len(pred_table))

    def tansho_return_proper(self, X, threshold=0.5):
        """Bet 'tansho' on every selected horse with a stake of 1/'単勝'.

        The return rate and std are relative to the total amount staked,
        not to the number of bets.
        """
        # Horses the model decided to bet on.
        pred_table = self.pred_table(X, threshold)
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(np.sum(preds.apply(
                lambda x: self.bet(race_id, 'tansho', x['馬番'], 1 / x['単勝']),
                axis=1,
            )))
        bet_money = (1 / pred_table['単勝']).sum()
        return self._summarize(return_list, len(pred_table), stake=bet_money)

    def umaren_box(self, X, threshold=0.5, n_aite=5):
        """Box 'umaren' over all unordered pairs of selected horses.

        ``n_aite`` is not used by this method.
        """
        return self._pair_box(X, threshold, 'umaren', combinations)

    def umatan_box(self, X, threshold=0.5, n_aite=5):
        """Box 'umatan' over all ordered pairs of selected horses.

        ``n_aite`` is not used by this method.
        """
        return self._pair_box(X, threshold, 'umatan', permutations)

    def wide_box(self, X, threshold=0.5, n_aite=5):
        """Box 'wide' over all unordered pairs of selected horses.

        ``n_aite`` is not used by this method.
        """
        return self._pair_box(X, threshold, 'wide', combinations)
def sanrentan_box(self, X, threshold=0.5):
    """Simulate a 'sanrentan' box bet on every race with three or more picks.

    For each race in ``self.pred_table(X, threshold)`` that has at least
    three rows, one unit is bet on every ordered triple of the '馬番' values
    through ``self.bet``.

    NOTE(review): defined at module level although it takes ``self``; it only
    needs an object providing ``pred_table`` and ``bet``.  Confirm whether it
    was meant to be a ModelEvaluator method.

    Returns:
        Tuple ``(n_bets, return_rate, n_hits, std)``.
    """
    selected = self.pred_table(X, threshold)
    n_bets = 0
    return_list = []
    for race_id, preds in selected.groupby(level=0):
        if len(preds) >= 3:
            tickets = list(permutations(preds['馬番'], 3))
            n_bets += len(tickets)
            return_list.append(
                sum(self.bet(race_id, 'sanrentan', ticket, 1) for ticket in tickets)
            )
    hit_flags = [payout > 0 for payout in return_list]
    std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
    n_hits = np.sum(hit_flags)
    return_rate = np.sum(return_list) / n_bets
    return n_bets, return_rate, n_hits, std
def sanrenpuku_box(self, X, threshold=0.5):
    """Simulate a 'sanrenpuku' box bet on every race with three or more picks.

    For each race in ``self.pred_table(X, threshold)`` that has at least
    three rows, one unit is bet on every unordered triple of the '馬番'
    values through ``self.bet``.

    NOTE(review): defined at module level although it takes ``self``; it only
    needs an object providing ``pred_table`` and ``bet``.  Confirm whether it
    was meant to be a ModelEvaluator method.

    Returns:
        Tuple ``(n_bets, return_rate, n_hits, std)``.
    """
    selected = self.pred_table(X, threshold)
    n_bets = 0
    return_list = []
    for race_id, preds in selected.groupby(level=0):
        if len(preds) >= 3:
            tickets = list(combinations(preds['馬番'], 3))
            n_bets += len(tickets)
            return_list.append(
                sum(self.bet(race_id, 'sanrenpuku', ticket, 1) for ticket in tickets)
            )
    hit_flags = [payout > 0 for payout in return_list]
    std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
    n_hits = np.sum(hit_flags)
    return_rate = np.sum(return_list) / n_bets
    return n_bets, return_rate, n_hits, std
def umaren_nagashi(self, X, threshold=0.5, n_aite=5):
    """Simulate 'umaren' bets built around the selected horses of each race.

    * Exactly one selected horse: it is paired with each of the next
      ``n_aite`` rows by descending 'score' (the top-ranked row is skipped).
    * Two or more selected horses: every unordered pair of them is bet.
    * No selected horse, or no partner available: the race places no bet.

    Only the tickets actually placed are counted in ``n_bets``, so a race
    with fewer than ``n_aite`` partners is not over-counted.

    NOTE(review): defined at module level although it takes ``self``; it only
    needs an object providing ``pred_table`` and ``bet``.  Confirm whether it
    was meant to be a ModelEvaluator method.

    Returns:
        Tuple ``(n_bets, return_rate, n_hits, std)``; the rate and std are
        NaN when no bet was placed.
    """
    pred_table = self.pred_table(X, threshold, bet_only=False)
    n_bets = 0
    return_list = []
    for race_id, preds in pred_table.groupby(level=0):
        jiku_numbers = preds.query('pred == 1')['馬番']
        if len(jiku_numbers) == 1:
            jiku = jiku_numbers.values[0]
            aite_numbers = preds.sort_values('score', ascending=False)\
                .iloc[1:(n_aite + 1)]['馬番']
            tickets = [[jiku, aite] for aite in aite_numbers]
        elif len(jiku_numbers) >= 2:
            tickets = list(combinations(jiku_numbers, 2))
        else:
            continue
        if not tickets:
            continue
        return_list.append(
            sum(self.bet(race_id, 'umaren', ticket, 1) for ticket in tickets)
        )
        n_bets += len(tickets)
    if n_bets == 0:
        return n_bets, np.nan, 0, np.nan
    std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
    n_hits = np.sum([x > 0 for x in return_list])
    return_rate = np.sum(return_list) / n_bets
    return n_bets, return_rate, n_hits, std
def umatan_nagashi(self, X, threshold=0.5, n_aite=5):
    """Simulate 'umatan' bets built around the selected horses of each race.

    * Exactly one selected horse: it is bet in first position against each
      of the next ``n_aite`` rows by descending 'score' (the top-ranked row
      is skipped).
    * Two or more selected horses: every ordered pair of them is bet.
    * No selected horse, or no partner available: the race places no bet.

    The return of a single-axis race is recorded like any other race, and
    only the tickets actually placed are counted in ``n_bets``.

    NOTE(review): defined at module level although it takes ``self``; it only
    needs an object providing ``pred_table`` and ``bet``.  Confirm whether it
    was meant to be a ModelEvaluator method.

    Returns:
        Tuple ``(n_bets, return_rate, n_hits, std)``; the rate and std are
        NaN when no bet was placed.
    """
    pred_table = self.pred_table(X, threshold, bet_only=False)
    n_bets = 0
    return_list = []
    for race_id, preds in pred_table.groupby(level=0):
        jiku_numbers = preds.query('pred == 1')['馬番']
        if len(jiku_numbers) == 1:
            jiku = jiku_numbers.values[0]
            aite_numbers = preds.sort_values('score', ascending=False)\
                .iloc[1:(n_aite + 1)]['馬番']
            tickets = [[jiku, aite] for aite in aite_numbers]
        elif len(jiku_numbers) >= 2:
            tickets = list(permutations(jiku_numbers, 2))
        else:
            continue
        if not tickets:
            continue
        return_list.append(
            sum(self.bet(race_id, 'umatan', ticket, 1) for ticket in tickets)
        )
        n_bets += len(tickets)
    if n_bets == 0:
        return n_bets, np.nan, 0, np.nan
    std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
    n_hits = np.sum([x > 0 for x in return_list])
    return_rate = np.sum(return_list) / n_bets
    return n_bets, return_rate, n_hits, std
def wide_nagashi(self, X, threshold=0.5, n_aite=5):
    """Simulate 'wide' bets built around the selected horses of each race.

    * Exactly one selected horse: it is paired with each of the next
      ``n_aite`` rows by descending 'score' (the top-ranked row is skipped).
    * Two or more selected horses: every unordered pair of them is bet.
    * No selected horse: the race is ignored.

    NOTE(review): defined at module level although it takes ``self``; it only
    needs an object providing ``pred_table`` and ``bet``.  Confirm whether it
    was meant to be a ModelEvaluator method.

    Returns:
        Tuple ``(n_bets, return_rate, n_hits, std)``.
    """
    table = self.pred_table(X, threshold, bet_only=False)
    n_bets = 0
    return_list = []
    for race_id, preds in table.groupby(level=0):
        axis_rows = preds.query('pred == 1')
        n_axis = len(axis_rows)
        if n_axis == 0:
            continue
        if n_axis == 1:
            jiku = axis_rows['馬番'].values[0]
            ranked = preds.sort_values('score', ascending=False)
            partners = ranked.iloc[1:(n_aite + 1)]['馬番']

            def _payout(aite, race_id=race_id, jiku=jiku):
                return self.bet(race_id, 'wide', [jiku, aite], 1)

            race_return = partners.map(_payout).sum()
            n_bets += len(partners)
        else:
            race_return = 0
            for pair in combinations(axis_rows['馬番'], 2):
                race_return += self.bet(race_id, 'wide', pair, 1)
                n_bets += 1
        return_list.append(race_return)
    hit_flags = [payout > 0 for payout in return_list]
    std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
    n_hits = np.sum(hit_flags)
    return_rate = np.sum(return_list) / n_bets
    return n_bets, return_rate, n_hits, std
def sanrentan_nagashi(self, X, threshold=1.5, n_aite=7):
    """Simulate 'sanrentan' bets built around the selected horses of each race.

    * Fewer than two selected horses: the race is ignored.
    * Exactly two selected horses: they are bet, in table order, in the first
      two positions, with each of the next ``n_aite`` rows by descending
      'score' (the two top-ranked rows are skipped) in third position.
    * Three or more selected horses: every ordered triple of them is bet.

    NOTE(review): defined at module level although it takes ``self``; it only
    needs an object providing ``pred_table`` and ``bet``.  Confirm whether it
    was meant to be a ModelEvaluator method.

    Returns:
        Tuple ``(n_bets, return_rate, n_hits, std)``.
    """
    table = self.pred_table(X, threshold, bet_only=False)
    n_bets = 0
    return_list = []
    for race_id, preds in table.groupby(level=0):
        axis_rows = preds.query('pred == 1')
        n_axis = len(axis_rows)
        if n_axis < 2:
            continue
        if n_axis == 2:
            ranked = preds.sort_values('score', ascending=False)
            partners = ranked.iloc[2:(n_aite + 2)]['馬番']

            def _payout(aite, race_id=race_id, axis_rows=axis_rows):
                ticket = np.append(axis_rows['馬番'].values, aite)
                return self.bet(race_id, 'sanrentan', ticket, 1)

            race_return = partners.map(_payout).sum()
            n_bets += len(partners)
        else:
            race_return = 0
            for triple in permutations(axis_rows['馬番'], 3):
                race_return += self.bet(race_id, 'sanrentan', triple, 1)
                n_bets += 1
        return_list.append(race_return)
    hit_flags = [payout > 0 for payout in return_list]
    std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
    n_hits = np.sum(hit_flags)
    return_rate = np.sum(return_list) / n_bets
    return n_bets, return_rate, n_hits, std

# 3: 関数定義

In [6]:
# Split a DataFrame into train_data and test_data.
def split_data(df, test_size=0.3):
    """Split ``df`` by index value, ordered by the 'date' column.

    The distinct index values are taken in the order they first appear after
    sorting by 'date'; the first ``round(n * (1 - test_size))`` of them form
    the training part and the rest the test part.  All rows sharing an index
    value stay together, and the 'date' column is kept in both parts.

    Returns:
        Tuple ``(train_data, test_data)``.
    """
    ordered_ids = df.sort_values('date').index.unique()
    cut = round(len(ordered_ids) * (1 - test_size))
    train_data = df.loc[ordered_ids[:cut]]
    test_data = df.loc[ordered_ids[cut:]]
    return train_data, test_data
def process_categorical(df, target_columns):
    """Return a copy of ``df`` with its categorical columns encoded.

    Each column in ``target_columns`` has its missing values replaced by
    'Na' and is label-encoded with a fresh ``LabelEncoder``.  The frame is
    then passed through ``pd.get_dummies`` so that any other categorical
    column is turned into dummy variables, and the target columns are
    finally cast to the 'category' dtype.
    """
    encoded = df.copy()
    for name in target_columns:
        filled = encoded[name].fillna('Na')
        encoded[name] = LabelEncoder().fit_transform(filled)
    # Dummy-encode whatever categorical columns remain besides the targets.
    encoded = pd.get_dummies(encoded)
    for name in target_columns:
        encoded[name] = encoded[name].astype('category')
    return encoded
def gain(return_func, X, n_samples=100, t_range=(0.5, 3.5)):
    """Evaluate ``return_func`` over a sweep of thresholds.

    Args:
        return_func: Callable ``(X, threshold)`` returning
            ``(n_bets, return_rate, n_hits, std)``.
        X: Passed unchanged to ``return_func``.
        n_samples: Number of thresholds to try.
        t_range: Two-element sequence ``(start, stop)``; thresholds are
            spaced evenly from ``start`` (included) towards ``stop``
            (not reached).

    Returns:
        DataFrame indexed by threshold with the columns 'return_rate',
        'n_hits', 'std' and 'n_bets'.  Thresholds that produce two bets or
        fewer are left out.
    """
    results = {}
    for i in tqdm(range(n_samples)):
        # Linear interpolation between t_range[0] and t_range[1].
        threshold = t_range[1] * i / n_samples + t_range[0] * (1-(i/n_samples))
        n_bets, return_rate, n_hits, std = return_func(X, threshold)
        if n_bets > 2:
            results[threshold] = {'return_rate': return_rate,
                                  'n_hits': n_hits,
                                  'std': std,
                                  'n_bets': n_bets}
    return pd.DataFrame(results).T
def update_data(old, new):
    """Return ``old`` and ``new`` concatenated, with ``new`` taking precedence.

    Rows of ``old`` whose index value also occurs in ``new`` are dropped
    first; the remaining rows of ``old`` come before all rows of ``new``.
    """
    superseded = old.index.isin(new.index)
    kept_old = old[~superseded]
    return pd.concat([kept_old, new])
def plot(df, label=' '):
    """Plot 'return_rate' against the frame index with a +/- 'std' band.

    Draws on the current matplotlib axes.  To plot against the number of
    bets instead, use ``df['n_bets']`` as the x values of both calls.
    """
    x_values = df.index
    lower = df['return_rate'] - df['std']
    upper = df['return_rate'] + df['std']
    # Translucent band one standard deviation wide on each side.
    plt.fill_between(x_values, y1=lower, y2=upper, alpha=0.3)
    # Return rate as a solid line.
    plt.plot(x_values, df['return_rate'], label=label)
    plt.legend()  # show the legend entry set through `label`
    plt.grid(True)

# 4: データの作成

## 4.1: rr.data_c

In [9]:
# Build the race-results data set (`rr`) from the pickled tables.
rr = RaceResults(race_results)

# Preprocessing
rr.preprocessing()

# Add each horse's past results
hr = HorseResults(horse_results)
rr.merge_horse_results(hr)

# Add the pedigree data (5 generations)
p = Peds(ped_results)
p.encode()
rr.merge_ped_results(p.ped_results_e)

# Process the categorical variables
rr.process_categorical()

100%|██████████| 524/524 [00:40<00:00, 13.06it/s]
100%|██████████| 524/524 [00:39<00:00, 13.28it/s]
100%|██████████| 524/524 [00:40<00:00, 12.85it/s]


## 4.2: st.data_c

In [None]:
# Scraping of the entry-table (shutuba) data.
# The wanted race ids and the date are passed as arguments.
# e.g. 2021050405: 2021-10-23, Tokyo
# url = "https://db.netkeiba.com/race/" + race_id

# Race ids 202105040601 .. 202105040612 (two-digit race number suffix).
race_id_list = [f'2021050406{race_no:02d}' for race_no in range(1, 13)]

In [None]:
# Build the entry table for the race ids above; `date` is stored on every row.
st = ShutubaTable.scrape(race_id_list=race_id_list, date='2021/10/30')

In [None]:
# Build the entry-table data set, mirroring the steps applied to `rr`.

st.preprocessing()

st.merge_horse_results(hr)

st.merge_ped_results(p.ped_results_e)

# The race-results frame `rr.data_pe` is handed over as `results_m`
# (presumably so categories are encoded consistently with it - confirm).
st.process_categorical(results_m = rr.data_pe)

# 6: 賭け用ソフトウェア

## 6.1: optuna

In [None]:
# Features: every column of rr.data_c except the ones listed; target: 'rank'.
X = rr.data_c.drop(columns=['rank', 'date', '単勝', '体重', '体重変化'])
y = rr.data_c['rank']

In [None]:
# Date-ordered split into a training part and a validation part.
train, valid = split_data(rr.data_c)

# Columns that are not used as features.
_non_feature_cols = ['rank', 'date', '単勝', '体重', '体重変化']
X_train, y_train = train.drop(columns=_non_feature_cols), train['rank']
X_valid, y_valid = valid.drop(columns=_non_feature_cols), valid['rank']

In [2]:
# Train through Optuna's LightGBM integration on the train/valid split.
import optuna.integration.lightgbm as lgb_o

lgb_train = lgb_o.Dataset(X_train.values, y_train.values)
lgb_valid = lgb_o.Dataset(X_valid.values, y_valid.values)

# binary: used when the prediction target is 0 or 1.
params = {
    'objective': 'binary',
    'random_state': 100
}

# Both data sets are passed as validation sets.
lgb_clf_o = lgb_o.train(params,
                        lgb_train,
                        valid_sets=(lgb_train, lgb_valid),
                        verbose_eval=100,
                        early_stopping_rounds=10
                        )

NameError: name 'X_train' is not defined

In [None]:
# Display the parameters stored on the booster returned by lgb_o.train.
lgb_clf_o.params

In [None]:
# Fit a classifier with those parameters on the full X / y.
lgb_clf = lgb.LGBMClassifier(**lgb_clf_o.params)
lgb_clf.fit(X.values, y.values)

## 6.2: 複勝馬の予想

In [None]:
# ModelEvaluator built on the fitted classifier and the overall payout tables.
# NOTE(review): this path differs from the 'pickle/overall/...' path used when
# loading the pickles at the top of the notebook; the alternative is kept below.
me = ModelEvaluator(lgb_clf, ['drive/My Drive/Horse_racing/pickle/overall/return_tables.pickle'])
# me = ModelEvaluator(lgb_clf, ['pickle/overall/return_tables.pickle'])

In [None]:
# Feature matrix for the entry table: same weight/date columns removed as for X.
X_fact = st.data_c.drop(['date', '体重', '体重変化'], axis=1)

In [None]:
# The n horses the model scores highest.
# pred = me.predict_proba(st.data_c.drop(['date'], axis=1), train=False)
# train=False: X_fact is passed to the model without dropping '単勝'.
pred = me.predict_proba(X_fact, train=False)
proba_table = st.data_c[['馬番']].copy()
proba_table['score'] = pred
# Show the ten highest-scoring rows.
proba_table.sort_values('score', ascending = False).head(10)

In [None]:
# Output only the horse numbers whose score is at or above a given value.
# sanrenpuku, sanrentan: around 1.5
# umaren, umatan: around 2.3
# tansho: around 3.4
# NOTE(review): this call passes the `tansho` and `train` keywords; verify
# that the ModelEvaluator.pred_table in use accepts them.
pred_table = me.pred_table(X_fact, threshold=1.3, tansho=False, train=False)
for race_id, preds in pred_table.groupby(level=0):
  if len(preds) >= 3: # skips races with too few picks for the multi-horse bet types
    display(preds)

In [None]:
# Features used for the prediction, ranked by importance.
me.feature_importance(X_fact)

## 6.3: 払戻表
※ 既に終了しているレースのみ有効

In [None]:
# Payout tables for the race ids above (only meaningful for finished races).
return_tables_today = Return.scrape(race_id_list)

In [None]:
# Wrap the scraped payout tables in a Return object.
rt = Return(return_tables_today)

## 6.4: 当選結果
※ 既に終了しているレースのみ有効

In [None]:
# Fetch the race data for the race ids above.

race_results2 = RaceResults.scrape(race_id_list)
rr2 = RaceResults(race_results2)

# Preprocessing
rr2.preprocessing()

# Add each horse's past results
horse_id_list2 = race_results2['horse_id'].unique()
hr2 = HorseResults(HorseResults.scrape(horse_id_list2))
# NOTE(review): `hr2` is built above but the merge uses `hr` - confirm which
# one is intended.
rr2.merge_horse_results(hr)

# Add the pedigree data (5 generations)
p2 = Peds(Peds.scrape(horse_id_list2))
p2.encode()
# NOTE(review): `p2` is built and encoded above but the merge uses
# `p.ped_results_e` - confirm which one is intended.
rr2.merge_ped_results(p.ped_results_e)

# Process the categorical variables
rr2.process_categorical()

In [None]:
# Shape the data for the return calculation ('単勝' is kept; only 'rank' and
# 'date' are dropped).
X_results = rr2.data_c.drop(['rank', 'date'], axis=1)

In [None]:
# Compute the sanrenpuku results over the default threshold sweep of gain().
temp = gain(me.sanrenpuku_box, X_results)

In [None]:
# Return rate for each threshold.
plot(temp, 'today')

In [None]:
# n_bets, return_rate, n_hits, std
# NOTE(review): this call passes `show=True` and uses X_fact rather than
# X_results; verify that the sanrenpuku_box in use accepts `show` and that
# X_fact carries the columns it needs.
me.sanrenpuku_box(X_fact, threshold=1.2, show=True)

# 7: 交差検証ソフトウェア

In [None]:
# Split into training, validation and test data: the first split holds out
# the test part, the second splits the remainder into train and valid.
train, test = split_data(rr.data_c)
train, valid = split_data(train)

In [None]:
# Features and target for the training and validation parts.
_non_feature_cols = ['rank', 'date', '単勝']
X_train, y_train = train.drop(columns=_non_feature_cols), train['rank']
X_valid, y_valid = valid.drop(columns=_non_feature_cols), valid['rank']

In [None]:
# Train through Optuna's LightGBM integration on the train/valid split.
import optuna.integration.lightgbm as lgb_o

lgb_train = lgb_o.Dataset(X_train.values, y_train.values)
lgb_valid = lgb_o.Dataset(X_valid.values, y_valid.values)

# binary: used when the prediction target is 0 or 1.
params = {
    'objective': 'binary',
    'random_state': 100
}

# Both data sets are passed as validation sets.
lgb_clf_o = lgb_o.train(params,
                        lgb_train,
                        valid_sets=(lgb_train, lgb_valid),
                        verbose_eval=100,
                        early_stopping_rounds=10
                        )

  * feature_pre_filter: (LightGBM 3.0からできたもの。)min_data_in_leaf=min_child_samplesをチューニングする時にはFalseにする。
  * lambda_l1, lambda_l2: 正則化。過学習を防止する。
  * num_leaves: 葉の数
  * feature_fraction: 特徴量の60%だけを選んで一つの木を育てる。
  * bagging_fraction: データを100%使って一つの木を育てる。
  * bagging_freq: バギング(データの水増し)する頻度。今回はバギングしない。
  * min_child_samples: 最終的に一つの葉に残るデータ数。

In [None]:
# Display the parameters stored on the booster returned by lgb_o.train.
lgb_clf_o.params

In [None]:
# Split into training and test data in chronological order.
train, test = split_data(rr.data_c)
X_train = train.drop(['rank', 'date', '単勝'], axis=1)
y_train = train['rank']
# X_test keeps '単勝': only 'rank' and 'date' are dropped here.
X_test = test.drop(['rank', 'date'], axis=1)
y_test = test['rank']

In [None]:
# Fit a classifier with the tuned parameters on the training part only.
lgb_clf = lgb.LGBMClassifier(**lgb_clf_o.params)
lgb_clf.fit(X_train.values, y_train.values)

In [None]:
# ModelEvaluator built on the classifier fitted above and the overall payout tables.
me = ModelEvaluator(lgb_clf, ['drive/My Drive/Horse_racing/pickle/overall/return_tables.pickle'])

In [None]:
# Features used for the prediction ('単勝' is removed so the columns match
# the ones the model was fitted on).
me.feature_importance(X_test.drop(['単勝'], axis=1))

In [None]:
# Previously saved cross-validation results (disabled; uncomment to load).
# g_tansho=pd.read_pickle('drive/My Drive/Horse_racing/pickle/results/g_tansho.pickle')
# g_proper=pd.read_pickle('drive/My Drive/Horse_racing/pickle/results/g_proper.pickle')
# g_umaren=pd.read_pickle('drive/My Drive/Horse_racing/pickle/results/g_umaren.pickle')
# g_umatan=pd.read_pickle('drive/My Drive/Horse_racing/pickle/results/g_umatan.pickle')
# g_wide=pd.read_pickle('drive/My Drive/Horse_racing/pickle/results/g_wide.pickle')
# g_sanrentan=pd.read_pickle('drive/My Drive/Horse_racing/pickle/results/g_sanrentan.pickle')
# g_sanrenpuku=pd.read_pickle('drive/My Drive/Horse_racing/pickle/results/g_sanrenpuku.pickle')
# g_umaren_nagashi=pd.read_pickle('drive/My Drive/Horse_racing/pickle/results/g_umaren_nagashi.pickle')
# g_umatan_nagashi=pd.read_pickle('drive/My Drive/Horse_racing/pickle/results/g_umatan_nagashi.pickle')
# g_wide_nagashi=pd.read_pickle('drive/My Drive/Horse_racing/pickle/results/g_wide_nagashi.pickle')
# g_sanrentan_nagashi=pd.read_pickle('drive/My Drive/Horse_racing/pickle/results/g_sanrentan_nagashi.pickle')

In [None]:
# Compute the return rates (disabled; uncomment the ones needed).
# NOTE: the plotting cells that follow read these g_* frames, so each frame
# must be computed here or loaded from a saved pickle before plotting.
# g_fukusho = gain(me.fukusho_return, X_test)
# g_proper = gain(me.tansho_return_proper, X_test)
# g_tansho = gain(me.tansho_return, X_test)
# g_umaren = gain(me.umaren_box, X_test)
# g_umatan = gain(me.umatan_box, X_test)
# g_wide = gain(me.wide_box, X_test)
# g_sanrentan = gain(me.sanrentan_box, X_test)
# g_sanrenpuku = gain(me.sanrenpuku_box, X_test)

# g_umaren_nagashi = gain(me.umaren_nagashi, X_test)
# g_umatan_nagashi = gain(me.umatan_nagashi, X_test)
# g_wide_nagashi = gain(me.wide_nagashi, X_test)
# g_sanrentan_nagashi = gain(me.sanrentan_nagashi, X_test)

In [None]:
# x axis: threshold
# y axis: return rate

plt.figure(figsize=(10, 8))
for _frame, _label in (
    (g_tansho, 'tansho'),
    (g_proper, 'proper'),
    (g_umaren, 'umaren'),
    (g_umatan, 'umatan'),
    (g_wide, 'wide'),
    (g_sanrentan, 'sanrentan'),
    (g_sanrenpuku, 'sanrenpuku'),
):
    plot(_frame, _label)

In [None]:
# x axis: threshold
# y axis: return rate

plt.figure(figsize=(10, 8))
for _frame, _label in (
    (g_umaren_nagashi, 'umaren_nagashi'),
    (g_umatan_nagashi, 'umatan_nagashi'),
    (g_wide_nagashi, 'wide_nagashi'),
    (g_sanrentan_nagashi, 'sanrentan_nagashi'),
):
    plot(_frame, _label)

In [None]:
# x axis: number of bets
# y axis: return rate
# With few bets a single hit dominates the result, so ideally the expected
# value is also positive where the number of bets is large.

plt.figure(figsize=(10, 8))
for _frame, _label in (
    (g_proper, 'proper'),
    (g_tansho, 'tansho'),
    (g_umaren, 'umaren'),
    (g_umatan, 'umatan'),
    (g_wide, 'wide'),
    (g_sanrentan, 'sanrentan'),
    (g_sanrenpuku, 'sanrenpuku'),  # legend label spelled like the other plots
):
    plt.plot(_frame['n_bets'], _frame['return_rate'], label=_label)
plt.hlines(1, xmin=-10, xmax=2000) # the line y = 1
plt.xlim(-10, 2000)
plt.legend()
plt.grid(True)

In [None]:
# x axis: number of bets
# y axis: return rate

plt.figure(figsize=(10, 8))
for _frame, _label in (
    (g_umaren_nagashi, 'umaren_nagashi'),
    (g_umatan_nagashi, 'umatan_nagashi'),
    (g_wide_nagashi, 'wide_nagashi'),
    (g_sanrentan_nagashi, 'sanrentan_nagashi'),
):
    plt.plot(_frame['n_bets'], _frame['return_rate'], label=_label)
plt.hlines(1, xmin=-10, xmax=2000) # the line y = 1
plt.xlim(-10, 2000)
plt.legend()
plt.grid(True)

In [None]:
# Sharpe ratio: (return_rate - 1) / std, plotted against the number of bets.

plt.figure(figsize=(10, 8))
for _frame, _label in (
    (g_proper, 'proper'),
    (g_tansho, 'tansho'),
    (g_umaren, 'umaren'),
    (g_umatan, 'umatan'),
    (g_wide, 'wide'),
    (g_sanrentan, 'sanrentan'),
    (g_sanrenpuku, 'sanrenpuku'),
):
    plt.plot(_frame['n_bets'], (_frame['return_rate'] - 1) / _frame['std'], label=_label)
plt.xlim(-10, 200)
plt.ylim(-10, 5)
plt.legend()
plt.grid()

In [None]:
# Sharpe ratio: (return_rate - 1) / std, plotted against the number of bets.

plt.figure(figsize=(10, 8))
for _frame, _label in (
    (g_umaren_nagashi, 'umaren_nagashi'),
    (g_umatan_nagashi, 'umatan_nagashi'),
    (g_wide_nagashi, 'wide_nagashi'),
    (g_sanrentan_nagashi, 'sanrentan_nagashi'),
):
    plt.plot(_frame['n_bets'], (_frame['return_rate'] - 1) / _frame['std'], label=_label)
plt.xlim(-10, 2000)
plt.ylim(-10, 5)
plt.legend()
plt.grid()