## CSV出力

### 必須モジュールをインポート

In [1]:
import csv
import requests
import bs4
import re
import os
import numpy as np
import pandas as pd
from enum import Enum
import derby_func

### 定数宣言

In [2]:
TRAIN_RACE_ID = "202105030211"
TEST_RACE_ID = "202109030411"
HORSE_ID = "/horse/2018105233"

RACE = "/race/"
CSV_DIR = "csv"
URL_BASE = "https://db.netkeiba.com"
URL_RACE = "https://race.netkeiba.com/race/shutuba.html?race_id="

RACE_TABLE_NAME = "race_table_01 nk_tb_common"
HORSE_TABLE_NAME = "db_h_race_results nk_tb_common"
 
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36   '
}

In [3]:
class RaceRank(object):
    def __init__(self, id, score, rank):
        self.id = id
        self.score = score
        self.rank = rank

In [4]:
RACE_RANK = []
RACE_RANK.append(RaceRank(1, 100, "G1"))
RACE_RANK.append(RaceRank(2, 70, "G2"))
RACE_RANK.append(RaceRank(3, 50, "G3"))
RACE_RANK.append(RaceRank(4, 40, "L"))
RACE_RANK.append(RaceRank(5, 40, "OP"))
RACE_RANK.append(RaceRank(6, 30, "3勝"))
RACE_RANK.append(RaceRank(7, 30, "1600万下"))
RACE_RANK.append(RaceRank(8, 20, "2勝"))
RACE_RANK.append(RaceRank(9, 20, "1000万下"))
RACE_RANK.append(RaceRank(10, 10, "1勝"))
RACE_RANK.append(RaceRank(11, 10, "500万下"))
RACE_RANK.append(RaceRank(12, 5, "未勝利"))
RACE_RANK.append(RaceRank(13, 5, "新馬"))

In [5]:
PLACE = (
    "札幌",
    "函館",
    "福島",
    "中山",
    "東京",
    "新潟",
    "中京",
    "京都",
    "阪神",
    "小倉"
)

In [6]:
WEATHER = (
    "晴",
    "曇",
    "小雨",
    "小雪",
    "雨",
    "雪"
)

In [7]:
GROUND_STATE = (
    "良",
    "稍",
    "重",
    "不"
)

### URLからText取得

In [8]:
def get_text_from_page(url):
     
    try:
        res = requests.get(url, headers=HEADERS)
        res.encoding = res.apparent_encoding  
        text = res.text
         
        return text
    except:
        return None

### TextをまとめてCSVに出力

In [9]:
def get_old_race_info_from_text(header_flg, text, table_name, race_id, race_name):
    try:  
        info = []
        horse_id = []
        horse_names = []
        soup = bs4.BeautifulSoup(text, features='lxml')
        base_elem = soup.find(class_=table_name)
        elems = base_elem.find_all("tr")
        for elem in elems:
            row_info = []
            r_class = elem.get("class")
            r_cols = None
            if r_class==None:
                r_cols = elem.find_all("td")
            else:
                if header_flg:
                    r_cols = elem.find_all("th")
            if not r_cols==None:
                for r_col in r_cols:
                    scores = []
                    tmp_text = r_col.text
                    link = r_col.find("a")
                    if not link==None:
                        tmp = link.get('href')
                        if 'horse' in tmp and not '?pid' in tmp:
                            horse_id.append(tmp[:-1])
                            (name, scores) = get_horse_data(tmp[:-1], race_id, race_name)
                            horse_names.append(name)
                    tmp_text = tmp_text.replace("\n", "")
                    row_info.append(tmp_text.strip())
                    for score in scores:
                        row_info.append(score[0])
                info.append(row_info)
        return (info, horse_id, horse_names)
    except:
        print("err")
        return None

In [10]:
def get_race_info_from_text(text, table_name, race_id, race_name):
    try:  
        info = []
        horse_id = []
        horse_names = []
        soup = bs4.BeautifulSoup(text, features='lxml')
        base_elem = soup.find(class_=table_name)
        elems = base_elem.find_all("tr")
        for elem in elems:
            row_info = []
            scores = []
            r_cols = elem.find_all("td")
            if not r_cols==None:
                for r_col in r_cols:
                    if (not r_col==None):
                        tmp_text = r_col.text
                        tmp_text = tmp_text.replace("\n", "")
                        row_info.append(tmp_text.strip())
                        links = r_col.find_all("a")
                        for link in links:
                            if not link==None:
                                r_a = link.get("href")
                                if "horse/" in r_a:
                                    row_info.append(r_a)
                                    horse_id.append(r_a[23:])
                                    (name, scores) = get_horse_data(r_a[23:], race_id, race_name)
                                    horse_names.append(name)
                                    for score in scores:
                                        row_info.append(score[0])
            info.append(row_info)
        return (info, horse_id, horse_names)
    except:
        print("err")
        return None

In [11]:
def get_horse_info_from_text(header_flg, text, table_name):
    try:
        info = []
        soup = bs4.BeautifulSoup(text, features='lxml')
        base_elem = soup.find(class_=table_name)
        elems = base_elem.find_all("tr")
        param = soup.find(class_="db_prof_box")
        params = param.find_all("img")
        score = []
        score_2 = []
        for prm in params:
            score.append(prm.get("width"))
        if (len(score) > 18):
            for i in range(5):
                score_2.append([score[i * 5 + 1], score[i * 5 + 3]])
        for elem in elems:
            race_id = ""
            row_info = []
            r_class = elem.get("class")
            r_cols = None
            if r_class==None:
                r_cols = elem.find_all("td")
            else:
                if header_flg:
                    r_cols = elem.find_all("th")
            if not r_cols==None:
                for r_col in r_cols:
                    links = r_col.find_all("a")
                    print(r_col)
                    for link in links:
                        if (not link==None):
                            tmp = link.get('href')
                            if '/race/' in tmp and not 'sum' in tmp and not 'list' in tmp and not 'movie' in tmp:
                                race_id = str(tmp)
                    tmp_text = r_col.text
                    tmp_text = tmp_text.replace("\n", "")
                    row_info.append(tmp_text.strip())
                if not race_id=="":
                    row_info.append(race_id[6:len(race_id) - 1])
                info.append(row_info)
        return (info, score_2)
    except:
        print("err")
        return None

In [12]:
def get_name_from_text(text):
    try:
        soup = bs4.BeautifulSoup(text, features='lxml')
        title_text = soup.find('title').get_text()
        return title_text
    except:
        print("err")
        return None

### ホースデータの取得

In [13]:
def get_horse_data(horse_id, race_id, race_name):
    URL_BASE = "https://db.netkeiba.com"
    HORSE_TABLE_NAME = "db_h_race_results nk_tb_common"
    url = URL_BASE + horse_id
    text = get_text_from_page(url)
    (info, score) = get_horse_info_from_text(False, text, HORSE_TABLE_NAME)
    tmp = get_name_from_text(text)
    if not tmp==None:
        horse_name = tmp.replace('競馬データベース - netkeiba.com', '').split(' ')
        print(horse_name[0])
        file_path = "csv/horse/" + race_id[0:4] + race_name + "/" + horse_name[0] + ".csv"
        with open(file_path, "w", newline="", encoding='shift_jis') as f:
            writer = csv.writer(f)
            writer.writerows(info)
        return (horse_name[0], score)

### レースデータの取得

すでに結果が出ているレース用

In [14]:
"""
if __name__ == '__main__':
    url = URL_BASE + RACE + TRAIN_RACE_ID
    text = get_text_from_page(url)
    race_name = get_name_from_text(text).replace('競馬データベース - netkeiba.com', '').split('｜')
    if not os.path.exists(CSV_DIR + "/horse/" + TRAIN_RACE_ID[0:4] + race_name[0]):
        os.makedirs(CSV_DIR + "/horse/" + TRAIN_RACE_ID[0:4] + race_name[0])
    (info, horse_id, horse_names) = get_old_race_info_from_text(False, text, RACE_TABLE_NAME, TRAIN_RACE_ID, race_name[0])
    file_path = CSV_DIR+ RACE + TRAIN_RACE_ID[0:4] + race_name[0] + ".csv"
    with open(file_path, "w", newline="", encoding='shift_jis') as f:
        writer = csv.writer(f)
        writer.writerows(info)
"""

'\nif __name__ == \'__main__\':\n    url = URL_BASE + RACE + TRAIN_RACE_ID\n    text = get_text_from_page(url)\n    race_name = get_name_from_text(text).replace(\'競馬データベース - netkeiba.com\', \'\').split(\'｜\')\n    if not os.path.exists(CSV_DIR + "/horse/" + TRAIN_RACE_ID[0:4] + race_name[0]):\n        os.makedirs(CSV_DIR + "/horse/" + TRAIN_RACE_ID[0:4] + race_name[0])\n    (info, horse_id, horse_names) = get_old_race_info_from_text(False, text, RACE_TABLE_NAME, TRAIN_RACE_ID, race_name[0])\n    file_path = CSV_DIR+ RACE + TRAIN_RACE_ID[0:4] + race_name[0] + ".csv"\n    with open(file_path, "w", newline="", encoding=\'shift_jis\') as f:\n        writer = csv.writer(f)\n        writer.writerows(info)\n'

これから結果が出るレース用

In [15]:
if __name__ == '__main__':
    url = URL_RACE + TEST_RACE_ID
    text = get_text_from_page(url)
    race_name = get_name_from_text(text).replace(' - netkeiba.com', '').split('(')
    if not os.path.exists(CSV_DIR + "/horse/" + TEST_RACE_ID[0:4] + race_name[0]):
        os.makedirs(CSV_DIR + "/horse/" + TEST_RACE_ID[0:4] + race_name[0])
    (info, horse_id, horse_names) = get_race_info_from_text(text, "Shutuba_Table RaceTable01 ShutubaTable", TEST_RACE_ID[0:4], race_name[0])
    file_path = CSV_DIR+ RACE + TEST_RACE_ID[0:4] + race_name[0] + ".csv"
    with open(file_path, "w", newline="", encoding='shift_jis') as f:
        writer = csv.writer(f)
        writer.writerows(info)

<td><a href="/race/list/20210605/">2021/06/05</a></td>
<td><a href="/race/sum/07/20210605/">4中京1</a></td>
<td>曇</td>
<td class="txt_right">11</td>
<td class="rank_3"><a href="/race/202107040111/" title="鳴尾記念(G3)">鳴尾記念(G3)</a></td>
<td>
<a href="/race/movie/202107040111" target="_blank" title="鳴尾記念(G3)の映像"><img border="0" src="/style/netkeiba.ja/image/icon_douga.png"/></a>
</td>
<td class="txt_right">13</td>
<td class="txt_right">3</td>
<td class="txt_right">3</td>
<td class="txt_right">20.6</td>
<td class="txt_right">8</td>
<td class="rank_1 txt_right">1</td>
<td>
<a href="/jockey/01163/" title="坂井瑠星">坂井瑠星</a>
</td>
<td>56</td>
<td>芝2000</td>
<td>良</td>
<td class="bml txt_right">
<div align="center">
<a data-theme="07002" href="https://regist.netkeiba.com/?pid=premium&amp;service=p44" id="a_monthly_goods_link_01">**</a>
</div>
</td>
<td class="txt_right">2:00.7</td>
<td class="txt_right">-0.6</td>
<td class="bml txt_right">
<div style="text-align:center;">
<a data-theme="07003" href="h

<td><a href="/race/list/20210404/">2021/04/04</a></td>
<td><a href="/race/sum/09/20210404/">2阪神4</a></td>
<td>雨</td>
<td class="txt_right">11</td>
<td class="rank_1"><a href="/race/202109020411/" title="大阪杯(G1)">大阪杯(G1)</a></td>
<td>
<a href="/race/movie/202109020411" target="_blank" title="大阪杯(G1)の映像"><img border="0" src="/style/netkeiba.ja/image/icon_douga.png"/></a>
</td>
<td class="txt_right">13</td>
<td class="txt_right">6</td>
<td class="txt_right">8</td>
<td class="txt_right">12.2</td>
<td class="txt_right">4</td>
<td class="rank_1 txt_right">1</td>
<td>
<a href="/jockey/01088/" title="川田将雅">川田将雅</a>
</td>
<td>55</td>
<td>芝2000</td>
<td>重</td>
<td class="bml txt_right">
<div align="center">
<a data-theme="07002" href="https://regist.netkeiba.com/?pid=premium&amp;service=p44" id="a_monthly_goods_link_01">**</a>
</div>
</td>
<td class="txt_right">2:01.6</td>
<td class="txt_right">-0.7</td>
<td class="bml txt_right">
<div style="text-align:center;">
<a data-theme="07003" href="http

<td><a href="/race/list/20210612/">2021/06/12</a></td>
<td><a href="/race/sum/05/20210612/">3東京3</a></td>
<td>晴</td>
<td class="txt_right">11</td>
<td class=""><a href="/race/202105030311/" title="ジューンS(3勝クラス)">ジューンS(3勝クラス)</a></td>
<td>
<a href="/race/movie/202105030311" target="_blank" title="ジューンS(3勝クラス)の映像"><img border="0" src="/style/netkeiba.ja/image/icon_douga.png"/></a>
</td>
<td class="txt_right">11</td>
<td class="txt_right">6</td>
<td class="txt_right">7</td>
<td class="txt_right">10.0</td>
<td class="txt_right">5</td>
<td class="bml txt_right">10</td>
<td>
<a href="/jockey/05386/" title="戸崎圭太">戸崎圭太</a>
</td>
<td>54</td>
<td>芝2400</td>
<td>良</td>
<td class="bml txt_right">
<div align="center">
<a data-theme="07002" href="https://regist.netkeiba.com/?pid=premium&amp;service=p44" id="a_monthly_goods_link_01">**</a>
</div>
</td>
<td class="txt_right">2:26.7</td>
<td class="txt_right">2.1</td>
<td class="bml txt_right">
<div style="text-align:center;">
<a data-theme="07003" href

<td><a href="/race/list/20210605/">2021/06/05</a></td>
<td><a href="/race/sum/07/20210605/">4中京1</a></td>
<td>曇</td>
<td class="txt_right">11</td>
<td class="rank_3"><a href="/race/202107040111/" title="鳴尾記念(G3)">鳴尾記念(G3)</a></td>
<td>
<a href="/race/movie/202107040111" target="_blank" title="鳴尾記念(G3)の映像"><img border="0" src="/style/netkeiba.ja/image/icon_douga.png"/></a>
</td>
<td class="txt_right">13</td>
<td class="txt_right">4</td>
<td class="txt_right">5</td>
<td class="txt_right">162.6</td>
<td class="txt_right">12</td>
<td class="bml txt_right">5</td>
<td>
<a href="/jockey/01154/" title="松若風馬">松若風馬</a>
</td>
<td>56</td>
<td>芝2000</td>
<td>良</td>
<td class="bml txt_right">
<div align="center">
<a data-theme="07002" href="https://regist.netkeiba.com/?pid=premium&amp;service=p44" id="a_monthly_goods_link_01">**</a>
</div>
</td>
<td class="txt_right">2:01.6</td>
<td class="txt_right">0.9</td>
<td class="bml txt_right">
<div style="text-align:center;">
<a data-theme="07003" href="htt

ワイプティアーズ
<td><a href="/race/list/20210530/">2021/05/30</a></td>
<td><a href="/race/sum/05/20210530/">2東京12</a></td>
<td>晴</td>
<td class="txt_right">12</td>
<td class="rank_2"><a href="/race/202105021212/" title="目黒記念(G2)">目黒記念(G2)</a></td>
<td>
<a href="/race/movie/202105021212" target="_blank" title="目黒記念(G2)の映像"><img border="0" src="/style/netkeiba.ja/image/icon_douga.png"/></a>
</td>
<td class="txt_right">16</td>
<td class="txt_right">2</td>
<td class="txt_right">3</td>
<td class="txt_right">170.7</td>
<td class="txt_right">15</td>
<td class="rank_3 txt_right">3</td>
<td>
<a href="/jockey/01043/" title="北村宏司">北村宏司</a>
</td>
<td>53</td>
<td>芝2500</td>
<td>良</td>
<td class="bml txt_right">
<div align="center">
<a data-theme="07002" href="https://regist.netkeiba.com/?pid=premium&amp;service=p44" id="a_monthly_goods_link_01">**</a>
</div>
</td>
<td class="txt_right">2:33.4</td>
<td class="txt_right">0.6</td>
<td class="bml txt_right">
<div style="text-align:center;">
<a data-theme="070

<td><a href="/race/list/20210502/">2021/05/02</a></td>
<td><a href="/race/sum/09/20210502/">2阪神12</a></td>
<td>晴</td>
<td class="txt_right">11</td>
<td class="rank_1"><a href="/race/202109021211/" title="天皇賞(春)(G1)">天皇賞(春)(G1)</a></td>
<td>
<a href="/race/movie/202109021211" target="_blank" title="天皇賞(春)(G1)の映像"><img border="0" src="/style/netkeiba.ja/image/icon_douga.png"/></a>
</td>
<td class="txt_right">17</td>
<td class="txt_right">2</td>
<td class="txt_right">4</td>
<td class="txt_right">182.3</td>
<td class="txt_right">17</td>
<td class="bml txt_right">16</td>
<td>
<a href="/jockey/01154/" title="松若風馬">松若風馬</a>
</td>
<td>58</td>
<td>芝3200</td>
<td>良</td>
<td class="bml txt_right">
<div align="center">
<a data-theme="07002" href="https://regist.netkeiba.com/?pid=premium&amp;service=p44" id="a_monthly_goods_link_01">**</a>
</div>
</td>
<td class="txt_right">3:18.4</td>
<td class="txt_right">3.7</td>
<td class="bml txt_right">
<div style="text-align:center;">
<a data-theme="07003" h

シロニイ
<td><a href="/race/list/20210327/">2021/03/27</a></td>
<td><a href="/race/sum/J0/20210327/">メイダン</a></td>
<td> </td>
<td class="txt_right"> </td>
<td class="rank_1"><a href="/race/2021J0032708/" title="ドバイシーマクラシッ(G1)">ドバイシーマクラシッ(G1)</a></td>
<td>

 
</td>
<td class="txt_right">9</td>
<td class="txt_right"> </td>
<td class="txt_right">9</td>
<td class="txt_right">1.9</td>
<td class="rank_1 txt_right">1</td>
<td class="rank_2 txt_right">2</td>
<td>
<a href="/jockey/01102/" title="北村友一">北村友一</a>
</td>
<td>55</td>
<td>芝2410</td>
<td>良</td>
<td class="bml txt_right">


 
</td>
<td class="txt_right"> </td>
<td class="txt_right"> </td>
<td class="bml txt_right">


 
</td>
<td> </td>
<td> </td>
<td class="bml"> </td>
<td>計不</td>
<td align="center" class="bml" nowrap="">

 
</td>
<td class="bml" nowrap="">


 
</td>
<td> </td>
<td> </td>
<td><a href="/race/list/20201227/">2020/12/27</a></td>
<td><a href="/race/sum/06/20201227/">5中山8</a></td>
<td>晴</td>
<td class="txt_right">11</td>
<td cla

<td><a href="/race/list/20210606/">2021/06/06</a></td>
<td><a href="/race/sum/05/20210606/">3東京2</a></td>
<td>曇</td>
<td class="txt_right">11</td>
<td class="rank_1"><a href="/race/202105030211/" title="安田記念(G1)">安田記念(G1)</a></td>
<td>
<a href="/race/movie/202105030211" target="_blank" title="安田記念(G1)の映像"><img border="0" src="/style/netkeiba.ja/image/icon_douga.png"/></a>
</td>
<td class="txt_right">14</td>
<td class="txt_right">6</td>
<td class="txt_right">10</td>
<td class="txt_right">167.9</td>
<td class="txt_right">14</td>
<td class="bml txt_right">6</td>
<td>
<a href="/jockey/00666/" title="武豊">武豊</a>
</td>
<td>58</td>
<td>芝1600</td>
<td>良</td>
<td class="bml txt_right">
<div align="center">
<a data-theme="07002" href="https://regist.netkeiba.com/?pid=premium&amp;service=p44" id="a_monthly_goods_link_01">**</a>
</div>
</td>
<td class="txt_right">1:32.2</td>
<td class="txt_right">0.5</td>
<td class="bml txt_right">
<div style="text-align:center;">
<a data-theme="07003" href="https:

カデナ
<td><a href="/race/list/20210502/">2021/05/02</a></td>
<td><a href="/race/sum/09/20210502/">2阪神12</a></td>
<td>晴</td>
<td class="txt_right">11</td>
<td class="rank_1"><a href="/race/202109021211/" title="天皇賞(春)(G1)">天皇賞(春)(G1)</a></td>
<td>
<a href="/race/movie/202109021211" target="_blank" title="天皇賞(春)(G1)の映像"><img border="0" src="/style/netkeiba.ja/image/icon_douga.png"/></a>
</td>
<td class="txt_right">17</td>
<td class="txt_right">1</td>
<td class="txt_right">2</td>
<td class="txt_right">3.8</td>
<td class="rank_2 txt_right">2</td>
<td class="bml txt_right">4</td>
<td>
<a href="/jockey/05339/" title="ルメール">ルメール</a>
</td>
<td>58</td>
<td>芝3200</td>
<td>良</td>
<td class="bml txt_right">
<div align="center">
<a data-theme="07002" href="https://regist.netkeiba.com/?pid=premium&amp;service=p44" id="a_monthly_goods_link_01">**</a>
</div>
</td>
<td class="txt_right">3:15.2</td>
<td class="txt_right">0.5</td>
<td class="bml txt_right">
<div style="text-align:center;">
<a data-theme="0

<td><a href="/race/list/20210502/">2021/05/02</a></td>
<td><a href="/race/sum/09/20210502/">2阪神12</a></td>
<td>晴</td>
<td class="txt_right">11</td>
<td class="rank_1"><a href="/race/202109021211/" title="天皇賞(春)(G1)">天皇賞(春)(G1)</a></td>
<td>
<a href="/race/movie/202109021211" target="_blank" title="天皇賞(春)(G1)の映像"><img border="0" src="/style/netkeiba.ja/image/icon_douga.png"/></a>
</td>
<td class="txt_right">17</td>
<td class="txt_right">2</td>
<td class="txt_right">3</td>
<td class="txt_right">7.3</td>
<td class="txt_right">4</td>
<td class="rank_3 txt_right">3</td>
<td>
<a href="/jockey/05386/" title="戸崎圭太">戸崎圭太</a>
</td>
<td>56</td>
<td>芝3200</td>
<td>良</td>
<td class="bml txt_right">
<div align="center">
<a data-theme="07002" href="https://regist.netkeiba.com/?pid=premium&amp;service=p44" id="a_monthly_goods_link_01">**</a>
</div>
</td>
<td class="txt_right">3:15.2</td>
<td class="txt_right">0.5</td>
<td class="bml txt_right">
<div style="text-align:center;">
<a data-theme="07003" hr

<td><a href="/race/list/20210404/">2021/04/04</a></td>
<td><a href="/race/sum/09/20210404/">2阪神4</a></td>
<td>雨</td>
<td class="txt_right">11</td>
<td class="rank_1"><a href="/race/202109020411/" title="大阪杯(G1)">大阪杯(G1)</a></td>
<td>
<a href="/race/movie/202109020411" target="_blank" title="大阪杯(G1)の映像"><img border="0" src="/style/netkeiba.ja/image/icon_douga.png"/></a>
</td>
<td class="txt_right">13</td>
<td class="txt_right">1</td>
<td class="txt_right">1</td>
<td class="txt_right">68.8</td>
<td class="txt_right">6</td>
<td class="rank_2 txt_right">2</td>
<td>
<a href="/jockey/01032/" title="池添謙一">池添謙一</a>
</td>
<td>57</td>
<td>芝2000</td>
<td>重</td>
<td class="bml txt_right">
<div align="center">
<a data-theme="07002" href="https://regist.netkeiba.com/?pid=premium&amp;service=p44" id="a_monthly_goods_link_01">**</a>
</div>
</td>
<td class="txt_right">2:02.3</td>
<td class="txt_right">0.7</td>
<td class="bml txt_right">
<div style="text-align:center;">
<a data-theme="07003" href="https

<td><a href="/race/list/20210530/">2021/05/30</a></td>
<td><a href="/race/sum/05/20210530/">2東京12</a></td>
<td>晴</td>
<td class="txt_right">12</td>
<td class="rank_2"><a href="/race/202105021212/" title="目黒記念(G2)">目黒記念(G2)</a></td>
<td>
<a href="/race/movie/202105021212" target="_blank" title="目黒記念(G2)の映像"><img border="0" src="/style/netkeiba.ja/image/icon_douga.png"/></a>
</td>
<td class="txt_right">16</td>
<td class="txt_right">8</td>
<td class="txt_right">15</td>
<td class="txt_right">7.8</td>
<td class="rank_3 txt_right">3</td>
<td class="bml txt_right">12</td>
<td>
<a href="/jockey/01014/" title="福永祐一">福永祐一</a>
</td>
<td>54</td>
<td>芝2500</td>
<td>良</td>
<td class="bml txt_right">
<div align="center">
<a data-theme="07002" href="https://regist.netkeiba.com/?pid=premium&amp;service=p44" id="a_monthly_goods_link_01">**</a>
</div>
</td>
<td class="txt_right">2:34.2</td>
<td class="txt_right">1.4</td>
<td class="bml txt_right">
<div style="text-align:center;">
<a data-theme="07003" hr

ミスマンマミーア
<td><a href="/race/list/20210425/">2021/04/25</a></td>
<td><a href="/race/sum/H1/20210425/">シャティ</a></td>
<td> </td>
<td class="txt_right"> </td>
<td class="rank_1"><a href="/race/2021H1042508/" title="QE2世C(G1)">QE2世C(G1)</a></td>
<td>

 
</td>
<td class="txt_right">7</td>
<td class="txt_right"> </td>
<td class="txt_right">4</td>
<td class="txt_right">14.5</td>
<td class="txt_right">5</td>
<td class="bml txt_right">4</td>
<td>
<a href="/jockey/05570/" title="スコフィ">スコフィ</a>
</td>
<td>57</td>
<td>芝2000</td>
<td>良</td>
<td class="bml txt_right">


 
</td>
<td class="txt_right">2:01.67</td>
<td class="txt_right"> </td>
<td class="bml txt_right">


 
</td>
<td> </td>
<td> </td>
<td class="bml"> </td>
<td>504</td>
<td align="center" class="bml" nowrap="">

 
</td>
<td class="bml" nowrap="">


 
</td>
<td> </td>
<td> </td>
<td><a href="/race/list/20210314/">2021/03/14</a></td>
<td><a href="/race/sum/07/20210314/">2中京2</a></td>
<td>晴</td>
<td class="txt_right">11</td>
<td class="rank

## DF操作

### 定数宣言

In [16]:
TRAIN_RACE_DATA_PATH = CSV_DIR+ RACE + TRAIN_RACE_ID[0:4] + race_name[0] + ".csv"
TRAIN_HORSE_DATA_PATH = CSV_DIR + "/horse/" + TRAIN_RACE_ID[0:4] + race_name[0] + "/"
TEST_RACE_DATA_PATH = CSV_DIR+ RACE + TEST_RACE_ID[0:4] + race_name[0] + ".csv"
TEST_HORSE_DATA_PATH = CSV_DIR + "/horse/" + TEST_RACE_ID[0:4] + race_name[0] + "/"
HORSE_DATA_COLUMNS = (
    "date",
    "place",
    "weather",
    "R",
    "race_name",
    "video",
    "head_count",
    "frame_number",
    "horce_number",
    "odds",
    "popularity",
    "rank",
    "jockey",
    "weight",
    "distance",
    "ground",
    "ground_score",
    "time",
    "difference",
    "time_score",
    "lap_time",
    "pace",
    "final_3F",
    "horce_weight",
    "coment",
    "coment_2",
    "winner",
    "reward",
    "race_id"
)

In [17]:
df_horces = []
df_race = pd.read_csv(TEST_RACE_DATA_PATH, header=None, encoding='shift_jis')
for name in horse_names:
    df_horces.append(pd.read_csv(TEST_HORSE_DATA_PATH + name + ".csv", header=None, names = HORSE_DATA_COLUMNS, encoding='shift_jis'))

### DF確認

In [18]:
df_race

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,1,1,--◎◯▲△☆&#10003消,ユニコーンライオン,https://db.netkeiba.com/horse/2016110103,87,58,116,58,58,牡5,58.0,坂井,栗東矢作,,---.-,**,,
1,2,2,--◎◯▲△☆&#10003消,レイパパレ,https://db.netkeiba.com/horse/2017105335,111,58,116,46,103,牝4,56.0,川田,栗東高野,,---.-,**,,
2,3,3,--◎◯▲△☆&#10003消,メロディーレーン,https://db.netkeiba.com/horse/2016105526,116,2,1,12,75,牝5,56.0,幸,栗東森田,,---.-,**,,
3,4,4,--◎◯▲△☆&#10003消,ワイプティアーズ,https://db.netkeiba.com/horse/2015104444,116,58,1,58,58,牡6,58.0,和田竜,栗東加用,,---.-,**,,
4,4,5,--◎◯▲△☆&#10003消,アドマイヤアルバ,https://db.netkeiba.com/horse/2015104713,116,58,29,116,58,セ6,58.0,酒井,美浦柄崎,,---.-,**,,
5,5,6,--◎◯▲△☆&#10003消,シロニイ,https://db.netkeiba.com/horse/2014105947,29,92,92,87,34,牡7,58.0,松若,栗東池江,,---.-,**,,
6,5,7,--◎◯▲△☆&#10003消,クロノジェネシス,https://db.netkeiba.com/horse/2016104750,116,53,38,27,104,牝5,56.0,ルメール,栗東斉藤崇,,---.-,**,,
7,6,8,--◎◯▲△☆&#10003消,カデナ,https://db.netkeiba.com/horse/2014104449,107,65,14,79,54,牡7,58.0,松山,栗東中竹,,---.-,**,,
8,6,9,--◎◯▲△☆&#10003消,アリストテレス,https://db.netkeiba.com/horse/2017105525,116,6,64,33,51,牡4,58.0,武豊,栗東音無,,---.-,**,,
9,7,10,--◎◯▲△☆&#10003消,カレンブーケドール,https://db.netkeiba.com/horse/2016105089,116,32,87,87,92,牝5,56.0,戸崎圭,美浦国枝,,---.-,**,,


In [19]:
df_horces[0]

Unnamed: 0,date,place,weather,R,race_name,video,head_count,frame_number,horce_number,odds,...,time_score,lap_time,pace,final_3F,horce_weight,coment,coment_2,winner,reward,race_id
0,2021/06/05,4中京1,曇,11,鳴尾記念(G3),,13,3,3,20.6,...,**,1-1-1-1,38.0-34.1,34.1,528(-4),,,(ショウナンバルディ),4145.5,202107040111
1,2021/05/16,2新潟4,雨,11,弥彦S(3勝クラス),,14,5,8,12.9,...,**,2-2,36.7-34.8,34.8,532(+4),,,(コマノウインクル),1850.8,202104020411
2,2021/05/01,2阪神11,雨,10,ストークS(3勝クラス),,18,8,17,215.3,...,**,7-8,34.9-34.0,33.5,528(+4),,,グランデマーレ,465.4,202109021110
3,2020/08/15,2札幌1,晴,11,TVh賞(3勝クラス),,14,4,5,24.9,...,**,2-2-2-2,29.8-36.8,39.5,524(+12),,,ソリストサンダー,,202001020111
4,2020/08/01,1札幌3,晴,11,STV賞(3勝クラス),,14,1,1,19.3,...,**,2-2-3-3,36.5-35.4,35.8,512(-16),,,ハナズレジェンド,,202001010311
5,2020/04/12,2阪神6,雨,12,梅田S(3勝クラス),,16,5,9,6.8,...,**,2-2-2-2,36.1-37.3,43.1,528(+4),,,コンカラー,,202009020612
6,2020/03/29,2阪神2,晴,10,武庫川S(3勝クラス),,9,8,9,12.3,...,**,3-3,36.3-34.8,35.7,524(-8),,,トーセンブレス,,202009020210
7,2020/01/25,1京都8,曇,11,石清水S(3勝クラス),,16,1,1,6.4,...,**,12-13,34.5-36.3,36.0,532(+6),,,ドナウデルタ,,202008010811
8,2020/01/06,1京都2,曇,10,新春S(3勝クラス),,11,8,10,5.1,...,**,2-2,37.3-34.2,34.0,526(+8),,,クライムメジャー,737.2,202008010210
9,2019/12/22,5阪神8,雨,11,サンタクロースS(3勝クラス),,13,2,2,2.7,...,**,2-2-2-2,37.5-35.1,35.4,518(-8),,,インビジブルレイズ,,201909050811


### 欠損値削除

In [20]:
df_horces_2 = []
for i in range(len(df_horces)):
    df_horces[i].dropna(how='all', axis = 1, inplace=True)
    df_horces_2.append(df_horces[i].drop(columns=['ground_score', 'time_score']))
df_horces_2[8]

Unnamed: 0,date,place,weather,R,race_name,head_count,frame_number,horce_number,odds,popularity,...,ground,time,difference,lap_time,pace,final_3F,horce_weight,winner,reward,race_id
0,2021/05/02,2阪神12,晴,11,天皇賞(春)(G1),17,1,2,3.8,2,...,良,3:15.2,0.5,6-6-5-4,35.8-37.4,37.3,474(-6),ワールドプレミア,2300.0,202109021211
1,2021/03/21,1阪神12,曇,11,阪神大賞典(G2),13,6,9,1.3,1,...,重,3:09.5,2.2,7-6-6-6,37.3-37.4,38.8,480(+2),ディープボンド,,202109011211
2,2021/01/24,1中山8,曇,11,アメリカジョッキーC(G2),17,5,9,2.4,1,...,不,2:17.9,-0.1,6-6-6-4,38.0-37.9,37.4,478(+4),(ヴェルトライゼンデ),6319.0,202106010811
3,2020/10/25,4京都6,晴,11,菊花賞(G1),18,5,9,23.0,4,...,良,3:05.5,0.0,7-7-7-4,36.8-35.6,35.1,474(+8),コントレイル,5580.8,202008040611
4,2020/09/20,2中京4,晴,9,小牧特別(2勝クラス),9,1,1,3.2,2,...,良,2:11.9,-0.1,3-3-1-1,34.6-34.6,34.6,466(0),(フライライクバード),1523.1,202007020409
5,2020/08/02,2新潟4,晴,9,出雲崎特別(1勝クラス),15,5,8,2.3,1,...,良,1:59.7,-0.4,5-5,35.2-34.5,33.2,466(+12),(アステロイドベルト),1093.6,202004020409
6,2020/05/09,2東京5,曇,11,プリンシパルS(L),11,8,10,6.3,4,...,良,2:00.9,1.1,5-5-5,36.7-33.8,34.4,454(-8),ビターエンダー,,202005020511
7,2020/03/01,1阪神2,晴,10,すみれS(L),5,1,1,3.2,2,...,良,2:12.7,0.0,3-3-3-3,36.6-35.3,35.0,462(-6),レクセランス,804.6,202009010210
8,2020/01/26,1京都9,晴,10,若駒S(L),6,1,1,6.4,4,...,良,2:02.5,0.0,5-5-5-5,36.6-36.9,35.8,468(0),ケヴィン,804.2,202008010910
9,2019/12/28,5阪神9,晴,6,2歳1勝クラス,11,5,5,7.2,5,...,良,1:48.0,0.2,3-4,37.1-34.4,34.3,468(+4),クリスティ,290.0,201909050906


### 使用する値のみに絞っていく

In [21]:
def index_serch(l, x, default=False):
    return l.index(x) if x in l else default

In [22]:
def race_score_define(row):
    rank_score = 0
    for rank in RACE_RANK:
        if (rank.rank in row['race_name']):
            rank_score = rank.score
    return rank_score

In [23]:
def race_rank_define(row):
    race_rank = ''
    for rank in RACE_RANK:
        if (rank.rank in row['race_name']):
            race_rank = rank.rank
    return race_rank

In [24]:
def place_convert(row):
    return index_serch(PLACE, row['place'], -1)

In [25]:
def weather_convert(row):
    return index_serch(WEATHER, row['weather'], -1)

In [26]:
def ground_convert(row):
    return index_serch(GROUND_STATE, row['ground'], -1)

In [27]:
def time_to_seconds (row):
    m = list(map(float, str(row['time']).split(':')))
    return float(m[0] * 60 + m[1])

In [28]:
df_horces_3 = df_horces_2.copy()
for i in range(len(df_horces_2)):
    df_horces_3[i]['ground_type'] = df_horces_3[i].apply(lambda x: x['distance'][:1], axis = 1)
    df_horces_3[i]['distance'] = df_horces_3[i].apply(lambda x: x['distance'][1:], axis = 1)
    df_horces_3[i]['place'] = df_horces_3[i].apply(lambda x: ''.join(re.findall(r"\D", x['place'])), axis = 1)
    df_horces_3[i] = df_horces_3[i].drop(columns=[ 'popularity', 'odds', 'R', 'winner', 'reward'])
    df_horces_3[i].dropna(how='any', axis = 0, inplace=True)
    df_horces_3[i]['race_score'] = df_horces_3[i].apply(race_score_define, axis = 1)
    df_horces_3[i]['race_rank'] = df_horces_3[i].apply(race_rank_define, axis = 1)

予想spread sheet用

In [29]:
spread_columns = (
    "horce_name",
    "race_name",
    "race_rank",
    "frame_number",
    "horce_number",
    "rank",
    "distance",
    "place",
    "weather",
    "ground",
    "date",
    "final_3F",
    "time"
)
delete_columns = [
    "head_count",
    "jockey",
    "weight",
    "difference",
    "lap_time",
    "pace",
    "horce_weight",
]
df_horces_spread = df_horces_3.copy()
df_horces_spread_2 = []
for i in range(len(df_horces_2)):
    df_horces_spread[i]['race_name'] = df_horces_spread[i].apply(lambda x: x['race_name'].split('(')[0], axis = 1)
    df_horces_spread[i] = df_horces_spread[i].drop(columns = delete_columns)
    df_horces_spread[i]['horce_name'] = horse_names[i]
    df_horces_spread[i]['time'] = df_horces_spread[i].apply(lambda x: str(x['time']).replace('.', ':'), axis = 1)
    df_horces_spread[i] = df_horces_spread[i].reindex(columns = spread_columns)
    df_horces_spread_2.append(df_horces_spread[i].iloc[[0]])
df_v = pd.concat(df_horces_spread_2)
file_path = CSV_DIR+ RACE + TEST_RACE_ID[0:4] + race_name[0] + "_spread.csv"
df_v.to_csv(file_path, encoding='shift_jis')
df_v

Unnamed: 0,horce_name,race_name,race_rank,frame_number,horce_number,rank,distance,place,weather,ground,date,final_3F,time
0,ユニコーンライオン,鳴尾記念,G3,3.0,3,1,2000,中京,曇,良,2021/06/05,34.1,2:00:7
0,レイパパレ,大阪杯,G1,6.0,8,1,2000,阪神,雨,重,2021/04/04,36.8,2:01:6
0,メロディーレーン,ジューンS,3勝,6.0,7,10,2400,東京,晴,良,2021/06/12,35.6,2:26:7
0,ワイプティアーズ,鳴尾記念,G3,4.0,5,5,2000,中京,曇,良,2021/06/05,34.5,2:01:6
0,アドマイヤアルバ,目黒記念,G2,2.0,3,3,2500,東京,晴,良,2021/05/30,33.0,2:33:4
0,シロニイ,天皇賞,G1,2.0,4,16,3200,阪神,晴,良,2021/05/02,40.3,3:18:4
1,クロノジェネシス,有馬記念,G1,5.0,9,1,2500,中山,晴,良,2020/12/27,36.2,2:35:0
0,カデナ,安田記念,G1,6.0,10,6,1600,東京,曇,良,2021/06/06,33.2,1:32:2
0,アリストテレス,天皇賞,G1,1.0,2,4,3200,阪神,晴,良,2021/05/02,37.3,3:15:2
0,カレンブーケドール,天皇賞,G1,2.0,3,3,3200,阪神,晴,良,2021/05/02,37.7,3:15:2


### データを数値化していく

In [30]:
print(df_horces_3[0])

          date place weather   race_name  head_count  frame_number  \
0   2021/06/05    中京       曇        鳴尾記念          13             3   
1   2021/05/16    新潟       雨         弥彦S          14             5   
2   2021/05/01    阪神       雨       ストークS          18             8   
3   2020/08/15    札幌       晴        TVh賞          14             4   
4   2020/08/01    札幌       晴        STV賞          14             1   
5   2020/04/12    阪神       雨         梅田S          16             5   
6   2020/03/29    阪神       晴        武庫川S           9             8   
7   2020/01/25    京都       曇        石清水S          16             1   
8   2020/01/06    京都       曇         新春S          11             8   
9   2019/12/22    阪神       雨    サンタクロースS          13             2   
10  2019/10/20    京都       晴         菊花賞          18             2   
11  2019/09/22    阪神      小雨       神戸新聞杯           8             2   
12  2019/07/21    函館       曇        松前特別          11             5   
13  2019/07/06    函館

In [31]:
for i in range(len(df_horces_3)):
    df_horces_3[i]['place'] = df_horces_3[i].apply(place_convert, axis = 1)
    df_horces_3[i]['ground'] = df_horces_3[i].apply(ground_convert, axis = 1)
    df_horces_3[i]['weather'] = df_horces_3[i].apply(weather_convert, axis = 1)
    df_horces_3[i]['ground_type'] = df_horces_3[i].apply(lambda x: 0 if x['ground_type'] == '芝' else 0, axis = 1)
    df_horces_3[i]['seconds'] = df_horces_3[i].apply(time_to_seconds, axis = 1)
    df_horces_3[i] = df_horces_3[i].drop(columns=[ 'race_name', 'jockey', 'date', 'lap_time', 'pace', 'race_rank', 'time'])
df_horces_3[0]

Unnamed: 0,place,weather,head_count,frame_number,horce_number,rank,weight,distance,ground,difference,final_3F,horce_weight,race_id,ground_type,race_score,seconds
0,6,1,13,3,3,1,56,2000,0,-0.6,34.1,528(-4),202107040111,0,50,120.7
1,5,4,14,5,8,1,57,1800,1,-0.2,34.8,532(+4),202104020411,0,30,108.0
2,8,4,18,8,17,3,54,1600,1,0.2,33.5,528(+4),202109021110,0,30,93.3
3,0,0,14,4,5,11,57,1700,2,2.7,39.5,524(+12),202001020111,0,30,105.1
4,0,0,14,1,1,10,55,1800,0,0.6,35.8,512(-16),202001010311,0,30,108.4
5,8,4,16,5,9,15,57,1800,2,5.9,43.1,528(+4),202009020612,0,30,117.5
6,8,0,9,8,9,9,57,1600,1,1.1,35.7,524(-8),202009020210,0,30,96.5
7,7,1,16,1,1,10,55,1400,1,0.9,36.0,532(+6),202008010811,0,30,83.4
8,7,1,11,8,10,2,56,1600,0,0.0,34.0,526(+8),202008010210,0,30,96.8
9,8,4,13,2,2,8,56,2000,1,0.4,35.4,518(-8),201909050811,0,30,121.9
