## データ収集

In [2]:
import re
import time
import traceback
from pathlib import Path

import pandas as pd
import requests
import tqdm.notebook as tqdm
from bs4 import BeautifulSoup

### 日付ごとのURLを取得

In [3]:
def createURL(month, day, year=2021):
    """Build the race-card listing URL for one calendar date.

    Args:
        month: Month number (int or numeric string); left-padded with
            zeros to two characters.
        day: Day of the month (int or numeric string); left-padded with
            zeros to two characters.
        year: Year placed in the URL path. Defaults to 2021, the value
            that used to be hard-coded, so existing two-argument calls
            produce exactly the same URL as before.

    Returns:
        str: A URL of the form
        ``https://keirin.kdreams.jp/racecard/<year>/<MM>/<DD>/``.
    """
    return 'https://keirin.kdreams.jp/racecard/{}/{}/{}/'.format(
        year, str(month).zfill(2), str(day).zfill(2)
    )

In [11]:
# One listing-page URL per (month, day) pair. Both ranges are half-open, so
# range(4, 5) x range(1, 2) currently selects a single date: month 4, day 1.
seedURL = [
    createURL(month, day)
    for month in range(4, 5)
    for day in range(1, 2)
]

### 各レースのURLを取得

In [12]:
def get_race_id(sourceURLs, timeout=30):
    """Collect race-detail links from each listing page.

    Every page in ``sourceURLs`` is downloaded and scanned for
    ``<a class="JS_POST_THROW">`` anchors whose ``href`` contains
    ``'racedetail'``.

    Args:
        sourceURLs: Iterable of page URLs to fetch.
        timeout: Seconds to wait for each HTTP response before giving up,
            so one stalled connection cannot hang the whole crawl.

    Returns:
        dict: Maps a race id to the link it was taken from. The race id is
        the ``href`` with every non-digit character removed. If fetching or
        parsing a page fails, the error is reported, the crawl stops, and
        the links gathered so far are returned.
    """
    race_urls = {}
    for sourceURL in tqdm.tqdm(sourceURLs):
        try:
            req = requests.get(sourceURL, timeout=timeout)
            soup = BeautifulSoup(req.content, 'html.parser')
            # Pause between requests to avoid hammering the server.
            time.sleep(1)
            for anchor in soup.find_all('a', class_='JS_POST_THROW'):
                url = anchor.get('href')
                # An anchor without an href yields None; skip it rather than
                # letting `'racedetail' in None` abort the whole crawl.
                if not url or 'racedetail' not in url:
                    continue
                race_id = re.sub(r'\D', '', url)
                race_urls[race_id] = url
        except (Exception, KeyboardInterrupt):
            # Still best-effort (stop and return what we have, including on a
            # manual interrupt), but no longer silent about the cause.
            print('get_race_id: stopped at {}'.format(sourceURL))
            traceback.print_exc()
            break
    return race_urls

In [13]:
# Crawl every seed page and build the {race id: race-detail URL} mapping.
race_urls = get_race_id(seedURL)

  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
# Sanity check: number of distinct race ids collected from the seed pages.
print(len(race_urls))

54


### レース情報の取得

In [15]:
# Column labels assigned by scrape_race_result() to the tables it reads from
# each race page: main_colum goes onto table index 4 (with its last row
# dropped) and result_colum onto the second-to-last table.
# NOTE(review): "colum" is presumably a typo for "column"; the names are kept
# because scrape_race_result() refers to them as spelled here.
main_colum = ['予想', '好気合', '総評', '枠番', '車番', '選手名府県/年齢/期別', '級班', '脚質', 'ギヤ倍数', '競走得点', '1着', '2着', '3着', '着外']
result_colum = ['予想', '着順', '車番', '選手名', '着差', '上り', '決まり手', 'S/B', '勝敗因']

In [16]:
def scrape_race_result(race_urls, pre_race_results=None):
    """Scrape one merged entry/result table per race.

    For each race page, the table at index 4 (minus its last row) is
    labelled with ``main_colum`` and the second-to-last table with
    ``result_colum``. The ``着順`` and ``車番`` columns of the latter are
    left-joined onto the former by ``車番``, after casting both to ``str``.

    Args:
        race_urls: Mapping of race id -> URL of that race's page.
        pre_race_results: Optional dict of previously scraped frames keyed
            by race id. Races already present are skipped, so an
            interrupted run can be resumed. When given, this dict is
            updated in place and returned. When omitted, a fresh dict is
            created on every call (a shared ``{}`` default would leak
            results from one call into the next).

    Returns:
        dict: race id -> merged ``DataFrame``. Pages that raise
        ``IndexError``, ``KeyError`` or ``ValueError`` are reported and
        skipped. Any other exception, or a manual interrupt, prints a
        traceback and stops the loop, returning what was gathered so far.
    """
    race_results = {} if pre_race_results is None else pre_race_results
    for race_id, url in tqdm.tqdm(race_urls.items()):
        if race_id in race_results:
            continue
        try:
            main = pd.read_html(url)
            df = main[4][:-1]
            df.columns = main_colum
            result_table = main[-2]
            result_table.columns = result_colum
            df_result = result_table.loc[:, ['着順', '車番']]
            # Cast both sides to str so the join key '車番' has one dtype.
            df = df.astype(str)
            df_result = df_result.astype(str)
            df = pd.merge(df_result, df, on='車番', how='left')
            race_results[race_id] = df
            # Pause between requests to avoid hammering the server.
            time.sleep(1)
        except IndexError:
            print('IndexError: {}'.format(url))
            continue
        except KeyError:
            print('keyerror: {}'.format(url))
            continue
        except ValueError:
            print('ValueError: {}'.format(url))
            continue
        except (Exception, KeyboardInterrupt):
            # Unexpected failure or manual interrupt: report it and return
            # the partial results so the run can be resumed later.
            traceback.print_exc()
            break
    return race_results

In [18]:
# Scrape every collected race page into a {race id: DataFrame} dict.
results = scrape_race_result(race_urls)

  0%|          | 0/54 [00:00<?, ?it/s]

In [21]:
# Spot-check the scraped frame for a single race, looked up by its race id.
results["2720210401010001"]

Unnamed: 0,着順,車番,予想,好気合,総評,枠番,選手名府県/年齢/期別,級班,脚質,ギヤ倍数,競走得点,1着,2着,3着,着外
0,1,3,◎,,1,3,鰐淵 圭佑 群　馬/37/90,A3,追,3.92,78.33,9,8,6,3
1,2,2,×,,3,2,岡崎 徹 神奈川/49/70,A3,追,3.86,72.77,4,5,4,14
2,3,1,○,,3,1,中山 健 新　潟/37/89,A3,両,3.92,73.66,16,15,7,30
3,4,5,注,,3,5,角口 聖也 千　葉/32/94,A3,逃,3.92,70.5,3,6,10,29
4,5,4,▲,,5,4,志村 正洋 神奈川/46/77,A3,追,3.85,67.45,1,5,10,26
5,6,6,△,,5,6,茨木 基成 東　京/58/60,A3,追,3.92,65.76,1,6,9,32


### キーを行名にしたデータフレームにし，結合

In [89]:
# Replace each per-race frame's index with that race's id repeated on every
# row, so rows remain attributable to their race once frames are stacked.
for key, frame in results.items():
    frame.index = [key] * len(frame)

In [90]:
# Stack all per-race frames, in dict order, into one long frame.
per_race_frames = list(results.values())
race_results = pd.concat(per_race_frames, sort=False)

In [91]:
# Display the combined frame (rows are indexed by race id).
# NOTE(review): the saved output includes race id 8320210728020012, while the
# seed list above covers only month 4, day 1 — the output was presumably
# produced by an earlier run over a wider date range; re-run to confirm.
race_results

Unnamed: 0,着順,車番,予想,好気合,総評,枠番,選手名府県/年齢/期別,級班,脚質,ギヤ倍数,競走得点,1着,2着,3着,着外
2720210401010001,1,3,◎,,1,3,鰐淵 圭佑 群　馬/37/90,A3,追,3.92,78.33,9,8,6,3
2720210401010001,2,2,×,,3,2,岡崎 徹 神奈川/49/70,A3,追,3.86,72.77,4,5,4,14
2720210401010001,3,1,○,,3,1,中山 健 新　潟/37/89,A3,両,3.92,73.66,16,15,7,30
2720210401010001,4,5,注,,3,5,角口 聖也 千　葉/32/94,A3,逃,3.92,70.50,3,6,10,29
2720210401010001,5,4,▲,,5,4,志村 正洋 神奈川/46/77,A3,追,3.85,67.45,1,5,10,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8320210728020012,3,6,,,5,6,中村 昌弘 広　島/44/81,S2,追,3.92,98.90,3,12,12,36
8320210728020012,4,2,×,,2,2,村上 博幸 京　都/42/86,S1,追,3.92,110.62,14,3,5,33
8320210728020012,5,4,注,,4,4,松岡 篤哉 岐　阜/39/97,S2,逃,3.92,101.88,15,10,7,39
8320210728020012,6,3,▲,,4,3,吉村 和之 岐　阜/44/80,S2,追,3.93,100.15,3,8,8,33


### pickleファイルに保存

In [93]:
# Persist the combined frame. Create the target directory first so the save
# does not fail (and discard a long scrape) when "data/" does not exist yet.
output_path = Path("data/race_data2.pkl")
output_path.parent.mkdir(parents=True, exist_ok=True)
race_results.to_pickle(output_path)