# ライン情報と選手の脚質を取得する

## 取得元: WinTicket

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# racecard_url = "https://www.winticket.jp/keirin/iwakidaira/racecard/2021011313/1/1"
# raceresult_url = "https://www.winticket.jp/keirin/yokkaichi/raceresult/2022010448/3/8"
# req = requests.get(raceresult_url)
# soup = BeautifulSoup(req.content, 'html.parser')

### 日程を与えるとその日開催されたレースのURLをリストで返す関数

In [4]:
def get_urls(year, month, day):
    url = "https://www.winticket.jp/keirin/racecard/" + str(year) + str(month).zfill(2) + str(day).zfill(2)
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'html.parser')
    url_soup = soup.find_all('a', rel='nofollow')
    urls = [i.get('href') for i in url_soup]
    return urls

 ### urlを与えると脚質のデータフレームを返す関数

In [5]:
def create_legtype_table(url):
    race_tables = pd.read_html(url)
    legtype_table = race_tables[1][['車', '脚']].rename(columns={'車':'no', '脚':'leg'}).droplevel(1, axis=1)
    return legtype_table

### htmlを与えるとライン情報のデータフレームを返す関数

In [44]:
def create_line_table(html, number_of_people):
    line_soup = html.find_all("div", class_="sc-1y958x7-0")
    line_text = [i.text for i in line_soup][0]
    lines = line_text.split('区切り')

    line_data = [[0 for j in range(number_of_people)] for i in range(3)]
    for n, i in enumerate(lines):
        people = len(i)
        for k, j in enumerate(i):
            index = int(j) - 1
            line_data[0][index] = people
            line_data[1][index] = n
            line_data[2][index] = k
    line_data.insert(0, [i+1 for i in range(number_of_people)])
    df = pd.DataFrame({'no':line_data[0],
                       'people':line_data[1],
                       'group':line_data[2],
                       'number':line_data[3]})
    return df

### urlを与えると着順のデータフレームを返す関数

In [45]:
def create_result_table(url):
    result_tables = pd.read_html(url)
    result_table = result_tables[0][['着','車']].rename(columns={'着':'no', '車':'result'})
    return result_table

### エラー検証用

In [37]:
err_url = "https://www.winticket.jp/keirin/iwakidaira/raceresult/2021011313/1/2"
crr_url = "https://www.winticket.jp/keirin/yokkaichi/raceresult/2022010448/3/8"
result_table = pd.read_html(crr_url)
len(result_table)

2

## ここからメイン

In [8]:
import time
import traceback
from tqdm import tqdm

1. レース情報が乗っているページのURLを取得

In [20]:
race_urls = []
for year in range(2021, 2022):
    for month in range(1, 13):
        for day in tqdm(range(1, 32)):
            race_urls.extend(get_urls(year, month, day))
            time.sleep(1)

100%|██████████| 31/31 [00:42<00:00,  1.38s/it]
100%|██████████| 31/31 [00:41<00:00,  1.33s/it]
100%|██████████| 31/31 [00:42<00:00,  1.36s/it]
100%|██████████| 31/31 [00:43<00:00,  1.40s/it]
100%|██████████| 31/31 [00:41<00:00,  1.33s/it]
100%|██████████| 31/31 [00:40<00:00,  1.32s/it]
100%|██████████| 31/31 [00:41<00:00,  1.33s/it]
100%|██████████| 31/31 [00:39<00:00,  1.27s/it]
100%|██████████| 31/31 [00:42<00:00,  1.38s/it]
100%|██████████| 31/31 [00:41<00:00,  1.33s/it]
100%|██████████| 31/31 [00:41<00:00,  1.35s/it]
100%|██████████| 31/31 [00:41<00:00,  1.34s/it]


2. URLにアクセスして各種データを取得

In [51]:
# race_data_7 = pd.DataFrame()
# race_data_9 = pd.DataFrame()
race_data_7 = pd.read_pickle('../data/line_data_7.pkl')
race_data_9 = pd.read_pickle('../data/line_data_9.pkl')

for url in tqdm(race_urls):
    #レースIDの設定
    split_url = url.split('/')
    race_id = split_url[4] + "/" + split_url[5] + "/" + split_url[6]

    if (race_id in race_data_7.index) or (race_id in race_data_9.index):
        continue

    try:
        racecard_url = "https://www.winticket.jp" + url
        legtype_table = create_legtype_table(racecard_url)
        time.sleep(1)

        number_of_people = len(legtype_table)

        req = requests.get(racecard_url)
        soup = BeautifulSoup(req.content, 'html.parser')
        line_table = create_line_table(soup, number_of_people)
        time.sleep(1)

        raceresult_url = racecard_url.replace("racecard", "raceresult")
        result_table = create_result_table(raceresult_url)
        time.sleep(1)

        #dfの作成，結合
        df = pd.concat([legtype_table, line_table, result_table], axis=1)
        df = df.loc[:,~df.columns.duplicated()]

        df.index = [race_id for i in range(len(df))]

        if number_of_people == 7:
            race_data_7 = pd.concat([race_data_7, df])
        elif number_of_people == 9:
            race_data_9 = pd.concat([race_data_9, df])
        else:
            continue
    except ValueError:
        continue
    except KeyError:
        continue
    except:
        print(url)
        print(traceback.format_exc())
        break

  5%|▌         | 1334/25861 [30:10<9:14:54,  1.36s/it] 

/keirin/nagoya/racecard/2021011842/1/2
Traceback (most recent call last):
  File "/var/folders/5k/dbf5r6hd78g1f0z7svt8m25m0000gn/T/ipykernel_5524/2111927673.py", line 27, in <module>
    result_table = create_result_table(raceresult_url)
  File "/var/folders/5k/dbf5r6hd78g1f0z7svt8m25m0000gn/T/ipykernel_5524/2592039674.py", line 2, in create_result_table
    result_tables = pd.read_html(url)
  File "/opt/anaconda3/envs/keirin_prediction/lib/python3.7/site-packages/pandas/util/_decorators.py", line 299, in wrapper
    return func(*args, **kwargs)
  File "/opt/anaconda3/envs/keirin_prediction/lib/python3.7/site-packages/pandas/io/html.py", line 1100, in read_html
    displayed_only=displayed_only,
  File "/opt/anaconda3/envs/keirin_prediction/lib/python3.7/site-packages/pandas/io/html.py", line 893, in _parse
    tables = p.parse_tables()
  File "/opt/anaconda3/envs/keirin_prediction/lib/python3.7/site-packages/pandas/io/html.py", line 213, in parse_tables
    tables = self._parse_tables




3. データを保存

In [52]:
race_data_7.to_pickle('../data/line_data_7.pkl')
race_data_9.to_pickle('../data/line_data_9.pkl')