検証用・雑記用のnotebook

# モジュールのインポート

In [9]:
import pandas as pd
import glob
import os
from tqdm.notebook import tqdm

from modules.constants import LocalPaths
from modules import preparing
from modules import preprocessing
from modules import training
from modules import simulation
from modules import policies
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# horse_idマスタ検証

In [10]:
# Bare %autoreload: reload all imported modules once, right now.
%autoreload

In [6]:
# Load the horse_id master CSV and display it.
pd.read_csv('data/master/horse_id.csv')

Unnamed: 0,horse_id,encoded_id
0,2018106541,0
1,2018103708,1
2,2018103805,2
3,2018101171,3
4,2018103832,4
...,...,...
11516,2019101360,11516
11517,2019101500,11517
11518,2019104312,11518
11519,2018102219,11519


In [4]:
# Largest value in the master's encoded_id column.
max(pd.read_csv('data/master/horse_id.csv')['encoded_id'])

11520

In [4]:
# Empty frame that carries only the column layout of the horse_id master
master_columns = ['horse_id', 'encoded_id']
sample = pd.DataFrame(columns=master_columns)
sample

Unnamed: 0,horse_id,encoded_id


In [6]:
# Write a header-only CSV (columns horse_id, updated_at) into the master directory.
# to_csv opens the target in write mode, so an existing file at this path is replaced.
filename = os.path.join(LocalPaths.MASTER_DIR, 'horse_results_updated_at.csv')
empty_master = pd.DataFrame(columns=['horse_id', 'updated_at'])
empty_master.to_csv(filename, index=None)

In [7]:
# Read the file back to check what was written.
pd.read_csv(filename)

Unnamed: 0,horse_id,updated_at


In [10]:
# glob.glob() returns matches in arbitrary, filesystem-dependent order, so sort before
# slicing; otherwise the five sampled files (and files[0]) can differ between runs.
files = sorted(glob.glob(os.path.join(LocalPaths.HTML_HORSE_DIR, '*.bin')))[:5]

In [15]:
import datetime

# Last-modified time of the first sampled file, converted from a POSIX timestamp
first_file_mtime = os.path.getmtime(files[0])
print(datetime.datetime.fromtimestamp(first_file_mtime))

2022-06-15 20:02:46.134937


In [16]:
# Current local time as a datetime object.
datetime.datetime.now()

datetime.datetime(2022, 6, 23, 22, 59, 21, 759177)

In [17]:
# Current local time rendered as 'YYYY-MM-DD HH:MM:SS' text.
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

'2022-06-23 23:00:44'

In [19]:
# Load the CSV at `filename` into `master` and display it.
master = pd.read_csv(filename)
master

Unnamed: 0,horse_id,updated_at


In [28]:
# Two-row stand-in table; the scalar '2021' is broadcast to every row
sample = pd.DataFrame(
    {
        'horse_id': ['7878', '8989'],
        'updated_at': '2021',
    }
)
sample

Unnamed: 0,horse_id,updated_at
0,7878,2021
1,8989,2021


In [32]:
# Second stand-in table holding only a horse_id column
sample2 = pd.DataFrame(
    {'horse_id': ['8989', '3721']}
)
sample2

Unnamed: 0,horse_id
0,8989
1,3721


In [35]:
# Outer-join on horse_id (the only column the two frames share) so that ids present
# in just one of the frames are kept.
merged = pd.merge(sample, sample2, how='outer', on='horse_id')
merged

Unnamed: 0,horse_id,updated_at
0,7878,2021.0
1,8989,2021.0
2,3721,


In [39]:
# Keep only the rows whose horse_id is one of the listed ids
is_selected = merged['horse_id'].isin(['7878', '3721'])
merged.loc[is_selected, :]

Unnamed: 0,horse_id,updated_at
0,7878,2021.0
2,3721,


In [1]:
# Directory part of the given pickle path.
os.path.dirname('data/raw/horse_results/horse_results_2021.pickle')

'data/raw/horse_results'

# ファイルセパレータを修正

In [8]:
import re

# Pattern that captures the numeric horse_id from an HTML cache path.
# `\W` stands in for the path separator, so both '/' and '\' are accepted.
# The dot before "bin" is escaped: an unescaped '.' would match any character
# (e.g. "2012104392xbin"), not just the literal ".bin" extension.
HORSE_ID_PATTERN = r'horse\W(\d+)\.bin'

path = 'data/html/horse/2012104392.bin'
re.findall(HORSE_ID_PATTERN, path)

['2012104392']

# updated_atの確認

In [3]:
# Distinct horse_id values found in the results_2021 pickle
results = pd.read_pickle('data/raw/results/results_2021.pickle')
horse_id_list = results['horse_id'].unique()

In [6]:
# Run the scraper on a five-id slice of the list only
target_horse_ids = horse_id_list[30:35]
html_files_horse = preparing.scrape_html_horse_with_master(target_horse_ids, skip=True)

scraping


  0%|          | 0/5 [00:00<?, ?it/s]

horse_id 2014102036 skipped
updating master


# rawテーブルの更新

In [8]:
# Row counts of the stored raw results vs. the table in results_2020.pickle
results_new = pd.read_pickle('data/raw/results_2020.pickle')
results = pd.read_pickle(LocalPaths.RAW_RESULTS_PATH)
print(f'old: {len(results)}, new: {len(results_new)}')

old: 143899, new: 48225


In [9]:
# Hand the new rows and the stored table's path to preparing.update_rawdata, then
# re-read the pickle and show its row count.
# NOTE(review): update_rawdata lives in modules.preparing (not visible here); presumably
# it merges new_df into the file at filepath — confirm before re-running this cell.
preparing.update_rawdata(filepath=LocalPaths.RAW_RESULTS_PATH, new_df=results_new)
len(pd.read_pickle(LocalPaths.RAW_RESULTS_PATH))

192124

In [14]:
# Row counts of the stored raw race_info vs. the table in race_info_2020.pickle
race_info_new = pd.read_pickle('data/raw/race_info_2020.pickle')
race_info = pd.read_pickle(LocalPaths.RAW_RACE_INFO_PATH)
print(f'old: {len(race_info)}, new: {len(race_info_new)}')

old: 10362, new: 3456


In [15]:
# Hand the new rows and the stored table's path to preparing.update_rawdata, then
# re-read the pickle and show its row count.
# NOTE(review): update_rawdata lives in modules.preparing (not visible here); presumably
# it merges new_df into the file at filepath — confirm before re-running this cell.
preparing.update_rawdata(filepath=LocalPaths.RAW_RACE_INFO_PATH, new_df=race_info_new)
len(pd.read_pickle(LocalPaths.RAW_RACE_INFO_PATH))

13818

In [20]:
# Row counts of the stored raw return tables vs. the table in return_tables_2020.pickle
return_tables_new = pd.read_pickle('data/raw/return_tables_2020.pickle')
return_tables = pd.read_pickle(LocalPaths.RAW_RETURN_TABLES_PATH)
print(f'old: {len(return_tables)}, new: {len(return_tables_new)}')

old: 82384, new: 27505


In [21]:
# Hand the new rows and the stored table's path to preparing.update_rawdata, then
# re-read the pickle and show its row count.
# NOTE(review): update_rawdata lives in modules.preparing (not visible here); presumably
# it merges new_df into the file at filepath — confirm before re-running this cell.
preparing.update_rawdata(filepath=LocalPaths.RAW_RETURN_TABLES_PATH, new_df=return_tables_new)
len(pd.read_pickle(LocalPaths.RAW_RETURN_TABLES_PATH))

109889

In [26]:
# Row counts of the stored raw pedigree table vs. the table in peds_2019.pickle
peds_new = pd.read_pickle('data/raw/peds_2019.pickle')
peds = pd.read_pickle(LocalPaths.RAW_PEDS_PATH)
print(f'old: {len(peds)}, new: {len(peds_new)}')

old: 20367, new: 11557


In [27]:
# Hand the new rows and the stored table's path to preparing.update_rawdata, then
# re-read the pickle and show its row count.
# NOTE(review): update_rawdata lives in modules.preparing (not visible here); presumably
# it merges new_df into the file at filepath — confirm before re-running this cell.
preparing.update_rawdata(filepath=LocalPaths.RAW_PEDS_PATH, new_df=peds_new)
len(pd.read_pickle(LocalPaths.RAW_PEDS_PATH))

21952

In [None]:
# Preprocessing: build one processor object per raw table.
# NOTE(review): files_results, files_race_info, files_return, files_horse_results and
# files_peds are not assigned anywhere in the visible notebook, and this cell has no
# execution count — they must be defined before this cell can run on a fresh kernel.
results_processor = preprocessing.ResultsProcessor(files_results)
race_info_processor = preprocessing.RaceInfoProcessor(files_race_info)
return_processor = preprocessing.ReturnProcessor(files_return)
horse_results_processor = preprocessing.HorseResultsProcessor(files_horse_results)
peds_processor = preprocessing.PedsProcessor(files_peds)

In [11]:
### Aggregate each horse's past results and merge all of the preprocessed tables ###
# NOTE(review): HorseResultsCols is not imported in the visible notebook (only LocalPaths
# is imported from modules.constants), so this cell raises NameError on a fresh kernel.
# Presumably it also lives in modules.constants — confirm and add the import.

# Columns treated as a horse's "performance" when target encoding
TARGET_COLS = [HorseResultsCols.RANK, HorseResultsCols.PRIZE, HorseResultsCols.RANK_DIFF, 
               'first_corner', 'final_corner',
               'first_to_rank', 'first_to_final','final_to_rank']
# Columns that, together with the horse_id column, are subject to target encoding
GROUP_COLS = ['course_len', 'race_type', HorseResultsCols.PLACE]

data_merger = preprocessing.DataMerger(
        results_processor,
        race_info_processor,
        horse_results_processor,
        peds_processor,
        target_cols=TARGET_COLS,
        group_cols=GROUP_COLS
)
data_merger.merge() # run the merge

separating horse results by date


  0%|          | 0/106 [00:00<?, ?it/s]

merging horse_results


  0%|          | 0/106 [00:00<?, ?it/s]

In [12]:
# Rename the existing master (e.g. to horse_id_2.csv) beforehand, then check the
# behaviour of a first-time run.
feature_enginnering = (
    preprocessing.FeatureEngineering(data_merger)
    .add_interval()
    .dumminize_ground_state()
    .dumminize_race_type()
    .dumminize_sex()
    .dumminize_weather()
    .encode_horse_id()
    .encode_jockey_id()
    .dumminize_kaisai()
)

正しくマスタができているのでOK。ファイルを元に戻す。