# `append_race_data_to_paired_input.ipynb`

### Author: Anthony Hein

#### Last updated: 11/14/2021

# Overview:

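Append race-level features from `races_featurized.csv` to every row of the paired-input dataset (`horses_featurized_jockey_paired_input.csv`) by joining the two tables on the race id (`rid`), then save the combined table.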

---

## Setup

In [2]:
from datetime import datetime
import git
import os
import re
from typing import List
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Use seaborn's "whitegrid" theme as the default style for any plots drawn in this notebook.
sns.set_theme(style="whitegrid")

In [3]:
# Locate the root of the enclosing git repository by walking up from the
# current working directory, so data paths below do not depend on where the
# notebook is launched from inside the repo.
BASE_DIR = git.Repo(
    path=os.getcwd(),
    search_parent_directories=True,
).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `horses_featurized_jockey_paired_input.csv`

In [4]:
# Read the paired-input table. `low_memory=False` makes pandas parse the file
# in a single pass rather than in chunks, so each column gets one consistent
# inferred dtype.
horses_paired_input = pd.read_csv(
    f"{BASE_DIR}/data/streamline/horses_featurized_jockey_paired_input.csv",
    low_memory=False,
)
horses_paired_input.head()

Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,3.0,1.022201,1.01314,1.000667,5.0,2.0,1.0,1.008458,1.001409,1.0
1,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,3.0,1.01459,1.00317,1.003625,2.0,6.0,1.0,1.00317,1.010572,1.0
2,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,1.0,1.003172,1.000334,1.0,4.0,3.0,2.0,1.004579,1.002265,1.002237
3,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,4.0,1.000634,1.0,1.004078,1.0,8.0,2.0,1.0,1.029073,1.001057
4,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,3.0,1.01459,1.00317,1.003625,2.0,6.0,1.0,1.00317,1.010572,1.0


In [5]:
# (rows, columns) of the paired-input table before race data is attached.
horses_paired_input.shape

(1143824, 167)

---

## Load `races_featurized.csv`

In [6]:
# Read the per-race feature table. Chunked parsing is disabled
# (`low_memory=False`) so that dtype inference sees each column in full.
races_featurized = pd.read_csv(
    filepath_or_buffer=f"{BASE_DIR}/data/streamline/races_featurized.csv",
    low_memory=False,
)
races_featurized.head()

Unnamed: 0,rid,course,title,winningTime,metric,ncond,class,runners,margin,1st_place_rank_in_odds,...,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4,entropy of odds
0,302858,Thurles,Liffey Maiden Hurdle (Div 1),277.2,3821.0,1,0,6,1.219263,1,...,0,1,0,0,0,0,0,0,1,1.350363
1,291347,Punchestown,Ericsson G.S.M. Grand National Trial Handicap ...,447.2,5229.0,5,0,9,1.218049,4,...,1,0,0,1,0,0,0,0,1,1.995151
2,75447,Listowel,Ballybunion E.B.F. Beginners S'chase,318.4,3620.0,5,0,8,1.27732,3,...,1,0,0,0,0,0,0,1,0,1.786421
3,358038,Punchestown,Quinns Of Baltinglass Chase (La Touche) (Cross...,533.9,6637.0,1,0,10,1.286595,1,...,0,1,0,0,0,0,0,1,0,1.611062
4,89211,Tipperary,Topaz Sprint Stakes (Listed),59.9,1005.0,4,0,5,1.217043,4,...,1,0,0,0,0,0,1,0,0,1.254374


In [7]:
# (rows, columns) of the race feature table.
races_featurized.shape

(20201, 129)

---

## Merge Dataframes on `rid`

In [9]:
# Attach the race-level columns to every horse pair by joining on the race id.
# `validate="many_to_one"` makes pandas raise a MergeError if `rid` is
# duplicated in `races_featurized`; without it, a duplicated race would
# silently multiply the paired rows.
horses_paired_input_with_race_data = horses_paired_input.merge(
    races_featurized,
    how="inner",
    on="rid",
    validate="many_to_one",
)

# An inner join drops any pair whose `rid` has no race record. Fail loudly
# rather than saving a silently truncated dataset.
assert len(horses_paired_input_with_race_data) == len(horses_paired_input), (
    "some paired rows have no matching race in races_featurized"
)

horses_paired_input_with_race_data.head()

Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4,entropy of odds
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,1,0,0,0,0,0,0,1,0,1.173658
1,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,1,0,0,0,0,0,0,1,0,1.173658
2,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,1,0,0,0,0,0,0,1,0,1.173658
3,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,1,0,0,0,0,0,0,1,0,1.173658
4,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,1,0,0,0,0,0,0,1,0,1.173658


In [11]:
# Spot check: show the race record for rid 377929 (the race in the first
# merged rows displayed above) so its values can be compared by eye with the
# race columns appended by the merge.
races_featurized.loc[races_featurized['rid'] == 377929]

Unnamed: 0,rid,course,title,winningTime,metric,ncond,class,runners,margin,1st_place_rank_in_odds,...,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4,entropy of odds
6,377929,Leopardstown,"Derrinstown Stud 1,000 Guineas Trial (Listed)",106.4,1609.0,4,0,5,1.204927,1,...,1,0,0,0,0,0,0,1,0,1.173658


In [10]:
# (rows, columns) after the merge. The row count is expected to equal that of
# `horses_paired_input` when every pair matches exactly one race.
horses_paired_input_with_race_data.shape

(1143824, 295)

---

## Save Dataframe

In [12]:
# Persist the merged table next to the other streamline datasets.
# `index=False` leaves the row index out of the file, so no extra unnamed
# column appears when the CSV is read back.
horses_paired_input_with_race_data.to_csv(
    path_or_buf=f"{BASE_DIR}/data/streamline/horses_featurized_jockey_paired_input_with_race_data.csv",
    index=False,
)

---