# `resolve_finish_times_horses.ipynb`

### Author: Anthony Hein

#### Last updated: 10/19/2021

# Overview:

Prior to featurizing the horse dataset, we must add finishing times for all horses in all races. The finishing time will not itself be a feature of the model; instead, it will be used to engineer other features such as average race time, best race time under similar conditions, and others.

---

## Setup

In [1]:
from datetime import datetime
import git
import os
import re
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Resolve the repository root by searching upward from the current working
# directory, so the data/utils paths built from BASE_DIR do not depend on
# where the notebook was launched.
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

In [3]:
import sys
# Make the project's helper modules under utils/ importable from this notebook.
sys.path.append(f'{BASE_DIR}/utils/')

# Per-race lookup tables, indexed by race id (`rid`) further down:
# the winner's time and the race distance for each race.
# NOTE(review): presumably plain dicts; units look like seconds / meters — confirm.
from rid_to_winning_time import RID_TO_WINNING_TIME
from rid_to_distance import RID_TO_DISTANCE

---

## Load `horses_augment_dist.csv`

In [4]:
# low_memory=False makes pandas read the file in one pass instead of chunks,
# which avoids mixed-dtype inference on columns such as `positionL` that hold
# both numbers and strings (e.g. 8, 1.5, 'dist').
horses_augment_dist = pd.read_csv(f"{BASE_DIR}/data/csv/horses_augment_dist.csv", low_memory=False) 
horses_augment_dist.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,positionL,dist,outHandicap,RPR,TR,OR,father,mother,gfather,weight
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,,0.0,0.0,102.0,51.591987,79.654604,King's Ride,Browne's Return,Deep Run,73
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,8,8.0,0.0,94.0,51.591987,79.654604,Long Pond,Courtlough Lady,Giolla Mear,73
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,1.5,9.5,0.0,92.0,51.591987,79.654604,Nordico,Over The Seas,North Summit,71
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,dist,39.5,0.0,71.87665,51.591987,79.654604,Roselier,Miss Reindeer,Reindeer,73
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,dist,69.5,0.0,71.87665,51.591987,79.654604,Noalto,Elena's Beauty,Tarqogan,66


In [5]:
# (rows, columns) of the loaded frame: one row per horse per race.
horses_augment_dist.shape

(194573, 18)

In [6]:
# Work on a copy so the loaded frame stays untouched while the `time`
# column is added and the helper columns are dropped below.
horses_augment_times = horses_augment_dist.copy()
horses_augment_times.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,positionL,dist,outHandicap,RPR,TR,OR,father,mother,gfather,weight
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,,0.0,0.0,102.0,51.591987,79.654604,King's Ride,Browne's Return,Deep Run,73
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,8,8.0,0.0,94.0,51.591987,79.654604,Long Pond,Courtlough Lady,Giolla Mear,73
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,1.5,9.5,0.0,92.0,51.591987,79.654604,Nordico,Over The Seas,North Summit,71
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,dist,39.5,0.0,71.87665,51.591987,79.654604,Roselier,Miss Reindeer,Reindeer,73
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,dist,69.5,0.0,71.87665,51.591987,79.654604,Noalto,Elena's Beauty,Tarqogan,66


---

## Helper Functions

The following function estimates how much time one "length" is worth in a given horse race, inspired by [https://edge.twinspires.com/racing/the-real-value-of-a-length/](https://edge.twinspires.com/racing/the-real-value-of-a-length/).

$$\text{time of length in seconds} = 1\ /\ [\ \text{distance}\ /\ \text{winning time}\ /\ \text{average horse length}\ ]$$

Note that the distance and the average horse length must have the same units to cancel out.

In [7]:
# Length of one horse, used to convert a margin expressed in "lengths" into a
# distance. Must be in the same unit as the race distances it is divided into.
# NOTE(review): the source of the 2.55 figure is not recorded here — TODO cite.
AVERAGE_HORSE_LENGTH = 2.55 # meters

In [8]:
def get_time_of_length(distance: float, winning_time: float) -> float:
    """Return how much time one horse "length" is worth in a race.

    The winner's average speed (distance / winning_time) is converted into
    horse lengths per unit of time using AVERAGE_HORSE_LENGTH; the reciprocal
    of that rate is the time taken to cover a single length.

    Args:
        distance: Race distance, in the same unit as AVERAGE_HORSE_LENGTH.
        winning_time: Winner's time; the result is in the same time unit.

    Returns:
        Time needed to travel one length at the winner's average pace.
    """
    winner_speed = distance / winning_time
    lengths_per_time_unit = winner_speed / AVERAGE_HORSE_LENGTH
    return 1 / lengths_per_time_unit

In [9]:
# Sanity check: 1609.34 m (1 mile) won in 95 s.
get_time_of_length(1609.34, 95) # 1 mile w/ fast horses

0.15052754545341568

In [10]:
# Sanity check: 2 miles won in 270 s; a slower pace makes each length worth more time.
get_time_of_length(2*1609.34, 270) # 2 mile w/ moderately fast horses

0.21390756459169596

In [11]:
def get_horse_finish_time(row) -> float:
    """Estimate a horse's finishing time from its margin behind the winner.

    Args:
        row: Mapping-like record (e.g. one DataFrame row) with an 'rid' key,
            used to look the race up in RID_TO_DISTANCE and
            RID_TO_WINNING_TIME, and a 'dist' key giving the horse's margin
            in lengths.

    Returns:
        The race's winning time plus the 'dist' margin converted to time
        with get_time_of_length.
    """
    race_id = row['rid']
    winner_time = RID_TO_WINNING_TIME[race_id]
    # Time value of a single length at this race's winning pace.
    per_length = get_time_of_length(RID_TO_DISTANCE[race_id], winner_time)
    return winner_time + row['dist'] * per_length

In [12]:
# First row: the race winner (position 1, dist 0.0).
horses_augment_times.iloc[0]

rid                      302858
horseName          Kings Return
age                         6.0
saddle                      4.0
decimalPrice                0.6
trainerName         W P Mullins
jockeyName            D J Casey
position                      1
positionL                   NaN
dist                        0.0
outHandicap                 0.0
RPR                       102.0
TR                    51.591987
OR                    79.654604
father              King's Ride
mother          Browne's Return
gfather                Deep Run
weight                       73
Name: 0, dtype: object

In [13]:
# Second row: the runner-up of the same race, 8 lengths behind.
horses_augment_times.iloc[1]

rid                      302858
horseName        Majestic Red I
age                         6.0
saddle                      5.0
decimalPrice           0.047619
trainerName        John Hackett
jockeyName        Conor O'Dwyer
position                      2
positionL                     8
dist                        8.0
outHandicap                 0.0
RPR                        94.0
TR                    51.591987
OR                    79.654604
father                Long Pond
mother          Courtlough Lady
gfather             Giolla Mear
weight                       73
Name: 1, dtype: object

In [14]:
# Spot check: the winner (dist 0) should get exactly the winning time, and the
# runner-up a slightly larger value.
get_horse_finish_time(horses_augment_times.iloc[0]), get_horse_finish_time(horses_augment_times.iloc[1]) 

(277.2, 278.6799476576812)

---

## Calculate Finish Times for Horses

In [17]:
# Estimate a finishing time for every horse, one row at a time.
# NOTE(review): a row-wise apply is slow on a frame this size; mapping `rid`
# through the lookup tables and doing the arithmetic on whole columns would be
# faster, but would turn a KeyError on an unknown `rid` into a silent NaN.
horses_augment_times['time'] = horses_augment_times.apply(get_horse_finish_time, axis=1)
horses_augment_times

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,positionL,dist,outHandicap,RPR,TR,OR,father,mother,gfather,weight,time
0,302858,Kings Return,6.0,4.0,0.600000,W P Mullins,D J Casey,1,,0.00,0.0,102.00000,51.591987,79.654604,King's Ride,Browne's Return,Deep Run,73,277.200000
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,8,8.00,0.0,94.00000,51.591987,79.654604,Long Pond,Courtlough Lady,Giolla Mear,73,278.679948
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,1.5,9.50,0.0,92.00000,51.591987,79.654604,Nordico,Over The Seas,North Summit,71,278.957438
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,dist,39.50,0.0,71.87665,51.591987,79.654604,Roselier,Miss Reindeer,Reindeer,73,284.507242
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,dist,69.50,0.0,71.87665,51.591987,79.654604,Noalto,Elena's Beauty,Tarqogan,66,290.057045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194568,227139,Old Tim,6.0,5.0,0.142857,Donal Hassett,Mr B Hassett,8,15,65.00,0.0,42.00000,51.591987,79.654604,Poet's Dream I,Settled,Blue Cashmere,73,263.612842
194569,227139,Our Ling,6.0,12.0,0.111111,C P Donoghue,Philip Dempsey,9,13,78.00,0.0,24.00000,51.591987,79.654604,Vaour,May-Ling,Mississippi,73,266.195410
194570,227139,Ballinarrid,6.0,2.0,0.111111,Seamus P Murphy,Mr P Fenton,10,0.75,78.75,0.0,28.00000,51.591987,79.654604,John French,Cuckaloo,Master Buck,76,266.344405
194571,227139,Fountain Pen,9.0,3.0,0.047619,William J Fitzpatrick,Mr P Fahey,11,dist,108.75,0.0,71.87665,51.591987,79.654604,Royal Fountain,Monday's Pet,Menelek,74,272.304178


---

## Drop Unnecessary Columns

Finally, we can drop `positionL` and `dist` in favor of this new column `time`.

In [18]:
# `positionL` and `dist` are only needed to derive `time`, so refuse to drop
# them until that column exists (guards against running this cell out of order).
assert 'time' in horses_augment_times.columns, "compute `time` before dropping `dist`"

# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
horses_augment_times = horses_augment_times.drop(columns=['positionL', 'dist'], errors='ignore')
horses_augment_times

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,outHandicap,RPR,TR,OR,father,mother,gfather,weight,time
0,302858,Kings Return,6.0,4.0,0.600000,W P Mullins,D J Casey,1,0.0,102.00000,51.591987,79.654604,King's Ride,Browne's Return,Deep Run,73,277.200000
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,0.0,94.00000,51.591987,79.654604,Long Pond,Courtlough Lady,Giolla Mear,73,278.679948
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,0.0,92.00000,51.591987,79.654604,Nordico,Over The Seas,North Summit,71,278.957438
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,0.0,71.87665,51.591987,79.654604,Roselier,Miss Reindeer,Reindeer,73,284.507242
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,0.0,71.87665,51.591987,79.654604,Noalto,Elena's Beauty,Tarqogan,66,290.057045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194568,227139,Old Tim,6.0,5.0,0.142857,Donal Hassett,Mr B Hassett,8,0.0,42.00000,51.591987,79.654604,Poet's Dream I,Settled,Blue Cashmere,73,263.612842
194569,227139,Our Ling,6.0,12.0,0.111111,C P Donoghue,Philip Dempsey,9,0.0,24.00000,51.591987,79.654604,Vaour,May-Ling,Mississippi,73,266.195410
194570,227139,Ballinarrid,6.0,2.0,0.111111,Seamus P Murphy,Mr P Fenton,10,0.0,28.00000,51.591987,79.654604,John French,Cuckaloo,Master Buck,76,266.344405
194571,227139,Fountain Pen,9.0,3.0,0.047619,William J Fitzpatrick,Mr P Fahey,11,0.0,71.87665,51.591987,79.654604,Royal Fountain,Monday's Pet,Menelek,74,272.304178


## Save Dataframes

In [19]:
# Persist the result; index=False keeps the row index out of the file so it
# does not come back as an extra unnamed column when the CSV is reloaded.
horses_augment_times.to_csv(f"{BASE_DIR}/data/csv/horses_augment_times.csv", index=False)

---