# `change_dist_horse.ipynb`

### Author: Anthony Hein

#### Last updated: 10/19/2021

# Overview:

The current `dist` column within our horses dataset calculates the distance to a placing horse, not the winning horse. That is, it is the distance to the horse in second place. Since our goal is to convert distance information into raw race times, this is not helpful and we instead want to know the distance to the horse in first (the only one for which we know the racing time). To do so, we will recalculate the distance to first place and change the current `dist` variable to reflect this.

---

## Setup

In [14]:
from datetime import datetime
import git
import os
import math
import re
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

In [10]:
import sys
sys.path.append(f'{BASE_DIR}/utils/')

from length_abbrv_to_dist import LENGTH_ABBRV_TO_DIST

---

## Load `horses_augment.csv`

In [3]:
horses_augment = pd.read_csv(f"{BASE_DIR}/data/csv/horses_augment.csv", low_memory=False) 
horses_augment.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,positionL,dist,outHandicap,RPR,TR,OR,father,mother,gfather,weight
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,,0.0,0.0,102.0,51.591987,79.654604,King's Ride,Browne's Return,Deep Run,73
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,8,0.0,0.0,94.0,51.591987,79.654604,Long Pond,Courtlough Lady,Giolla Mear,73
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,1.5,9.5,0.0,92.0,51.591987,79.654604,Nordico,Over The Seas,North Summit,71
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,dist,39.5,0.0,71.87665,51.591987,79.654604,Roselier,Miss Reindeer,Reindeer,73
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,dist,69.5,0.0,71.87665,51.591987,79.654604,Noalto,Elena's Beauty,Tarqogan,66


In [4]:
horses_augment.shape

(194898, 18)

In [6]:
horses_augment_dist = horses_augment.copy()
horses_augment_dist.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,positionL,dist,outHandicap,RPR,TR,OR,father,mother,gfather,weight
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,,0.0,0.0,102.0,51.591987,79.654604,King's Ride,Browne's Return,Deep Run,73
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,8,0.0,0.0,94.0,51.591987,79.654604,Long Pond,Courtlough Lady,Giolla Mear,73
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,1.5,9.5,0.0,92.0,51.591987,79.654604,Nordico,Over The Seas,North Summit,71
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,dist,39.5,0.0,71.87665,51.591987,79.654604,Roselier,Miss Reindeer,Reindeer,73
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,dist,69.5,0.0,71.87665,51.591987,79.654604,Noalto,Elena's Beauty,Tarqogan,66


---

## Helper Function

In [15]:
def get_dist_of_horse(df: pd.core.frame.DataFrame, idx: int) -> float:
    """
    Given a dataframe `df` that represents a race, calculate the `dist`
    of the horse with identifier `idx`.
    """
    
    df = df.sort_values(by=['position'])
    
    # cumulative distance to the runner in 1st place
    dist = 0
    
    # there may be ties for a position
    curr_position = 1
    
    # for all entries in this race
    for _, row in df.iterrows():
        
        # convert `positionL` to a numerical value,
        # it is safe to assume that the length is 0 where `positionL = NaN`, see prior discussion
        if row['positionL'] and (isinstance(row['positionL'], str) or not math.isnan(row['positionL'])):
            lengths = row['positionL']
        else:
            lengths = 0
        
        # some `positionL` values are strings encoding length information
        try:
            lengths = float(lengths)
        except:
            lengths = LENGTH_ABBRV_TO_DIST[lengths]
            
        # be careful about changing the current position bc of ties
        curr_position = row['position']
       
        # accumulate distance so long as we are not looking at nonfinishing horses
        if curr_position != 40:
            dist += lengths
        
        # found the desired horse
        if int(row.name) == idx:
            return dist + (LENGTH_ABBRV_TO_DIST['dist'] if curr_position == 40 else 0)

In [22]:
df = horses_augment_dist[horses_augment_dist['rid'] == 302858]
df

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,positionL,dist,outHandicap,RPR,TR,OR,father,mother,gfather,weight
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,,0.0,0.0,102.0,51.591987,79.654604,King's Ride,Browne's Return,Deep Run,73
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,8,8.0,0.0,94.0,51.591987,79.654604,Long Pond,Courtlough Lady,Giolla Mear,73
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,1.5,9.5,0.0,92.0,51.591987,79.654604,Nordico,Over The Seas,North Summit,71
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,dist,39.5,0.0,71.87665,51.591987,79.654604,Roselier,Miss Reindeer,Reindeer,73
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,dist,69.5,0.0,71.87665,51.591987,79.654604,Noalto,Elena's Beauty,Tarqogan,66
5,302858,Graignamanagh,6.0,3.0,0.307692,Harry De Bromhead,J R Barry,40,,99.5,0.0,71.87665,51.591987,79.654604,Tremblant,Feathermore,Crash Course,73


In [23]:
for idx, entry in tqdm(df.iterrows()):
    print(get_dist_of_horse(df, idx))

6it [00:00, 1326.54it/s]

0.0
8.0
9.5
39.5
69.5
99.5





In [26]:
df = horses_augment_dist[horses_augment_dist['rid'] == 89607]
df

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,positionL,dist,outHandicap,RPR,TR,OR,father,mother,gfather,weight
182407,89607,Valentana,4.0,3.0,0.25,W McCreery,W J Lee,1,,0.0,0.0,86.0,76.0,73.0,Haatef,Miss Latina,Mozart,62
182408,89607,Gentil J,3.0,12.0,0.058824,H Rogers,Shane Foley,2,2,2.0,0.0,75.0,63.0,69.0,Jeremy,Lady Pitrizza,Night Shift,58
182409,89607,Ducky Mallon,5.0,1.0,0.166667,Donal Kinsella,Ronan Whelan,3,3,5.0,0.0,75.0,61.0,73.0,Jeremy,Indus Ridge,Indian Ridge,62
182410,89607,Snoozing Indian,4.0,4.0,0.047619,T J O'Mara,Wayne Lordan,4,.5,5.5,0.0,70.0,58.0,72.0,Sleeping Indian,Balnaha,Lomond,62
182411,89607,No Approval,3.0,13.0,0.090909,Kevin Prendergast,Chris Hayes,5,.5,6.0,0.0,64.0,50.0,69.0,Approve,Night Cam,Night Shift,58
182412,89607,Deeds Not Words,5.0,6.0,0.111111,J F Levins,Donagh O'Connor,6,.5,6.5,0.0,65.0,55.0,70.0,Royal Applause,Wars,Green Desert,58
182413,89607,Elusive Approach,4.0,2.0,0.133333,J S Bolger,Kevin Manning,7,.75,7.25,0.0,66.0,53.0,73.0,New Approach,Soilse Na Cathrach,Elusive City,62
182414,89607,Cairdiuil,10.0,5.0,0.076923,I Madden,Seamie Heffernan,8,1.25,8.5,0.0,61.0,47.0,71.0,Bachelor Duke,Lilabelle,Lil's Boy,61
182415,89607,Sister Slew,6.0,11.0,0.066667,Shane Nolan,Robbie Downey,9,1.75,10.25,0.0,52.0,38.0,67.0,Kheleyf,Capote West,Capote,58
182416,89607,Mzuri,4.0,9.0,0.1,Ms Sheila Lavery,Pat Smullen,10,nk,10.5,0.0,54.0,39.0,69.0,Tagula,Meadow,Green Desert,60


In [27]:
for idx, entry in tqdm(df.iterrows()):
    print(get_dist_of_horse(df, idx))

14it [00:00, 889.85it/s]

0.0
2.0
5.0
5.5
6.0
6.5
7.25
8.5
10.25
10.5
15.25
16.0
17.75
35.75





In [19]:
for idx, entry in tqdm(horses_augment_dist.iterrows()):
    df = horses_augment_dist[horses_augment_dist['rid'] == entry['rid']]
    dist = get_dist_of_horse(df, idx)
    horses_augment_dist.at[idx, 'dist'] = dist

194898it [03:30, 927.54it/s] 


---

## Save Dataframes

In [28]:
horses_augment_dist.to_csv(f"{BASE_DIR}/data/csv/horses_augment_dist.csv", index=False)

---