# `featurize_horses_v3.ipynb`

### Author: Anthony Hein

#### Last updated: 10/19/2021

# Overview:

Continues the featurization from the previous notebook; the work is split across several notebooks because doing it all in one requires too much memory.

---

## Setup

In [1]:
from datetime import datetime
import git
import os
import re
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Resolve the repository root by searching upward from the current working
# directory, so data paths below do not depend on where the notebook lives.
BASE_DIR = git.Repo(
    os.getcwd(),
    search_parent_directories=True,
).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `horses_featurized_v2.csv`

In [44]:
# Per-horse-per-race rows produced by the previous featurization notebook.
horses_featurized_v2 = pd.read_csv(
    f"{BASE_DIR}/data/csv/horses_featurized_v2.csv",
    low_memory=False,
)
horses_featurized_v2.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,outHandicap,RPR,...,father_avg_speed,mother_avg_speed,trainer_avg_speed,trainer_avg_position,avg_position,prev_position,jockey_avg_speed,jockey_avg_position,jockey_prev_speed,jockey_prev_position
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,0.0,102.0,...,,,,,,,,,,
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,0.0,94.0,...,,,,,,,,,,
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,0.0,92.0,...,,,,,,,,,,
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,0.0,71.87665,...,,,,,,,,,,
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,0.0,71.87665,...,,,,,,,,,,


In [45]:
# (rows, columns) of the frame as loaded, for comparison after new features are added.
horses_featurized_v2.shape

(194573, 28)

In [46]:
# Work on an independent copy so the v2 frame stays untouched while the
# new trainer features are attached to v3.
horses_featurized_v3 = horses_featurized_v2.copy(deep=True)
horses_featurized_v3.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,outHandicap,RPR,...,father_avg_speed,mother_avg_speed,trainer_avg_speed,trainer_avg_position,avg_position,prev_position,jockey_avg_speed,jockey_avg_position,jockey_prev_speed,jockey_prev_position
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,0.0,102.0,...,,,,,,,,,,
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,0.0,94.0,...,,,,,,,,,,
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,0.0,92.0,...,,,,,,,,,,
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,0.0,71.87665,...,,,,,,,,,,
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,0.0,71.87665,...,,,,,,,,,,


---

## Load `races_clean_augment_clean.csv`

In [7]:
# One row per race, keyed by `rid`; supplies the race `datetime` used to order
# each trainer's history.
races_clean_augment_clean = pd.read_csv(
    f"{BASE_DIR}/data/csv/races_clean_augment_clean.csv",
    low_memory=False,
)
races_clean_augment_clean.head()

Unnamed: 0,rid,course,winningTime,metric,ncond,margin,runners,temp,msl,rain,rhum,Station number,datetime
0,302858,Thurles (IRE),277.2,3821.0,1,1.219263,6,2.2,1012.7,0.0,82,4919,1997-01-09 13:15:00
1,291347,Punchestown (IRE),447.2,5229.0,5,1.218049,9,8.1,992.8,0.0,79,3723,1997-02-16 15:40:00
2,377929,Leopardstown (IRE),106.4,1609.0,4,1.204927,5,10.1,996.7,0.0,76,532,1997-05-11 15:00:00
3,275117,Curragh (IRE),125.9,2011.0,4,1.083838,5,15.8,1030.1,0.0,53,3723,1997-05-25 15:35:00
4,66511,Leopardstown (IRE),116.3,1810.0,1,1.077871,5,16.3,1022.9,0.0,53,532,1997-06-02 16:30:00


In [8]:
# (rows, columns) of the races table as loaded.
races_clean_augment_clean.shape

(19228, 13)

---

In [25]:
def get_average_speed(df) -> float:
    """
    Mean of the per-row ratio `metric / time` over all rows of `df`.

    Parameters
    ----------
    df : DataFrame with 'metric' and 'time' columns (only read when non-empty).

    Returns
    -------
    The mean ratio, or NaN when `df` has no rows.
    """
    if len(df) == 0:
        return float('nan')

    per_row_speed = df['metric'] / df['time']
    return np.mean(per_row_speed)

In [26]:
def get_average_position(df) -> float:
    """
    Mean of the 'position' column of `df`.

    Parameters
    ----------
    df : DataFrame with a 'position' column (only read when non-empty).

    Returns
    -------
    The mean position, or NaN when `df` has no rows.
    """
    if len(df) == 0:
        return float('nan')

    positions = df['position']
    return np.mean(positions)

In [27]:
def get_prev_trainer_races(trainer_name: str, rid: int) -> pd.core.frame.DataFrame:
    """
    Return the rows for `trainer_name` whose race datetime is strictly earlier
    than the datetime of race `rid`.

    The trainer's rows are looked up in the global `trainer_to_races` mapping
    and inner-joined with `races_clean_augment_clean` on 'rid' to obtain the
    'datetime' column. When the trainer has at most one row, an empty
    DataFrame is returned.

    NOTE(review): `trainer_to_races` is not defined anywhere in the visible
    part of this notebook -- confirm where it comes from before calling this.
    NOTE(review): 'datetime' is compared exactly as it comes out of the merge
    (no `pd.to_datetime` conversion happens here) -- confirm that ordering is
    what is intended.

    Raises IndexError when `rid` does not appear among the merged rows.
    """
    trainer_races = trainer_to_races[trainer_name]
    if len(trainer_races) <= 1:
        return pd.DataFrame()

    with_race_info = trainer_races.merge(races_clean_augment_clean, how='inner', on='rid')
    target_row = with_race_info[with_race_info['rid'] == rid].iloc[0]
    is_earlier = with_race_info['datetime'] < target_row['datetime']
    return with_race_info[is_earlier]

In [28]:
def get_prev_position_binary(df, rid) -> float:
    """
    Return the 'position' of the row that immediately precedes race `rid`
    in time, or NaN when there is no such row.

    Parameters
    ----------
    df : DataFrame sorted ascending by 'datetime', with 'rid', 'datetime' and
        'position' columns (only read when non-empty).
    rid : identifier of the race whose predecessor is wanted.

    Returns
    -------
    The 'position' value of the last row whose 'datetime' is strictly earlier
    than the datetime of `rid`; NaN if `df` is empty, if `rid` is not present
    in `df`, or if no row is earlier.

    Notes
    -----
    Rows that share the target datetime (for example several entries in the
    same race) are never returned. The search therefore locates the *first*
    row at the target datetime and steps back one row from there; stopping on
    an arbitrary row with an equal datetime could return a row from the very
    race being looked up.
    """
    if len(df) == 0:
        return float('nan')

    matching_datetimes = df.loc[df['rid'] == rid, 'datetime']
    if len(matching_datetimes) == 0:
        # Unknown race: there is nothing to look back from.
        return float('nan')
    rid_datetime = matching_datetimes.iloc[0]

    # Binary search for the leftmost row whose datetime is >= the target;
    # every row before that index is strictly earlier.
    first_at_target = int(df['datetime'].searchsorted(rid_datetime, side='left'))
    if first_at_target == 0:
        return float('nan')

    return df.iloc[first_at_target - 1]['position']

In [29]:
def get_prev_speed_binary(df, rid) -> float:
    """
    Return the speed (`metric / time`) of the row that immediately precedes
    race `rid` in time, or NaN when there is no such row.

    Parameters
    ----------
    df : DataFrame sorted ascending by 'datetime', with 'rid', 'datetime',
        'metric' and 'time' columns (only read when non-empty).
    rid : identifier of the race whose predecessor is wanted.

    Returns
    -------
    'metric' divided by 'time' for the last row whose 'datetime' is strictly
    earlier than the datetime of `rid`; NaN if `df` is empty, if `rid` is not
    present in `df`, or if no row is earlier.

    Notes
    -----
    Rows that share the target datetime (for example several entries in the
    same race) are never returned. The search therefore locates the *first*
    row at the target datetime and steps back one row from there; stopping on
    an arbitrary row with an equal datetime could return a row from the very
    race being looked up.
    """
    if len(df) == 0:
        return float('nan')

    matching_datetimes = df.loc[df['rid'] == rid, 'datetime']
    if len(matching_datetimes) == 0:
        # Unknown race: there is nothing to look back from.
        return float('nan')
    rid_datetime = matching_datetimes.iloc[0]

    # Binary search for the leftmost row whose datetime is >= the target;
    # every row before that index is strictly earlier.
    first_at_target = int(df['datetime'].searchsorted(rid_datetime, side='left'))
    if first_at_target == 0:
        return float('nan')

    previous_row = df.iloc[first_at_target - 1]
    return previous_row['metric'] / previous_row['time']

---

## Map Horse to Trainer's Most Recent Previous Speed

In [22]:
# Number of rows per trainer, limited to the 500 most frequent trainers.
trainer_row_counts = horses_featurized_v2['trainerName'].value_counts()
trainer_row_counts.head(500)

A P O'Brien         7696
D K Weld            7194
J S Bolger          6963
M Halford           5284
G M Lyons           4616
                    ... 
Patrick Lacey         40
Ms Sandra Hughes      40
L Byrne               40
C Wilkinson           40
J P O'Keeffe          40
Name: trainerName, Length: 500, dtype: int64

In [19]:
def get_all_races_trainer(trainer_name: str) -> pd.core.frame.DataFrame:
    """
    Collect every row of `horses_featurized_v2` belonging to `trainer_name`,
    enriched with race-level columns and ordered chronologically.

    The trainer's rows are inner-joined with `races_clean_augment_clean` on
    'rid', the 'datetime' column is parsed with `pd.to_datetime`, and the
    result is sorted ascending by 'datetime'.

    Returns an empty DataFrame when the trainer has no rows.
    """
    is_trainer = horses_featurized_v2['trainerName'] == trainer_name
    trainer_rows = horses_featurized_v2[is_trainer]
    if len(trainer_rows) == 0:
        return pd.DataFrame()

    with_race_info = trainer_rows.merge(races_clean_augment_clean, how='inner', on='rid')
    with_race_info['datetime'] = pd.to_datetime(with_race_info['datetime'])
    return with_race_info.sort_values(by='datetime')

In [20]:
# Cache each trainer's chronologically sorted history once, keyed by trainer
# name, so the per-row lookups below do not rebuild it every time.
trainer_names = horses_featurized_v2['trainerName'].unique()

sorted_trainer_races = {}
for trainer_name in tqdm(trainer_names):
    sorted_trainer_races[trainer_name] = get_all_races_trainer(trainer_name)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1650/1650 [00:26<00:00, 61.78it/s]


In [30]:
# Preview one trainer's sorted history; the rids listed here are used as spot checks below.
get_all_races_trainer('Patrick Lacey').head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,outHandicap,RPR,...,metric,ncond,margin,runners,temp,msl,rain,rhum,Station number,datetime
32,347295,Dip's Guest,4.0,2.0,0.058824,Patrick Lacey,Fran Berry,5,0.0,38.0,...,2815.0,1,1.187376,8,15.4,1020.7,0.0,68,518,1998-08-24 20:05:00
33,306437,Dip's Guest,4.0,7.0,0.285714,Patrick Lacey,Charlie Swan,1,0.0,91.0,...,3419.0,1,1.167938,9,20.2,1019.8,0.0,56,518,1998-08-29 14:35:00
34,231817,Dip's Guest,4.0,12.0,0.066667,Patrick Lacey,Mr J O'Hanlon,6,0.0,33.0,...,3218.0,9,1.410961,11,8.8,1009.0,0.0,74,4935,1998-10-26 14:55:00
35,288691,Mister Cheer,3.0,7.0,0.029412,Patrick Lacey,Fran Berry,14,0.0,1.0,...,1206.0,2,1.420869,14,14.0,1015.5,0.0,72,3904,1999-06-29 18:00:00
36,82221,Mister Cheer,3.0,4.0,0.019608,Patrick Lacey,J E Casey,9,0.0,71.87665,...,1407.0,2,1.212307,9,15.2,1024.1,0.0,63,4935,1999-07-27 20:00:00


In [31]:
# Spot check: rid 347295 is the earliest row in this trainer's history, so there is no previous speed.
get_prev_speed_binary(sorted_trainer_races['Patrick Lacey'], 347295)

nan

In [32]:
# Spot check: rid 306437 is the second row, so this reports the speed from the first row.
get_prev_speed_binary(sorted_trainer_races['Patrick Lacey'], 306437)

13.826689841709138

In [33]:
# Spot check: rid 231817 is the third row, so this reports the speed from the second row.
get_prev_speed_binary(sorted_trainer_races['Patrick Lacey'], 231817)

14.041067761806982

In [35]:
# Maps each row label of `horses_featurized_v2` to the speed recorded on the
# row that precedes this race in the same trainer's sorted history.
horse_to_trainer_prev_speed = {}

# Only two columns are needed per row, so iterate over them directly instead
# of `iterrows()`, which builds a full Series for every row. Passing `total`
# lets tqdm draw a real progress bar with an ETA rather than a bare counter.
row_entries = zip(
    horses_featurized_v2.index,
    horses_featurized_v2['trainerName'],
    horses_featurized_v2['rid'],
)

for idx, trainer_name, rid in tqdm(row_entries, total=len(horses_featurized_v2)):
    prev_trainer_races = sorted_trainer_races[trainer_name]
    horse_to_trainer_prev_speed[idx] = get_prev_speed_binary(prev_trainer_races, rid)

194573it [05:41, 570.24it/s]


---

## Map Horse to Trainer's Most Recent Previous Position

In [36]:
# Spot check: rid 347295 is the earliest row in this trainer's history, so there is no previous position.
get_prev_position_binary(sorted_trainer_races['Patrick Lacey'], 347295)

nan

In [37]:
# Spot check: rid 306437 is the second row, so this reports the position from the first row.
get_prev_position_binary(sorted_trainer_races['Patrick Lacey'], 306437)

5

In [38]:
# Spot check: rid 231817 is the third row, so this reports the position from the second row.
get_prev_position_binary(sorted_trainer_races['Patrick Lacey'], 231817)

1

In [39]:
# Maps each row label of `horses_featurized_v2` to the position recorded on
# the row that precedes this race in the same trainer's sorted history.
horse_to_trainer_prev_position = {}

# Only two columns are needed per row, so iterate over them directly instead
# of `iterrows()`, which builds a full Series for every row. Passing `total`
# lets tqdm draw a real progress bar with an ETA rather than a bare counter.
row_entries = zip(
    horses_featurized_v2.index,
    horses_featurized_v2['trainerName'],
    horses_featurized_v2['rid'],
)

for idx, trainer_name, rid in tqdm(row_entries, total=len(horses_featurized_v2)):
    prev_trainer_races = sorted_trainer_races[trainer_name]
    horse_to_trainer_prev_position[idx] = get_prev_position_binary(prev_trainer_races, rid)

194573it [05:06, 634.05it/s]


---

## Checkpoint `horses_featurized_v3`

In [47]:
# `from_dict(..., orient='index')` yields a single column labelled 0, indexed
# by the dictionary keys; give that column its feature name.
rename_cols = {0: 'trainer_prev_speed'}

df_trainer_prev_speed = (
    pd.DataFrame
    .from_dict(horse_to_trainer_prev_speed, orient='index')
    .rename(columns=rename_cols)
)
df_trainer_prev_speed.sample(5)

Unnamed: 0,trainer_prev_speed
192255,16.604747
36899,16.397398
31636,15.445918
118470,13.998131
103680,15.52895


In [48]:
# `from_dict(..., orient='index')` yields a single column labelled 0, indexed
# by the dictionary keys; give that column its feature name.
rename_cols = {0: 'trainer_prev_position'}

df_trainer_prev_position = (
    pd.DataFrame
    .from_dict(horse_to_trainer_prev_position, orient='index')
    .rename(columns=rename_cols)
)
df_trainer_prev_position.sample(5)

Unnamed: 0,trainer_prev_position
148736,4.0
173744,1.0
37918,12.0
37446,5.0
20661,4.0


In [49]:
# Build v3 from the untouched v2 frame instead of from v3 itself. v3 starts as
# a plain copy of v2, so the result is the same on a first run, but the cell is
# now idempotent: joining onto v3 a second time would fail because the
# trainer_prev_* columns would already exist.
horses_featurized_v3 = (
    horses_featurized_v2
    .join(df_trainer_prev_speed)
    .join(df_trainer_prev_position)
)

# Joining on the index must neither add nor drop rows.
assert len(horses_featurized_v3) == len(horses_featurized_v2)

In [50]:
# Eyeball a random handful of rows to check the two new trainer_prev_* columns (unseeded, so rows vary per run).
horses_featurized_v3.sample(10)

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,outHandicap,RPR,...,trainer_avg_speed,trainer_avg_position,avg_position,prev_position,jockey_avg_speed,jockey_avg_position,jockey_prev_speed,jockey_prev_position,trainer_prev_speed,trainer_prev_position
126962,185905,Reckless Lad,4.0,6.0,0.142857,Patrick Martin,Conor Hoban,3,0.0,75.0,...,15.244656,6.789181,3.0,1.0,15.546661,6.514429,15.507608,5.0,15.340554,6.0
94361,206453,Final Opinion,5.0,3.0,0.047619,Declan Gillespie,Davy Condon,4,0.0,79.0,...,15.005777,5.918719,4.2,8.0,15.032847,6.703797,16.420609,7.0,16.594657,3.0
80003,35344,Mississippi John,3.0,8.0,0.058824,G M Lyons,Gary Carroll,10,0.0,24.0,...,15.693452,5.303017,7.8,4.0,15.535219,6.262282,16.297014,11.0,16.313352,4.0
148767,315414,Touchwoodexpress,5.0,12.0,0.222222,Paul Nolan,Mr A Fitzgerald,4,0.0,89.0,...,13.511881,7.056338,,,12.808601,5.775281,13.458804,1.0,14.715963,10.0
104222,390087,Macnas,4.0,13.0,0.058824,John Joseph Murphy,Marc Monaghan,4,0.0,65.0,...,14.737579,7.323133,4.833333,7.0,15.652317,7.690476,15.412892,13.0,15.738064,5.0
77877,325470,Pelican Waters,3.0,3.0,0.142857,Mrs John Harrington,Fran Berry,6,0.0,86.0,...,14.473813,6.830835,5.714286,3.0,15.156899,5.706398,16.270912,1.0,14.046268,1.0
58783,186399,Lift The Gloom,6.0,4.0,0.230769,Noel Lawlor,Rory Cleary,2,0.0,64.0,...,15.471457,7.988095,6.222222,2.0,15.230877,7.128869,14.142396,10.0,15.869178,11.0
99217,282857,Krivan,3.0,6.0,0.047619,M Halford,Shane Foley,3,0.0,74.0,...,15.336099,5.87815,,,15.39664,6.317889,15.868159,5.0,15.868159,5.0
104847,199505,Heavy Duty,5.0,10.0,0.02439,S Buggy,Ms K Walsh,9,0.0,97.0,...,,,,,13.42205,5.045455,12.561556,9.0,,
128348,90775,Michaelmas,3.0,8.0,0.142857,A P O'Brien,Joseph O'Brien,7,0.0,95.0,...,15.467009,4.038976,1.666667,2.0,15.454437,4.742435,16.312287,1.0,15.724399,11.0


In [51]:
# Row count should match the v2 frame, with two additional columns from the joins above.
horses_featurized_v3.shape

(194573, 30)

---

## Save Dataframes

In [52]:
# Persist the featurized frame; index=False keeps the row index out of the CSV.
horses_featurized_v3.to_csv(f"{BASE_DIR}/data/csv/horses_featurized_v3.csv", index=False)

---