# `featurize_horses_v2.ipynb`

### Author: Anthony Hein

#### Last updated: 10/19/2021

# Overview:

Continue featurization, cannot do in one notebook because requires too much memory.

---

## Setup

In [1]:
from datetime import datetime
import git
import os
import re
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `horses_featurized.csv`

In [6]:
horses_featurized = pd.read_csv(f"{BASE_DIR}/data/csv/horses_featurized.csv", low_memory=False) 
horses_featurized.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,outHandicap,RPR,...,father,mother,weight,time,avg_speed,prev_speed,father_avg_speed,mother_avg_speed,trainer_avg_speed,trainer_avg_position
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,0.0,102.0,...,King's Ride,Browne's Return,73,277.2,,,,,,
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,0.0,94.0,...,Long Pond,Courtlough Lady,73,278.679948,,,,,,
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,0.0,92.0,...,Nordico,Over The Seas,71,278.957438,,,,,,
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,0.0,71.87665,...,Roselier,Miss Reindeer,73,284.507242,,,,,,
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,0.0,71.87665,...,Noalto,Elena's Beauty,66,290.057045,,,,,,


In [7]:
horses_featurized.shape

(194573, 22)

In [80]:
horses_featurized_v2 = horses_featurized.copy()
horses_featurized_v2.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,outHandicap,RPR,...,father,mother,weight,time,avg_speed,prev_speed,father_avg_speed,mother_avg_speed,trainer_avg_speed,trainer_avg_position
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,0.0,102.0,...,King's Ride,Browne's Return,73,277.2,,,,,,
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,0.0,94.0,...,Long Pond,Courtlough Lady,73,278.679948,,,,,,
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,0.0,92.0,...,Nordico,Over The Seas,71,278.957438,,,,,,
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,0.0,71.87665,...,Roselier,Miss Reindeer,73,284.507242,,,,,,
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,0.0,71.87665,...,Noalto,Elena's Beauty,66,290.057045,,,,,,


---

## Load `races_clean_augment_clean.csv`

In [9]:
races_clean_augment_clean = pd.read_csv(f"{BASE_DIR}/data/csv/races_clean_augment_clean.csv", low_memory=False) 
races_clean_augment_clean.head()

Unnamed: 0,rid,course,winningTime,metric,ncond,margin,runners,temp,msl,rain,rhum,Station number,datetime
0,302858,Thurles (IRE),277.2,3821.0,1,1.219263,6,2.2,1012.7,0.0,82,4919,1997-01-09 13:15:00
1,291347,Punchestown (IRE),447.2,5229.0,5,1.218049,9,8.1,992.8,0.0,79,3723,1997-02-16 15:40:00
2,377929,Leopardstown (IRE),106.4,1609.0,4,1.204927,5,10.1,996.7,0.0,76,532,1997-05-11 15:00:00
3,275117,Curragh (IRE),125.9,2011.0,4,1.083838,5,15.8,1030.1,0.0,53,3723,1997-05-25 15:35:00
4,66511,Leopardstown (IRE),116.3,1810.0,1,1.077871,5,16.3,1022.9,0.0,53,532,1997-06-02 16:30:00


In [10]:
races_clean_augment_clean.shape

(19228, 13)

---

In [11]:
def get_all_races(horse_name: str) -> pd.core.frame.DataFrame:
    df = horse_to_races[horse_name]
    if len(df) == 0:
        return pd.DataFrame()
    else:
        return df.merge(races_clean_augment_clean, how='inner', on='rid')

In [12]:
def get_prev_races(horse_name: str, rid: int) -> pd.core.frame.DataFrame:
    df = horse_to_races[horse_name]
    if len(df) <= 1:
        return pd.DataFrame()
    else:
        df = df.merge(races_clean_augment_clean, how='inner', on='rid')
        return df[df['datetime'] < df[df['rid'] == rid].iloc[0]['datetime']]

In [13]:
def get_average_speed(df) -> float:
    if len(df) == 0:
        return float('nan')
    else:
        return np.mean(df['metric'] / df['time'])

In [27]:
def get_average_position(df) -> float:
    if len(df) == 0:
        return float('nan')
    else:
        return np.mean(df['position'])

In [14]:
def get_prev_speed(df) -> float:
    if len(df) == 0:
        return float('nan')
    else:
        previous_datetime = df.iloc[0]['datetime']
        previous_speed = df.iloc[0]['metric'] / df.iloc[0]['time']
        for _, row in df.iterrows():
            if row['datetime'] > previous_datetime:
                previous_datetime = row['datetime']
                previous_speed = row['metric'] / row['time']
        return previous_speed

In [34]:
def get_prev_position(df) -> float:
    if len(df) == 0:
        return float('nan')
    else:
        previous_datetime = df.iloc[0]['datetime']
        previous_position = df.iloc[0]['position']
        for _, row in df.iterrows():
            if row['datetime'] > previous_datetime:
                previous_datetime = row['datetime']
                previous_position = row['position']
        return previous_position

In [15]:
def get_prev_trainer_races(trainer_name: str, rid: int) -> pd.core.frame.DataFrame:
    df = trainer_to_races[trainer_name]
    if len(df) <= 1:
        return pd.DataFrame()
    else:
        df = df.merge(races_clean_augment_clean, how='inner', on='rid')
        return df[df['datetime'] < df[df['rid'] == rid].iloc[0]['datetime']]

In [16]:
def get_average_position(df) -> float:
    if len(df) == 0:
        return float('nan')
    else:
        return np.mean(df['position'])

---

## Map Horse and Race to Average Position

In [30]:
all_horse_names =  np.concatenate((horses_featurized['horseName'].unique(),
                                  horses_featurized['father'].unique(),
                                  horses_featurized['mother'].unique())
                                 )

all_horse_names = np.unique(all_horse_names)

In [31]:
horse_to_races = {}

for horse_name in tqdm(all_horse_names):
    horse_to_races[horse_name] = horses_featurized[horses_featurized['horseName'] == horse_name]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75343/75343 [13:50<00:00, 90.69it/s]


In [32]:
horse_idx_to_prev_races = {}

for idx, row in tqdm(horses_featurized.iterrows()):
    horse_idx_to_prev_races[idx] = get_prev_races(row['horseName'], row['rid'])

194573it [16:57, 191.20it/s]


In [33]:
horse_idx_to_avg_position = {}

for idx, _ in tqdm(horses_featurized.iterrows()):
    horse_idx_to_avg_position[idx] = get_average_position(horse_idx_to_prev_races[idx])

194573it [00:28, 6874.78it/s]


In [35]:
horses_featurized.sample(5)

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,outHandicap,RPR,...,father,mother,weight,time,avg_speed,prev_speed,father_avg_speed,mother_avg_speed,trainer_avg_speed,trainer_avg_position
165483,284254,Sky View,5.0,3.0,0.125,T M Walsh,R Walsh,2,0.0,47.0,...,Distant View,Counting Moonbeams,66,83.313092,14.691194,13.224925,,,14.449365,6.159574
3506,151432,Captain Doran,6.0,5.0,0.029412,S J Mahon,Danny Hand,8,0.0,63.0,...,Captain Rio,Scar Tissue,69,241.605687,,,16.406434,,14.284862,7.315663
84988,49343,Gorteo,2.0,5.0,0.333333,J P Murtagh,Johnny Murtagh,2,0.0,88.0,...,Teofilo,Gorband,57,99.158513,16.224089,16.224089,16.313886,,15.736564,5.6
67731,213969,Splendid Susie,5.0,5.0,0.076923,C Byrnes,E M Butterly,7,0.0,44.0,...,Danetime,Splendid Yankee,58,143.555131,14.22003,14.22003,,,13.181418,5.862069
61779,318980,Lady Giselle,2.0,4.0,0.066667,John J Walsh,Ronan Whelan,4,0.0,55.0,...,Indian Haven,Makena,55,97.576504,,,,14.159325,14.148594,8.041199


---

## Map Horse and Race to Most Previous Position

In [36]:
horse_idx_to_prev_position = {}

for idx, _ in tqdm(horses_featurized.iterrows()):
    horse_idx_to_prev_position[idx] = get_prev_position(horse_idx_to_prev_races[idx])

194573it [01:39, 1953.04it/s]


In [None]:
horses_featurized.sample(5)

---

## Map Horse to Jockey Average Speed

In [17]:
horses_featurized['jockeyName'].value_counts()

Pat Smullen         7923
Kevin Manning       7071
Wayne Lordan        6776
Declan McDonogh     6548
Seamie Heffernan    6162
                    ... 
Mr D Collins           1
Mr D Reddan            1
Miss L Horner          1
Holly Farrell          1
Mr P J Croke           1
Name: jockeyName, Length: 2427, dtype: int64

In [18]:
jockey_to_races = {}

for jockey_name in tqdm(horses_featurized['jockeyName'].unique()):
    jockey_to_races[jockey_name] = horses_featurized[horses_featurized['jockeyName'] == jockey_name]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2427/2427 [00:24<00:00, 99.24it/s]


In [19]:
def get_prev_jockey_races(jockey_name: str, rid: int) -> pd.core.frame.DataFrame:
    df = jockey_to_races[jockey_name]
    if len(df) <= 1:
        return pd.DataFrame()
    else:
        df = df.merge(races_clean_augment_clean, how='inner', on='rid')
        return df[df['datetime'] < df[df['rid'] == rid].iloc[0]['datetime']]

In [20]:
horse_idx_to_prev_jockey_races = {}

for idx, row in tqdm(horses_featurized.iterrows()):
    horse_idx_to_prev_jockey_races[idx] = get_prev_jockey_races(row['jockeyName'], row['rid'])

194573it [31:04, 104.37it/s]


In [21]:
horse_to_jockey_avg_speed = {}

for idx, row in tqdm(horses_featurized.iterrows()):
    prev_jockey_races = horse_idx_to_prev_jockey_races[idx]
    horse_to_jockey_avg_speed[idx] = get_average_speed(prev_jockey_races)

194573it [05:01, 645.85it/s] 


---

## Map Horse to Jockey Average Position

In [68]:
horse_to_jockey_avg_position = {}

for idx, row in tqdm(horses_featurized.iterrows()):
    prev_jockey_races = horse_idx_to_prev_jockey_races[idx]
    horse_to_jockey_avg_position[idx] = get_average_position(prev_jockey_races)

194573it [02:34, 1255.83it/s]


---

## Map Horse to Jockey Most Previous Speed

In [45]:
def get_all_races_jockey(jockey_name: str) -> pd.core.frame.DataFrame:
    df = horses_featurized[horses_featurized['jockeyName'] == jockey_name]
    if len(df) == 0:
        return pd.DataFrame()
    else:
        df = df.merge(races_clean_augment_clean, how='inner', on='rid')
        df['datetime'] = pd.to_datetime(df['datetime'])
        return df.sort_values(by='datetime')

In [51]:
sorted_jockey_races = {}

for jockey_name in tqdm(horses_featurized['jockeyName'].unique()):
    sorted_jockey_races[jockey_name] = get_all_races_jockey(jockey_name)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2427/2427 [00:44<00:00, 54.18it/s]


In [62]:
def get_prev_speed_binary(df, rid) -> float:
    if len(df) == 0:
        return float('nan')
    else:
        
        rid_datetime = df[df['rid'] == rid].iloc[0]['datetime']
         
        lo = 0
        hi = len(df) - 1

        while lo <= hi:

            mid = lo + (hi - lo) // 2
            
            curr_datetime = df.iloc[mid]['datetime']

            if curr_datetime < rid_datetime:
                lo = mid + 1

            elif curr_datetime > rid_datetime:
                hi = mid - 1
            
            else:
                break

        return df.iloc[mid - 1]['metric'] / df.iloc[mid - 1]['time'] if mid > 0 else float('nan')

In [63]:
get_all_races_jockey('Pat Smullen').head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,outHandicap,RPR,...,metric,ncond,margin,runners,temp,msl,rain,rhum,Station number,datetime
5,50025,Stonehaven,2.0,9.0,0.038462,T Stack,Pat Smullen,10,0.0,93.0,...,1407.0,1,1.376904,10,14.3,1015.6,0.0,63,3723,1996-09-21 16:30:00
0,377929,Welsh Queen,3.0,5.0,0.058824,T Stack,Pat Smullen,2,0.0,86.0,...,1609.0,4,1.204927,5,10.1,996.7,0.0,76,532,1997-05-11 15:00:00
1,362433,Tout A Coup,4.0,2.0,0.090909,G A Cusack,Pat Smullen,5,0.0,91.0,...,2011.0,6,1.168184,6,15.7,1010.1,0.0,71,3723,1997-06-28 16:00:00
2,271948,Dress Design,2.0,2.0,0.125,John Muldoon,Pat Smullen,4,0.0,70.0,...,1005.0,6,1.147619,5,14.8,1020.7,0.0,91,532,1997-08-04 14:20:00
3,90710,Aliya,3.0,3.0,0.25,John M Oxx,Pat Smullen,2,0.0,97.0,...,2815.0,6,1.09359,5,14.4,1020.5,3.8,93,532,1997-08-04 16:50:00


In [64]:
get_prev_speed_binary(sorted_jockey_races['Pat Smullen'], 50025)

nan

In [65]:
get_prev_speed_binary(sorted_jockey_races['Pat Smullen'], 271948)

15.413247716904765

In [66]:
get_prev_speed_binary(sorted_jockey_races['Pat Smullen'], 90710)

16.22744400947402

In [67]:
horse_to_jockey_prev_speed = {}

for idx, row in tqdm(horses_featurized.iterrows()):
    prev_jockey_races = sorted_jockey_races[row['jockeyName']]
    horse_to_jockey_prev_speed[idx] = get_prev_speed_binary(prev_jockey_races, row['rid'])

194573it [05:45, 563.11it/s]


---

## Map Horse to Jockey Most Previous Position

Too slow. Need to optimize.

In [43]:
horses_featurized['jockeyName'].value_counts()[:1000]

Pat Smullen         7923
Kevin Manning       7071
Wayne Lordan        6776
Declan McDonogh     6548
Seamie Heffernan    6162
                    ... 
Graham Gibbons         7
Mr C M Quirke          7
Katie McManmon         7
Mr W J O'Donovan       7
Colman Comerford       7
Name: jockeyName, Length: 1000, dtype: int64

In [49]:
get_all_races_jockey('Pat Smullen').head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,outHandicap,RPR,...,metric,ncond,margin,runners,temp,msl,rain,rhum,Station number,datetime
5,50025,Stonehaven,2.0,9.0,0.038462,T Stack,Pat Smullen,10,0.0,93.0,...,1407.0,1,1.376904,10,14.3,1015.6,0.0,63,3723,1996-09-21 16:30:00
0,377929,Welsh Queen,3.0,5.0,0.058824,T Stack,Pat Smullen,2,0.0,86.0,...,1609.0,4,1.204927,5,10.1,996.7,0.0,76,532,1997-05-11 15:00:00
1,362433,Tout A Coup,4.0,2.0,0.090909,G A Cusack,Pat Smullen,5,0.0,91.0,...,2011.0,6,1.168184,6,15.7,1010.1,0.0,71,3723,1997-06-28 16:00:00
2,271948,Dress Design,2.0,2.0,0.125,John Muldoon,Pat Smullen,4,0.0,70.0,...,1005.0,6,1.147619,5,14.8,1020.7,0.0,91,532,1997-08-04 14:20:00
3,90710,Aliya,3.0,3.0,0.25,John M Oxx,Pat Smullen,2,0.0,97.0,...,2815.0,6,1.09359,5,14.4,1020.5,3.8,93,532,1997-08-04 16:50:00


In [54]:
def get_prev_position_binary(df, rid) -> float:
    if len(df) == 0:
        return float('nan')
    else:
        
        rid_datetime = df[df['rid'] == rid].iloc[0]['datetime']
         
        lo = 0
        hi = len(df) - 1

        while lo <= hi:

            mid = lo + (hi - lo) // 2
            
            curr_datetime = df.iloc[mid]['datetime']

            if curr_datetime < rid_datetime:
                lo = mid + 1

            elif curr_datetime > rid_datetime:
                hi = mid - 1
            
            else:
                break

        return df.iloc[mid - 1]['position'] if mid > 0 else float('nan')

In [57]:
get_prev_position_binary(sorted_jockey_races['Pat Smullen'], 50025)

nan

In [55]:
get_prev_position_binary(sorted_jockey_races['Pat Smullen'], 271948)

5

In [56]:
get_prev_position_binary(sorted_jockey_races['Pat Smullen'], 90710)

4

In [59]:
horse_to_jockey_prev_position = {}

for idx, row in tqdm(horses_featurized.iterrows()):
    prev_jockey_races = sorted_jockey_races[row['jockeyName']]
    horse_to_jockey_prev_position[idx] = get_prev_position_binary(prev_jockey_races, row['rid'])

194573it [06:10, 525.84it/s]


---

## Checkpoint `horses_featurized`

In [69]:
rename_cols = {
    0: 'avg_position',
}

df_avg_position = pd.DataFrame.from_dict(horse_idx_to_avg_position, orient='index').rename(columns=rename_cols)
df_avg_position.sample(5)

Unnamed: 0,avg_position
122662,6.727273
58066,4.166667
94743,
34725,
82860,7.0


In [70]:
rename_cols = {
    0: 'prev_position',
}

df_prev_position = pd.DataFrame.from_dict(horse_idx_to_prev_position, orient='index').rename(columns=rename_cols)
df_prev_position.sample(5)

Unnamed: 0,prev_position
32493,
183892,
100099,5.0
98666,1.0
86535,6.0


In [71]:
rename_cols = {
    0: 'jockey_avg_speed',
}

df_jockey_avg_speed = pd.DataFrame.from_dict(horse_to_jockey_avg_speed, orient='index').rename(columns=rename_cols)
df_jockey_avg_speed.sample(5)

Unnamed: 0,jockey_avg_speed
26969,15.31328
28204,15.203332
185001,12.885763
133907,13.497192
129139,15.371341


In [72]:
rename_cols = {
    0: 'jockey_avg_position',
}

df_jockey_avg_position = pd.DataFrame.from_dict(horse_to_jockey_avg_position, orient='index').rename(columns=rename_cols)
df_jockey_avg_position.sample(5)

Unnamed: 0,jockey_avg_position
164461,8.886364
34636,5.533088
116971,6.004912
31728,7.348723
17300,6.108085


In [73]:
rename_cols = {
    0: 'jockey_prev_speed',
}

df_jockey_prev_speed = pd.DataFrame.from_dict(horse_to_jockey_prev_speed, orient='index').rename(columns=rename_cols)
df_jockey_prev_speed.sample(5)

Unnamed: 0,jockey_prev_speed
31251,12.846611
44835,15.643399
134891,16.841051
31783,16.528347
71711,17.044341


In [74]:
rename_cols = {
    0: 'jockey_prev_position',
}

df_jockey_prev_position = pd.DataFrame.from_dict(horse_to_jockey_prev_position, orient='index').rename(columns=rename_cols)
df_jockey_prev_position.sample(5)

Unnamed: 0,jockey_prev_position
118198,5.0
169548,11.0
20925,6.0
94224,8.0
181896,2.0


In [81]:
horses_featurized_v2 = horses_featurized_v2.join(df_avg_position) \
                                           .join(df_prev_position) \
                                           .join(df_jockey_avg_speed) \
                                           .join(df_jockey_avg_position) \
                                           .join(df_jockey_prev_speed) \
                                           .join(df_jockey_prev_position)

In [82]:
horses_featurized_v2.sample(10)

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,outHandicap,RPR,...,father_avg_speed,mother_avg_speed,trainer_avg_speed,trainer_avg_position,avg_position,prev_position,jockey_avg_speed,jockey_avg_position,jockey_prev_speed,jockey_prev_position
159304,22697,Tuff Love,3.0,3.0,0.125,G M Lyons,Gary Carroll,7,0.0,52.0,...,15.94767,,15.684794,4.967614,2.666667,4.0,15.506673,6.163306,15.372178,7.0
193916,357901,Welsh Wind,3.0,7.0,0.076923,D Hanley,P J Scallan,7,0.0,92.0,...,,,15.071624,5.084211,3.4,3.0,14.906481,5.152381,14.99534,1.0
81755,303,Crofton Trail,4.0,6.0,0.266667,Mrs D A Love,Mr S Crawford,4,0.0,61.0,...,,,13.226141,6.775,,,14.106637,6.823009,13.100245,5.0
1211,86801,Boatrace,3.0,4.0,0.083333,Daniel Miley,Gary Carroll,4,0.0,49.0,...,,,14.70826,8.102564,13.0,13.0,15.500025,6.154573,16.39669,1.0
44489,406717,Crashdaparty,3.0,4.0,0.012346,D K Weld,Andrew Slattery,10,0.0,22.0,...,,,15.226054,4.734288,9.5,7.0,15.5706,6.558824,15.824581,5.0
106599,350029,Sweetest Of Peas,3.0,12.0,0.090909,David Wachman,Johnny Murtagh,6,0.0,54.0,...,,14.956493,15.363175,5.432824,,,15.482456,4.159776,14.946389,3.0
173432,258135,Arctic Force,5.0,3.0,0.333333,Paul A Roche,Mr D Roche,1,0.0,98.0,...,,,14.10123,5.642857,,,13.192602,4.368421,13.919187,9.0
173245,345569,Roisin's Star,2.0,12.0,0.090909,G M Lyons,Pat Cosgrave,3,0.0,72.0,...,,,15.062772,6.498328,9.0,9.0,15.039002,6.352941,14.62361,1.0
41988,402850,Duncannon Power,3.0,12.0,0.047619,Joseph Patrick O'Brien,Mikey Sheehy,13,0.0,34.0,...,15.145318,,15.283014,5.16895,,,15.583936,5.878049,15.460287,8.0
176599,158513,Flowerhill Nova,3.0,6.0,0.090909,Andrew Slattery,Declan McDonogh,6,0.0,69.0,...,,,15.230608,6.81087,5.0,5.0,15.385195,5.525289,13.982249,3.0


In [83]:
horses_featurized_v2.shape

(194573, 28)

---

## Save Dataframes

In [84]:
horses_featurized_v2.to_csv(f"{BASE_DIR}/data/csv/horses_featurized_v2.csv", index=False)

---