# `making_paired_input.ipynb`

### Author: Anthony Hein

#### Last updated: 11/8/2021

# Overview:

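This notebook builds the paired input: within each race, every ordered pair of horses is reshaped into a single row (columns prefixed `horse1_` / `horse2_`), and the resulting rows are written out in partitions under `data/streamline/paired/`.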

---

## Setup

In [1]:
from datetime import datetime
import git
import os
import re
from typing import List
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set_theme(style="whitegrid")

In [2]:
# Locate the enclosing git repository by searching upward from the current
# working directory, and use its working directory as the base for all paths.
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `horses_featurized_horse_jockey.csv`

In [3]:
# Load the featurized horse/jockey table from the repository's streamline data folder.
horses_featurized = pd.read_csv(f"{BASE_DIR}/data/streamline/horses_featurized_horse_jockey.csv", low_memory=False)
horses_featurized.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,jockey_prev_3_position_rain,jockey_prev_1_finishing_time_ratio_rain,jockey_prev_2_finishing_time_ratio_rain,jockey_prev_3_finishing_time_ratio_rain,jockey_prev_1_position_rhum,jockey_prev_2_position_rhum,jockey_prev_3_position_rhum,jockey_prev_1_finishing_time_ratio_rhum,jockey_prev_2_finishing_time_ratio_rhum,jockey_prev_3_finishing_time_ratio_rhum
0,302858,Kings Return,6.0,4.0,0.6,1,W P Mullins,D J Casey,1,0,...,,1.047545,1.00416,,,,,,,
1,302858,Majestic Red I,6.0,5.0,0.047619,0,John Hackett,Conor O'Dwyer,2,8,...,1.0,1.01981,1.0,1.0,2.0,40.0,4.0,1.003698,1.055865,1.011652
2,302858,Clearly Canadian,6.0,2.0,0.166667,0,D T Hughes,G Cotter,3,1.5,...,,,,,,,,,,
3,302858,Bernestic Wonder,8.0,1.0,0.058824,0,E McNamara,J Old Jones,4,dist,...,4.0,1.01598,1.000159,1.055469,2.0,,,1.002641,,
4,302858,Beauty's Pride,5.0,6.0,0.038462,0,J J Lennon,T Martin,5,dist,...,,1.001492,1.0,,,,,,,


In [4]:
horses_featurized.shape

(202304, 155)

---

# Drop Columns w/ High `NaN`

In [5]:
# Print, for every column, the fraction of rows in which it is missing.
for column in horses_featurized.columns:
    print(f"{column} {horses_featurized[column].isnull().mean()}")

rid 0.0
horseName 0.0
age 0.0
saddle 0.025565485605820944
decimalPrice 0.0
isFav 0.0
trainerName 0.0
jockeyName 0.0
position 0.0
positionL 0.0
dist 0.0
outHandicap 0.0
RPR 0.07551506643467257
TR 0.4116725324264473
OR 0.4185680955393863
father 3.9544447959506484e-05
mother 5.9316671939259726e-05
gfather 0.000874920911104081
weight 0.0
res_win 0.0
res_place 0.0
res_show 0.0
finishing time 0.0
finishing time ratio 0.0
d_weight 0.24859122904144257
d_last_race 0.24859122904144257
d_first_race 0.24859122904144257
prev_1_position 0.24859122904144257
prev_2_position 0.41180599493831066
prev_3_position 0.5289761942423283
prev_1_finishing_time_ratio 0.24859122904144257
prev_2_finishing_time_ratio 0.41180599493831066
prev_3_finishing_time_ratio 0.5289761942423283
prev_1_global_finishing_time_ratio 0.24859122904144257
prev_2_global_finishing_time_ratio 0.41180599493831066
prev_3_global_finishing_time_ratio 0.5289761942423283
prev_1_position_course 0.6818006564378362
prev_2_position_course 0.829187

Before dropping `TR` and `OR` (each is missing in roughly 41% of rows), check how strongly they correlate with variables that are kept (`decimalPrice`, `RPR`).

In [6]:
horses_featurized[['decimalPrice', 'RPR', 'TR', 'OR']].corr()

Unnamed: 0,decimalPrice,RPR,TR,OR
decimalPrice,1.0,0.38025,0.260033,0.302478
RPR,0.38025,1.0,0.631196,0.771179
TR,0.260033,0.631196,1.0,0.495747
OR,0.302478,0.771179,0.495747,1.0


In [7]:
# Drop the sparse rating columns, the pedigree columns, and every horse-level
# history feature. The dropped history columns follow a regular naming scheme,
# so they are generated rather than listed one by one.
horses_featurized_trunc = horses_featurized.drop(columns=[
    'TR', 'OR',
    'father', 'mother', 'gfather',
    'd_weight', 'd_last_race', 'd_first_race',
    # prev_{1,2,3}_<stat> for the unconditioned history statistics
    *(
        f'prev_{k}_{stat}'
        for stat in ('position', 'finishing_time_ratio', 'global_finishing_time_ratio')
        for k in (1, 2, 3)
    ),
    # prev_{1,2,3}_<stat>_<condition> for each conditioning variable
    *(
        f'prev_{k}_{stat}_{condition}'
        for condition in ('course', 'metric', 'ncond', 'runners', 'month', 'temp', 'msl', 'rain', 'rhum')
        for stat in ('position', 'finishing_time_ratio')
        for k in (1, 2, 3)
    ),
])

In [8]:
# Re-check the missing-value fraction of each column that survived the drop.
for column in horses_featurized_trunc.columns:
    print(f"{column} {horses_featurized_trunc[column].isnull().mean()}")

rid 0.0
horseName 0.0
age 0.0
saddle 0.025565485605820944
decimalPrice 0.0
isFav 0.0
trainerName 0.0
jockeyName 0.0
position 0.0
positionL 0.0
dist 0.0
outHandicap 0.0
RPR 0.07551506643467257
weight 0.0
res_win 0.0
res_place 0.0
res_show 0.0
finishing time 0.0
finishing time ratio 0.0
jockey_d_last_race 0.013148528946535907
jockey_d_first_race 0.013148528946535907
jockey_prev_1_position 0.013148528946535907
jockey_prev_2_position 0.022990153432458083
jockey_prev_3_position 0.031086879152167037
jockey_prev_1_finishing_time_ratio 0.013148528946535907
jockey_prev_2_finishing_time_ratio 0.022990153432458083
jockey_prev_3_finishing_time_ratio 0.031086879152167037
jockey_prev_1_global_finishing_time_ratio 0.013148528946535907
jockey_prev_2_global_finishing_time_ratio 0.022990153432458083
jockey_prev_3_global_finishing_time_ratio 0.031086879152167037
jockey_prev_1_position_course 0.08721527997469156
jockey_prev_2_position_course 0.13629488294843403
jockey_prev_3_position_course 0.172048006959

In [9]:
horses_featurized_trunc.shape

(202304, 84)

In [10]:
# Keep only the rows that have no missing value in any remaining column.
horses_featurized_trunc_dropna = horses_featurized_trunc.dropna()
horses_featurized_trunc_dropna.shape

(142066, 84)

In [12]:
len(np.unique(horses_featurized_trunc['rid']))

20201

In [11]:
len(np.unique(horses_featurized_trunc_dropna['rid']))

18591

In [12]:
# Confirm that no column has any missing value left after `dropna`.
for column in horses_featurized_trunc_dropna.columns:
    print(f"{column} {horses_featurized_trunc_dropna[column].isnull().mean()}")

rid 0.0
horseName 0.0
age 0.0
saddle 0.0
decimalPrice 0.0
isFav 0.0
trainerName 0.0
jockeyName 0.0
position 0.0
positionL 0.0
dist 0.0
outHandicap 0.0
RPR 0.0
weight 0.0
res_win 0.0
res_place 0.0
res_show 0.0
finishing time 0.0
finishing time ratio 0.0
jockey_d_last_race 0.0
jockey_d_first_race 0.0
jockey_prev_1_position 0.0
jockey_prev_2_position 0.0
jockey_prev_3_position 0.0
jockey_prev_1_finishing_time_ratio 0.0
jockey_prev_2_finishing_time_ratio 0.0
jockey_prev_3_finishing_time_ratio 0.0
jockey_prev_1_global_finishing_time_ratio 0.0
jockey_prev_2_global_finishing_time_ratio 0.0
jockey_prev_3_global_finishing_time_ratio 0.0
jockey_prev_1_position_course 0.0
jockey_prev_2_position_course 0.0
jockey_prev_3_position_course 0.0
jockey_prev_1_finishing_time_ratio_course 0.0
jockey_prev_2_finishing_time_ratio_course 0.0
jockey_prev_3_finishing_time_ratio_course 0.0
jockey_prev_1_position_metric 0.0
jockey_prev_2_position_metric 0.0
jockey_prev_3_position_metric 0.0
jockey_prev_1_finishin

In [13]:
def reshape_horses_in_race(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """
    Reshape a two-horse slice of one race into a single wide row.

    Parameters
    ----------
    df : DataFrame
        Frame whose first two rows are the horses to pair. It must contain a
        `rid` column, and both rows are expected to carry the same `rid`.

    Returns
    -------
    DataFrame
        The columns of the first row prefixed with `horse1_`, followed by the
        columns of the second row prefixed with `horse2_`. The `rid` column is
        the join key: it is left unprefixed and appears only once. Because the
        two rows are joined on `rid`, the result is empty if their `rid`
        values differ.

    Raises
    ------
    ValueError
        If `df` has fewer than two rows.
    """
    if len(df) < 2:
        raise ValueError(
            f"reshape_horses_in_race needs two rows (one per horse), got {len(df)}"
        )

    def _prefixed_row(position: int, prefix: str) -> pd.core.frame.DataFrame:
        # Select one row as a one-row frame and prefix every column except the key.
        return df.iloc[[position]].rename(
            columns=lambda c: c if c == 'rid' else prefix + c
        )

    return pd.merge(
        left=_prefixed_row(0, 'horse1_'),
        right=_prefixed_row(1, 'horse2_'),
        left_on='rid',
        right_on='rid',
    )

In [14]:
# Worked example on a single race. The race id is written out explicitly
# (it is the `rid` shown in this cell's output) instead of being looked up in
# `races_featurized`, which this notebook never defines.
EXAMPLE_RID = 377929
df = horses_featurized_trunc_dropna[horses_featurized_trunc_dropna['rid'] == EXAMPLE_RID]
df.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,jockey_prev_3_position_rain,jockey_prev_1_finishing_time_ratio_rain,jockey_prev_2_finishing_time_ratio_rain,jockey_prev_3_finishing_time_ratio_rain,jockey_prev_1_position_rhum,jockey_prev_2_position_rhum,jockey_prev_3_position_rhum,jockey_prev_1_finishing_time_ratio_rhum,jockey_prev_2_finishing_time_ratio_rhum,jockey_prev_3_finishing_time_ratio_rhum
41,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,4.0,1.000634,1.0,1.004078,1.0,8.0,2.0,1.0,1.029073,1.001057
43,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,3.0,1.022201,1.01314,1.000667,5.0,2.0,1.0,1.008458,1.001409,1.0
44,377929,Ciste,3.0,2.0,0.25,0,J S Bolger,Kevin Manning,4,3.5,...,3.0,1.01459,1.00317,1.003625,2.0,6.0,1.0,1.00317,1.010572,1.0
45,377929,Prairie Flame,3.0,4.0,0.090909,0,John M Oxx,Johnny Murtagh,5,6.0,...,1.0,1.003172,1.000334,1.0,4.0,3.0,2.0,1.004579,1.002265,1.002237


In [15]:
df.head(2)

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,jockey_prev_3_position_rain,jockey_prev_1_finishing_time_ratio_rain,jockey_prev_2_finishing_time_ratio_rain,jockey_prev_3_finishing_time_ratio_rain,jockey_prev_1_position_rhum,jockey_prev_2_position_rhum,jockey_prev_3_position_rhum,jockey_prev_1_finishing_time_ratio_rhum,jockey_prev_2_finishing_time_ratio_rhum,jockey_prev_3_finishing_time_ratio_rhum
41,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,4.0,1.000634,1.0,1.004078,1.0,8.0,2.0,1.0,1.029073,1.001057
43,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,3.0,1.022201,1.01314,1.000667,5.0,2.0,1.0,1.008458,1.001409,1.0


In [16]:
reshape_horses_in_race(df.head(2))

Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0,...,3.0,1.022201,1.01314,1.000667,5.0,2.0,1.0,1.008458,1.001409,1.0


In [17]:
df.iloc[[1,3]]

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,jockey_prev_3_position_rain,jockey_prev_1_finishing_time_ratio_rain,jockey_prev_2_finishing_time_ratio_rain,jockey_prev_3_finishing_time_ratio_rain,jockey_prev_1_position_rhum,jockey_prev_2_position_rhum,jockey_prev_3_position_rhum,jockey_prev_1_finishing_time_ratio_rhum,jockey_prev_2_finishing_time_ratio_rhum,jockey_prev_3_finishing_time_ratio_rhum
43,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,3.0,1.022201,1.01314,1.000667,5.0,2.0,1.0,1.008458,1.001409,1.0
45,377929,Prairie Flame,3.0,4.0,0.090909,0,John M Oxx,Johnny Murtagh,5,6.0,...,1.0,1.003172,1.000334,1.0,4.0,3.0,2.0,1.004579,1.002265,1.002237


In [18]:
df.iloc[[3,1]]

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,jockey_prev_3_position_rain,jockey_prev_1_finishing_time_ratio_rain,jockey_prev_2_finishing_time_ratio_rain,jockey_prev_3_finishing_time_ratio_rain,jockey_prev_1_position_rhum,jockey_prev_2_position_rhum,jockey_prev_3_position_rhum,jockey_prev_1_finishing_time_ratio_rhum,jockey_prev_2_finishing_time_ratio_rhum,jockey_prev_3_finishing_time_ratio_rhum
45,377929,Prairie Flame,3.0,4.0,0.090909,0,John M Oxx,Johnny Murtagh,5,6.0,...,1.0,1.003172,1.000334,1.0,4.0,3.0,2.0,1.004579,1.002265,1.002237
43,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,3.0,1.022201,1.01314,1.000667,5.0,2.0,1.0,1.008458,1.001409,1.0


In [19]:
import itertools

In [20]:
list(itertools.permutations(range(len(df)), 2))

[(0, 1),
 (0, 2),
 (0, 3),
 (1, 0),
 (1, 2),
 (1, 3),
 (2, 0),
 (2, 1),
 (2, 3),
 (3, 0),
 (3, 1),
 (3, 2)]

In [21]:
# Timing run on the first 1000 races: build one wide row for every ordered
# pair of distinct horses in each race.
accumulator = []

for rid in tqdm(horses_featurized_trunc_dropna['rid'].unique()[:1000]):
    horses = horses_featurized_trunc_dropna[horses_featurized_trunc_dropna['rid'] == rid]

    for fst, snd in itertools.permutations(range(len(horses)), 2):
        df = reshape_horses_in_race(horses.iloc[[fst, snd]])
        accumulator.append(df)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:16<00:00,  5.08it/s]


In [22]:
len(accumulator)

58404

In [23]:
%%time

# Timing check: concatenate the first 8000 paired frames and preview the result.
horses_featurized_jocket_paired_input = pd.concat(accumulator[:8000])
horses_featurized_jocket_paired_input.head()

CPU times: user 4.51 s, sys: 118 ms, total: 4.63 s
Wall time: 4.64 s


Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,3.0,1.022201,1.01314,1.000667,5.0,2.0,1.0,1.008458,1.001409,1.0
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,3.0,1.01459,1.00317,1.003625,2.0,6.0,1.0,1.00317,1.010572,1.0
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,1.0,1.003172,1.000334,1.0,4.0,3.0,2.0,1.004579,1.002265,1.002237
0,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,4.0,1.000634,1.0,1.004078,1.0,8.0,2.0,1.0,1.029073,1.001057
0,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,3.0,1.01459,1.00317,1.003625,2.0,6.0,1.0,1.00317,1.010572,1.0


In [24]:
%%time

# Timing check: concatenate the first 16000 paired frames and preview the result.
horses_featurized_jocket_paired_input = pd.concat(accumulator[:16000])
horses_featurized_jocket_paired_input.head()

CPU times: user 8.77 s, sys: 123 ms, total: 8.89 s
Wall time: 8.91 s


Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,3.0,1.022201,1.01314,1.000667,5.0,2.0,1.0,1.008458,1.001409,1.0
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,3.0,1.01459,1.00317,1.003625,2.0,6.0,1.0,1.00317,1.010572,1.0
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,1.0,1.003172,1.000334,1.0,4.0,3.0,2.0,1.004579,1.002265,1.002237
0,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,4.0,1.000634,1.0,1.004078,1.0,8.0,2.0,1.0,1.029073,1.001057
0,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,3.0,1.01459,1.00317,1.003625,2.0,6.0,1.0,1.00317,1.010572,1.0


In [25]:
%%time

# Timing check: concatenate the first 32000 paired frames and preview the result.
horses_featurized_jocket_paired_input = pd.concat(accumulator[:32000])
horses_featurized_jocket_paired_input.head()

CPU times: user 15.7 s, sys: 268 ms, total: 16 s
Wall time: 16.1 s


Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,3.0,1.022201,1.01314,1.000667,5.0,2.0,1.0,1.008458,1.001409,1.0
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,3.0,1.01459,1.00317,1.003625,2.0,6.0,1.0,1.00317,1.010572,1.0
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,1.0,1.003172,1.000334,1.0,4.0,3.0,2.0,1.004579,1.002265,1.002237
0,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,4.0,1.000634,1.0,1.004078,1.0,8.0,2.0,1.0,1.029073,1.001057
0,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,3.0,1.01459,1.00317,1.003625,2.0,6.0,1.0,1.00317,1.010572,1.0


In [26]:
%%time

# Timing check: concatenate every paired frame built so far and preview the result.
horses_featurized_jocket_paired_input = pd.concat(accumulator)
horses_featurized_jocket_paired_input.head()

CPU times: user 29.4 s, sys: 441 ms, total: 29.8 s
Wall time: 29.8 s


Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,3.0,1.022201,1.01314,1.000667,5.0,2.0,1.0,1.008458,1.001409,1.0
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,3.0,1.01459,1.00317,1.003625,2.0,6.0,1.0,1.00317,1.010572,1.0
0,377929,Strawberry Roan,3.0,1.0,0.714286,1,A P O'Brien,C Roche,1,0.0,...,1.0,1.003172,1.000334,1.0,4.0,3.0,2.0,1.004579,1.002265,1.002237
0,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,4.0,1.000634,1.0,1.004078,1.0,8.0,2.0,1.0,1.029073,1.001057
0,377929,Magical Cliche,3.0,3.0,0.090909,0,D K Weld,Mick Kinane,3,0.75,...,3.0,1.01459,1.00317,1.003625,2.0,6.0,1.0,1.00317,1.010572,1.0


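**NOTE**: The cells above time `pd.concat` on growing slices of the accumulator (8,000, 16,000, 32,000 and all 58,404 frames, built from only the first 1,000 races); the wall time grows roughly linearly with the number of frames.

The naive approach of building and concatenating the paired frames for all races at once runs into memory issues, so the races are processed in partitions below and each partition is written to its own file.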

In [31]:
# clear some memory: the unfiltered frames are not referenced again below;
# only `horses_featurized_trunc_dropna` is used from here on
del horses_featurized
del horses_featurized_trunc

In [27]:
# https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length
def split_list(a, n):
    """
    Split the sequence `a` into `n` consecutive slices of near-equal length.

    The first `len(a) % n` slices hold one element more than the rest, and the
    slices together cover `a` in order. Returns a list of `n` slices of `a`.
    """
    base_size, num_longer = divmod(len(a), n)

    chunks = []
    start = 0
    for chunk_index in range(n):
        # the leading `num_longer` chunks absorb the remainder, one element each
        stop = start + base_size + (1 if chunk_index < num_longer else 0)
        chunks.append(a[start:stop])
        start = stop

    return chunks

In [28]:
list(split_list([0,1,2,3,4,5,6,7,8,9,10,11,12,13], 3))

[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13]]

In [35]:
# Number of chunks the race ids are split into; one output file is written per chunk.
NUM_PARTITIONS = 50

# Split the distinct race ids into NUM_PARTITIONS consecutive, near-equal chunks.
partitions = split_list(horses_featurized_trunc_dropna['rid'].unique(), NUM_PARTITIONS)

In [36]:
# Every race id must land in exactly one partition: none is shared between
# partitions and none is lost. Checking all partitions (rather than only the
# first two) also keeps this cell valid when there is a single partition.
partitioned_rids = [rid for partition in partitions for rid in partition]
assert len(partitioned_rids) == len(set(partitioned_rids)), \
    "a race id appears in more than one partition"
assert set(partitioned_rids) == set(horses_featurized_trunc_dropna['rid'].unique()), \
    "the partitions do not cover every race id"

In [37]:
# Build the paired rows one partition of races at a time and write each
# partition to its own CSV, so that only one partition's paired frames are
# held in memory at once.
PAIRED_DIR = f"{BASE_DIR}/data/streamline/paired"
# `to_csv` does not create missing directories; make sure the target exists
# before the long-running loop starts.
os.makedirs(PAIRED_DIR, exist_ok=True)

for i in tqdm(range(NUM_PARTITIONS)):
    accumulator = []

    for rid in partitions[i]:
        horses = horses_featurized_trunc_dropna[horses_featurized_trunc_dropna['rid'] == rid]

        # every ordered pair (fst, snd) of distinct horses in this race
        for fst, snd in itertools.permutations(range(len(horses)), 2):
            accumulator.append(reshape_horses_in_race(horses.iloc[[fst, snd]]))

    pd.concat(accumulator).to_csv(
        f"{PAIRED_DIR}/horses_featurized_jocket_paired_input_{i}.csv", index=False
    )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [1:11:40<00:00, 86.00s/it]


---