# `making_paired_input_non_solution.ipynb`

### Author: Anthony Hein

#### Last updated: 11/8/2021

# Overview:

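This notebook builds the paired input. It starts by loading the featurized horse/jockey data (`horses_featurized_horse_jockey.csv`) and removing columns and rows with missing values.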

---

## Setup

In [1]:
from datetime import datetime
import git
import os
import re
from typing import List
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Apply seaborn's "whitegrid" theme once so every later plot shares the same look.
sns.set_theme(style="whitegrid")

In [2]:
# Resolve the repository's working directory by searching upwards from the
# current working directory, so data paths do not depend on where the notebook
# was launched inside the repository.
BASE_DIR = git.Repo(
    path=os.getcwd(),
    search_parent_directories=True,
).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `horses_featurized_horse_jockey.csv`

In [3]:
# Read the featurized horse/jockey table from the repository's data folder.
# `low_memory=False` makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-dtype inference within a column.
horses_featurized = pd.read_csv(
    f"{BASE_DIR}/data/streamline/horses_featurized_horse_jockey.csv",
    low_memory=False,
)
# Preview the first five rows.
horses_featurized.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,jockey_prev_3_position_rain,jockey_prev_1_finishing_time_ratio_rain,jockey_prev_2_finishing_time_ratio_rain,jockey_prev_3_finishing_time_ratio_rain,jockey_prev_1_position_rhum,jockey_prev_2_position_rhum,jockey_prev_3_position_rhum,jockey_prev_1_finishing_time_ratio_rhum,jockey_prev_2_finishing_time_ratio_rhum,jockey_prev_3_finishing_time_ratio_rhum
0,302858,Kings Return,6.0,4.0,0.6,1,W P Mullins,D J Casey,1,0,...,,1.047545,1.00416,,,,,,,
1,302858,Majestic Red I,6.0,5.0,0.047619,0,John Hackett,Conor O'Dwyer,2,8,...,1.0,1.01981,1.0,1.0,2.0,40.0,4.0,1.003698,1.055865,1.011652
2,302858,Clearly Canadian,6.0,2.0,0.166667,0,D T Hughes,G Cotter,3,1.5,...,,,,,,,,,,
3,302858,Bernestic Wonder,8.0,1.0,0.058824,0,E McNamara,J Old Jones,4,dist,...,4.0,1.01598,1.000159,1.055469,2.0,,,1.002641,,
4,302858,Beauty's Pride,5.0,6.0,0.038462,0,J J Lennon,T Martin,5,dist,...,,1.001492,1.0,,,,,,,


In [4]:
# (rows, columns) of the freshly loaded frame, as a baseline for the drops below.
horses_featurized.shape

(202304, 155)

---

## Drop Columns with a High Fraction of `NaN`

In [5]:
# Print one line per column: the column name followed by the fraction of rows
# in which that column is null.
for column, values in horses_featurized.items():
    null_fraction = values.isnull().mean()
    print(f"{column} {null_fraction}")

rid 0.0
horseName 0.0
age 0.0
saddle 0.025565485605820944
decimalPrice 0.0
isFav 0.0
trainerName 0.0
jockeyName 0.0
position 0.0
positionL 0.0
dist 0.0
outHandicap 0.0
RPR 0.07551506643467257
TR 0.4116725324264473
OR 0.4185680955393863
father 3.9544447959506484e-05
mother 5.9316671939259726e-05
gfather 0.000874920911104081
weight 0.0
res_win 0.0
res_place 0.0
res_show 0.0
finishing time 0.0
finishing time ratio 0.0
d_weight 0.24859122904144257
d_last_race 0.24859122904144257
d_first_race 0.24859122904144257
prev_1_position 0.24859122904144257
prev_2_position 0.41180599493831066
prev_3_position 0.5289761942423283
prev_1_finishing_time_ratio 0.24859122904144257
prev_2_finishing_time_ratio 0.41180599493831066
prev_3_finishing_time_ratio 0.5289761942423283
prev_1_global_finishing_time_ratio 0.24859122904144257
prev_2_global_finishing_time_ratio 0.41180599493831066
prev_3_global_finishing_time_ratio 0.5289761942423283
prev_1_position_course 0.6818006564378362
prev_2_position_course 0.829187

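`TR` and `OR` are each missing in roughly 41% of rows. Check that they are highly correlated with other variables (e.g. `RPR`), so that dropping them loses little information.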

In [6]:
# Pairwise Pearson correlations between the price and the three rating columns.
horses_featurized.loc[:, ['decimalPrice', 'RPR', 'TR', 'OR']].corr(method="pearson")

Unnamed: 0,decimalPrice,RPR,TR,OR
decimalPrice,1.0,0.38025,0.260033,0.302478
RPR,0.38025,1.0,0.631196,0.771179
TR,0.260033,0.631196,1.0,0.495747
OR,0.302478,0.771179,0.495747,1.0


In [7]:
# Build the truncated frame by removing the 70 columns listed below.
# `drop` returns a new DataFrame, so `horses_featurized` itself is unchanged.
horses_featurized_trunc = horses_featurized.drop(
    labels=[
        # Ratings that are each null in roughly 41% of rows.
        'TR', 'OR',
        # Pedigree columns.
        # NOTE(review): these are almost never null, so they are presumably
        # dropped for a reason other than missingness -- confirm.
        'father', 'mother', 'gfather',
        # Jockey history features.
        # NOTE(review): presumably dropped for a high NaN fraction -- confirm.
        'jockey_d_last_race', 'jockey_d_first_race',
        # ... overall
        'jockey_prev_1_position', 'jockey_prev_2_position', 'jockey_prev_3_position',
        'jockey_prev_1_finishing_time_ratio', 'jockey_prev_2_finishing_time_ratio', 'jockey_prev_3_finishing_time_ratio',
        'jockey_prev_1_global_finishing_time_ratio', 'jockey_prev_2_global_finishing_time_ratio', 'jockey_prev_3_global_finishing_time_ratio',
        # ... by course
        'jockey_prev_1_position_course', 'jockey_prev_2_position_course', 'jockey_prev_3_position_course',
        'jockey_prev_1_finishing_time_ratio_course', 'jockey_prev_2_finishing_time_ratio_course', 'jockey_prev_3_finishing_time_ratio_course',
        # ... by metric
        'jockey_prev_1_position_metric', 'jockey_prev_2_position_metric', 'jockey_prev_3_position_metric',
        'jockey_prev_1_finishing_time_ratio_metric', 'jockey_prev_2_finishing_time_ratio_metric', 'jockey_prev_3_finishing_time_ratio_metric',
        # ... by ncond
        'jockey_prev_1_position_ncond', 'jockey_prev_2_position_ncond', 'jockey_prev_3_position_ncond',
        'jockey_prev_1_finishing_time_ratio_ncond', 'jockey_prev_2_finishing_time_ratio_ncond', 'jockey_prev_3_finishing_time_ratio_ncond',
        # ... by runners
        'jockey_prev_1_position_runners', 'jockey_prev_2_position_runners', 'jockey_prev_3_position_runners',
        'jockey_prev_1_finishing_time_ratio_runners', 'jockey_prev_2_finishing_time_ratio_runners', 'jockey_prev_3_finishing_time_ratio_runners',
        # ... by month
        'jockey_prev_1_position_month', 'jockey_prev_2_position_month', 'jockey_prev_3_position_month',
        'jockey_prev_1_finishing_time_ratio_month', 'jockey_prev_2_finishing_time_ratio_month', 'jockey_prev_3_finishing_time_ratio_month',
        # ... by temp
        'jockey_prev_1_position_temp', 'jockey_prev_2_position_temp', 'jockey_prev_3_position_temp',
        'jockey_prev_1_finishing_time_ratio_temp', 'jockey_prev_2_finishing_time_ratio_temp', 'jockey_prev_3_finishing_time_ratio_temp',
        # ... by msl
        'jockey_prev_1_position_msl', 'jockey_prev_2_position_msl', 'jockey_prev_3_position_msl',
        'jockey_prev_1_finishing_time_ratio_msl', 'jockey_prev_2_finishing_time_ratio_msl', 'jockey_prev_3_finishing_time_ratio_msl',
        # ... by rain
        'jockey_prev_1_position_rain', 'jockey_prev_2_position_rain', 'jockey_prev_3_position_rain',
        'jockey_prev_1_finishing_time_ratio_rain', 'jockey_prev_2_finishing_time_ratio_rain', 'jockey_prev_3_finishing_time_ratio_rain',
        # ... by rhum
        'jockey_prev_1_position_rhum', 'jockey_prev_2_position_rhum', 'jockey_prev_3_position_rhum',
        'jockey_prev_1_finishing_time_ratio_rhum', 'jockey_prev_2_finishing_time_ratio_rhum', 'jockey_prev_3_finishing_time_ratio_rhum',
    ],
    axis="columns",
)

In [8]:
# Repeat the per-column null-fraction report on the truncated frame to see
# which of the remaining columns still contain missing values.
for column, values in horses_featurized_trunc.items():
    null_fraction = values.isnull().mean()
    print(f"{column} {null_fraction}")

rid 0.0
horseName 0.0
age 0.0
saddle 0.025565485605820944
decimalPrice 0.0
isFav 0.0
trainerName 0.0
jockeyName 0.0
position 0.0
positionL 0.0
dist 0.0
outHandicap 0.0
RPR 0.07551506643467257
weight 0.0
res_win 0.0
res_place 0.0
res_show 0.0
finishing time 0.0
finishing time ratio 0.0
d_weight 0.24859122904144257
d_last_race 0.24859122904144257
d_first_race 0.24859122904144257
prev_1_position 0.24859122904144257
prev_2_position 0.41180599493831066
prev_3_position 0.5289761942423283
prev_1_finishing_time_ratio 0.24859122904144257
prev_2_finishing_time_ratio 0.41180599493831066
prev_3_finishing_time_ratio 0.5289761942423283
prev_1_global_finishing_time_ratio 0.24859122904144257
prev_2_global_finishing_time_ratio 0.41180599493831066
prev_3_global_finishing_time_ratio 0.5289761942423283
prev_1_position_course 0.6818006564378362
prev_2_position_course 0.8291877570389118
prev_3_position_course 0.8866062954761151
prev_1_finishing_time_ratio_course 0.6818006564378362
prev_2_finishing_time_rati

In [9]:
# (rows, columns) after the column drop; the row count should match the loaded frame.
horses_featurized_trunc.shape

(202304, 85)

In [10]:
# Keep only the rows in which every remaining column is populated.
# NOTE(review): per the recorded outputs this keeps 2,213 of 202,304 rows,
# since several retained columns are still mostly null -- confirm that this
# level of row loss is intended.
horses_featurized_trunc_dropna = horses_featurized_trunc.dropna(
    axis="index",
    how="any",
)
horses_featurized_trunc_dropna.shape

(2213, 85)

In [11]:
# Number of distinct `rid` values left among the fully populated rows.
horses_featurized_trunc_dropna['rid'].nunique()

1454

---