# `preprocess_data.ipynb`

### Author: Anthony Hein

#### Last updated: 11/14/2021

# Overview:

Preprocess the data by scaling the features and removing variables that must not be used during prediction (e.g., outcome labels or row identifiers).

---

## Setup

In [3]:
from datetime import datetime
import git
import os
import re
from typing import List
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Locate the enclosing git repository by searching upward from the current
# working directory, and use its working-tree root as the base for all paths.
project_repo = git.Repo(os.getcwd(), search_parent_directories=True)
BASE_DIR = project_repo.working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `X_train_everything.csv`

In [5]:
# Read the complete training table. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
train_csv_path = f"{BASE_DIR}/data/analysis/X_train_everything.csv"
X_train_everything = pd.read_csv(train_csv_path, low_memory=False)
X_train_everything.head(5)

Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4,entropy of odds
0,341451,Dance Design,3.0,6.0,0.181818,0,D K Weld,Mick Kinane,2,1.5,...,0,1,0,0,0,0,1,0,0,1.601872
1,341451,Idris,6.0,1.0,0.066667,0,J S Bolger,Kevin Manning,5,nk,...,0,1,0,0,0,0,1,0,0,1.601872
2,50025,Azra,2.0,11.0,0.090909,0,J S Bolger,Kevin Manning,3,1,...,0,1,0,0,0,0,1,0,0,2.103465
3,50025,Azra,2.0,11.0,0.090909,0,J S Bolger,Kevin Manning,3,1,...,0,1,0,0,0,0,1,0,0,2.103465
4,50025,Johan Cruyff,2.0,5.0,0.083333,0,A P O'Brien,Johnny Murtagh,5,nk,...,0,1,0,0,0,0,1,0,0,2.103465


In [6]:
# (rows, columns) of the raw training table, before any columns are dropped.
X_train_everything.shape

(800666, 295)

---

## Load `X_dev_everything.csv`

In [7]:
# Read the complete dev table. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
dev_csv_path = f"{BASE_DIR}/data/analysis/X_dev_everything.csv"
X_dev_everything = pd.read_csv(dev_csv_path, low_memory=False)
X_dev_everything.head(5)

Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4,entropy of odds
0,159686,Gussy Goose,4.0,2.0,0.181818,0,David Wachman,Wayne Lordan,5,1.75,...,1,0,0,1,0,0,0,0,1,1.852846
1,159686,Mothers Finest,4.0,4.0,0.142857,0,Adrian Paul Keatley,Gary Carroll,7,2.5,...,1,0,0,1,0,0,0,0,1,1.852846
2,159686,Mothers Finest,4.0,4.0,0.142857,0,Adrian Paul Keatley,Gary Carroll,7,2.5,...,1,0,0,1,0,0,0,0,1,1.852846
3,159686,Mothers Finest,4.0,4.0,0.142857,0,Adrian Paul Keatley,Gary Carroll,7,2.5,...,1,0,0,1,0,0,0,0,1,1.852846
4,159686,Mothers Finest,4.0,4.0,0.142857,0,Adrian Paul Keatley,Gary Carroll,7,2.5,...,1,0,0,1,0,0,0,0,1,1.852846


In [8]:
# (rows, columns) of the raw dev table, before any columns are dropped.
X_dev_everything.shape

(228766, 295)

---

## Load `X_test_everything.csv`

In [9]:
# Read the complete test table. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
test_csv_path = f"{BASE_DIR}/data/analysis/X_test_everything.csv"
X_test_everything = pd.read_csv(test_csv_path, low_memory=False)
X_test_everything.head(5)

Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4,entropy of odds
0,136782,Sestriere,3.0,7.0,0.029412,0,Kevin Prendergast,Chris Hayes,7,0.75,...,1,0,0,0,0,0,1,0,0,2.306015
1,136782,Sestriere,3.0,7.0,0.029412,0,Kevin Prendergast,Chris Hayes,7,0.75,...,1,0,0,0,0,0,1,0,0,2.306015
2,136782,Sestriere,3.0,7.0,0.029412,0,Kevin Prendergast,Chris Hayes,7,0.75,...,1,0,0,0,0,0,1,0,0,2.306015
3,136782,Sestriere,3.0,7.0,0.029412,0,Kevin Prendergast,Chris Hayes,7,0.75,...,1,0,0,0,0,0,1,0,0,2.306015
4,136782,Sestriere,3.0,7.0,0.029412,0,Kevin Prendergast,Chris Hayes,7,0.75,...,1,0,0,0,0,0,1,0,0,2.306015


In [10]:
# (rows, columns) of the raw test table, before any columns are dropped.
X_test_everything.shape

(114392, 295)

---

## Drop Columns

The columns we drop either contain the race result or are unnecessary for training.

In [12]:
# Show every column name of the raw training table as a plain Python list.
X_train_everything.columns.tolist()

['rid',
 'horse1_horseName',
 'horse1_age',
 'horse1_saddle',
 'horse1_decimalPrice',
 'horse1_isFav',
 'horse1_trainerName',
 'horse1_jockeyName',
 'horse1_position',
 'horse1_positionL',
 'horse1_dist',
 'horse1_outHandicap',
 'horse1_RPR',
 'horse1_weight',
 'horse1_res_win',
 'horse1_res_place',
 'horse1_res_show',
 'horse1_finishing time',
 'horse1_finishing time ratio',
 'horse1_jockey_d_last_race',
 'horse1_jockey_d_first_race',
 'horse1_jockey_prev_1_position',
 'horse1_jockey_prev_2_position',
 'horse1_jockey_prev_3_position',
 'horse1_jockey_prev_1_finishing_time_ratio',
 'horse1_jockey_prev_2_finishing_time_ratio',
 'horse1_jockey_prev_3_finishing_time_ratio',
 'horse1_jockey_prev_1_global_finishing_time_ratio',
 'horse1_jockey_prev_2_global_finishing_time_ratio',
 'horse1_jockey_prev_3_global_finishing_time_ratio',
 'horse1_jockey_prev_1_position_course',
 'horse1_jockey_prev_2_position_course',
 'horse1_jockey_prev_3_position_course',
 'horse1_jockey_prev_1_finishing_time_

In [13]:
# Per-horse fields that identify the runner or record how the race turned out.
# Each one exists once per horse, as horse1_<field> and horse2_<field>.
per_horse_fields = [
    'horseName',
    'trainerName',
    'jockeyName',
    'position',
    'positionL',
    'dist',
    'res_win',
    'res_place',
    'res_show',
    'finishing time',
    'finishing time ratio',
]

# Race-level identifiers, raw categorical/descriptive fields and outcome
# summaries that are removed as well.
race_level_fields = [
    'course', 'title', 'winningTime', 'metric', 'ncond', 'class', 'runners',
    '1st_place_rank_in_odds', '2nd_place_rank_in_odds', '3rd_place_rank_in_odds',
    '1st_rank_in_odds_place', '2nd_rank_in_odds_place', '3rd_rank_in_odds_place',
    'placeAvailable', 'showAvailable',
    'favoriteWon', 'favoritePlaced', 'favoriteShowed',
    'lat', 'lng', 'datetime',
    'station no', 'station name', 'station lat', 'station lng',
    'station reading date',
    'entropy of odds',
]

# Full drop list: race id, then all horse1_* fields, then all horse2_* fields,
# then the race-level fields.
drop_cols = (
    ['rid']
    + [f"horse{horse_no}_{field}" for horse_no in (1, 2) for field in per_horse_fields]
    + race_level_fields
)

# Remove the same columns from each split; the raw frames are left unmodified.
X_train_everything_dropped, X_dev_everything_dropped, X_test_everything_dropped = (
    raw_split.drop(columns=drop_cols)
    for raw_split in (X_train_everything, X_dev_everything, X_test_everything)
)

In [15]:
# (rows, columns) of the training table after the drop above.
X_train_everything_dropped.shape

(800666, 245)

In [16]:
# Show the column names that remain in the training table after the drop.
X_train_everything_dropped.columns.tolist()

['horse1_age',
 'horse1_saddle',
 'horse1_decimalPrice',
 'horse1_isFav',
 'horse1_outHandicap',
 'horse1_RPR',
 'horse1_weight',
 'horse1_jockey_d_last_race',
 'horse1_jockey_d_first_race',
 'horse1_jockey_prev_1_position',
 'horse1_jockey_prev_2_position',
 'horse1_jockey_prev_3_position',
 'horse1_jockey_prev_1_finishing_time_ratio',
 'horse1_jockey_prev_2_finishing_time_ratio',
 'horse1_jockey_prev_3_finishing_time_ratio',
 'horse1_jockey_prev_1_global_finishing_time_ratio',
 'horse1_jockey_prev_2_global_finishing_time_ratio',
 'horse1_jockey_prev_3_global_finishing_time_ratio',
 'horse1_jockey_prev_1_position_course',
 'horse1_jockey_prev_2_position_course',
 'horse1_jockey_prev_3_position_course',
 'horse1_jockey_prev_1_finishing_time_ratio_course',
 'horse1_jockey_prev_2_finishing_time_ratio_course',
 'horse1_jockey_prev_3_finishing_time_ratio_course',
 'horse1_jockey_prev_1_position_metric',
 'horse1_jockey_prev_2_position_metric',
 'horse1_jockey_prev_3_position_metric',
 'hor

Sanity check: after dropping, every remaining column should be numeric.

In [18]:
# Each CSV was type-inferred independently, so a column that is numeric in the
# training split could still have been parsed as object in dev or test.
# Check every split, and name the offending columns if the check fails.
for split_name, split_frame in (
    ("train", X_train_everything_dropped),
    ("dev", X_dev_everything_dropped),
    ("test", X_test_everything_dropped),
):
    non_numeric_cols = split_frame.select_dtypes(exclude=np.number).columns.tolist()
    assert len(non_numeric_cols) == 0, (
        f"{split_name} split still contains non-numeric columns: {non_numeric_cols}"
    )

---

## Scaling

In [19]:
# Start each "scaled" frame as an independent copy of its dropped counterpart,
# so the unscaled frames stay untouched when the copies are modified.
X_train_everything_dropped_scaled, X_dev_everything_dropped_scaled, X_test_everything_dropped_scaled = (
    unscaled_split.copy()
    for unscaled_split in (
        X_train_everything_dropped,
        X_dev_everything_dropped,
        X_test_everything_dropped,
    )
)

We apply the simplest scaling, dividing each column by its maximum, so that non-negative training data lies in $[0,1]$.

In [21]:
# Demonstrate the scaling on a single column: divide `year` by its largest value.
train_year = X_train_everything_dropped["year"]
train_year / max(train_year)

0         0.990079
1         0.990079
2         0.990079
3         0.990079
4         0.990079
            ...   
800661    1.000000
800662    1.000000
800663    1.000000
800664    1.000000
800665    1.000000
Name: year, Length: 800666, dtype: float64

In [26]:
# Scale every split by the per-column maxima of the TRAINING split only.
# Dividing dev/test by their own maxima would map the same raw value to
# different scaled values in each split, and would let evaluation data
# influence preprocessing. Dev/test values may therefore slightly exceed 1
# when they are larger than anything seen in training.
train_col_max = X_train_everything_dropped.max()

# A column whose training maximum is 0 would turn into NaN/inf under division;
# divide such columns by 1 instead, which leaves them unchanged.
train_col_max = train_col_max.replace(0, 1)

# Vectorised column-wise division; the Series of maxima is aligned to the
# frame's columns by label.
X_train_everything_dropped_scaled = X_train_everything_dropped.div(train_col_max, axis="columns")
X_dev_everything_dropped_scaled = X_dev_everything_dropped.div(train_col_max, axis="columns")
X_test_everything_dropped_scaled = X_test_everything_dropped.div(train_col_max, axis="columns")

---

## Remove Race Features

In [27]:
# Show the column names of the scaled training table.
X_train_everything_dropped_scaled.columns.tolist()

['horse1_age',
 'horse1_saddle',
 'horse1_decimalPrice',
 'horse1_isFav',
 'horse1_outHandicap',
 'horse1_RPR',
 'horse1_weight',
 'horse1_jockey_d_last_race',
 'horse1_jockey_d_first_race',
 'horse1_jockey_prev_1_position',
 'horse1_jockey_prev_2_position',
 'horse1_jockey_prev_3_position',
 'horse1_jockey_prev_1_finishing_time_ratio',
 'horse1_jockey_prev_2_finishing_time_ratio',
 'horse1_jockey_prev_3_finishing_time_ratio',
 'horse1_jockey_prev_1_global_finishing_time_ratio',
 'horse1_jockey_prev_2_global_finishing_time_ratio',
 'horse1_jockey_prev_3_global_finishing_time_ratio',
 'horse1_jockey_prev_1_position_course',
 'horse1_jockey_prev_2_position_course',
 'horse1_jockey_prev_3_position_course',
 'horse1_jockey_prev_1_finishing_time_ratio_course',
 'horse1_jockey_prev_2_finishing_time_ratio_course',
 'horse1_jockey_prev_3_finishing_time_ratio_course',
 'horse1_jockey_prev_1_position_metric',
 'horse1_jockey_prev_2_position_metric',
 'horse1_jockey_prev_3_position_metric',
 'hor

In [29]:
# Raw race/weather measurements.
race_measurements = [
    'margin',
    'dist to station',
    'temp',
    'msl',
    'rain',
    'rhum',
    'station reading timedelta',
]

# Category values behind the one-hot encoded race columns.
course_values = [
    'Ballinrobe', 'Bellewstown', 'Clonmel', 'Cork', 'Curragh',
    'Down Royal', 'Downpatrick', 'Dundalk', 'Fairyhouse', 'Galway',
    'Gowran Park', 'Killarney', 'Leopardstown', 'Limerick', 'Listowel',
    'Naas', 'Navan', 'Other', 'Punchestown', 'Roscommon',
    'Sligo', 'Tipperary', 'Tralee', 'Tramore', 'Wexford',
]
metric_values = [
    '1005.0', '1206.0', '1306.5', '1407.0', '1507.5',
    '1609.0', '1709.5', '1810.0', '1910.5', '2011.0',
    '2111.5', '2212.0', '2413.0', '2614.0', '2815.0',
    '3218.0', '3419.0', '3620.0', '4022.0', 'Other',
]
ncond_values = ['0', '1', '2', '4', '5', '6', '9', '11', '12', 'Other']
runners_values = [str(count) for count in range(6, 15)] + ['Other']
month_values = [str(month_no) for month_no in range(3, 13)] + ['Other']
level_ids = range(5)

# Assemble the race-level drop list in the same order the columns appear.
drop_cols = (
    race_measurements
    + [f"course__{value}" for value in course_values]
    + [f"metric__{value}" for value in metric_values]
    + [f"ncond__{value}" for value in ncond_values]
    + [f"runners__{value}" for value in runners_values]
    + ['month']
    + [f"month__{value}" for value in month_values]
    + ['year']
    + [f"temp_level_{level}" for level in level_ids]
    + [f"pressure_level_{level}" for level in level_ids]
    + ['is_raining']
    + [f"rhum_level_{level}" for level in level_ids]
)

# Derive the "without race" variants; the scaled frames themselves are not modified.
(
    X_train_everything_dropped_scaled_wo_race,
    X_dev_everything_dropped_scaled_wo_race,
    X_test_everything_dropped_scaled_wo_race,
) = (
    scaled_split.drop(columns=drop_cols)
    for scaled_split in (
        X_train_everything_dropped_scaled,
        X_dev_everything_dropped_scaled,
        X_test_everything_dropped_scaled,
    )
)

In [31]:
# (rows, columns) of the scaled training table once the race-level columns are removed.
X_train_everything_dropped_scaled_wo_race.shape

(800666, 144)

In [32]:
# Preview the first rows of the scaled training table without race-level columns.
X_train_everything_dropped_scaled_wo_race.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,0.2,0.230769,0.19697,0.0,0.0,0.73125,0.692308,0.007631,0.236544,0.1,...,0.05,0.805104,0.780139,0.783438,0.1,0.025,0.025,0.805104,0.799642,0.780139
1,0.4,0.038462,0.072222,0.0,0.0,0.73125,0.74359,0.00505,0.236544,0.1,...,0.1,0.801416,0.781238,0.79416,0.075,0.05,0.075,0.801416,0.800487,0.785703
2,0.133333,0.423077,0.098485,0.0,0.0,0.65,0.705128,0.00074,0.237283,0.025,...,0.1,0.799909,0.780139,0.786816,0.025,0.1,0.025,0.799642,0.806486,0.780139
3,0.133333,0.423077,0.098485,0.0,0.0,0.65,0.705128,0.00074,0.237283,0.025,...,0.075,0.800176,0.781623,0.78187,0.05,0.075,0.05,0.801163,0.801416,0.780963
4,0.133333,0.192308,0.090278,0.0,0.0,0.6625,0.730769,0.00074,0.234655,0.05,...,0.1,0.799642,0.783601,0.785468,0.125,0.1,0.025,0.803191,0.805104,0.780139


---

## Save Dataframes

In [33]:
# Write the scaled training features (race columns kept) to disk without the index.
train_with_race_path = f"{BASE_DIR}/data/analysis/X_train_preprocess_with_race.csv"
X_train_everything_dropped_scaled.to_csv(train_with_race_path, index=False)

In [34]:
# Write the scaled dev features (race columns kept) to disk without the index.
dev_with_race_path = f"{BASE_DIR}/data/analysis/X_dev_preprocess_with_race.csv"
X_dev_everything_dropped_scaled.to_csv(dev_with_race_path, index=False)

In [35]:
# Write the scaled test features (race columns kept) to disk without the index.
test_with_race_path = f"{BASE_DIR}/data/analysis/X_test_preprocess_with_race.csv"
X_test_everything_dropped_scaled.to_csv(test_with_race_path, index=False)

In [36]:
# Write the scaled training features with race columns removed; the index is not written.
train_without_race_path = f"{BASE_DIR}/data/analysis/X_train_preprocess_without_race.csv"
X_train_everything_dropped_scaled_wo_race.to_csv(train_without_race_path, index=False)

In [37]:
# Write the scaled dev features with race columns removed; the index is not written.
dev_without_race_path = f"{BASE_DIR}/data/analysis/X_dev_preprocess_without_race.csv"
X_dev_everything_dropped_scaled_wo_race.to_csv(dev_without_race_path, index=False)

In [38]:
# Write the scaled test features with race columns removed; the index is not written.
test_without_race_path = f"{BASE_DIR}/data/analysis/X_test_preprocess_without_race.csv"
X_test_everything_dropped_scaled_wo_race.to_csv(test_without_race_path, index=False)

---

## Save Identifying Information

Although we will not train on the race id or the horse names, we may need to refer back to them in order to evaluate more macroscopic metrics than simple accuracy. We will call this information "identifiers".

In [39]:
# Keep only the race id and the two horse names from the raw training table,
# then write them out (row order unchanged, index not written).
train_identifiers = X_train_everything[['rid', 'horse1_horseName', 'horse2_horseName']]
train_identifiers.to_csv(f"{BASE_DIR}/data/analysis/X_train_identifiers.csv", index=False)

In [40]:
# Keep only the race id and the two horse names from the raw dev table,
# then write them out (row order unchanged, index not written).
dev_identifiers = X_dev_everything[['rid', 'horse1_horseName', 'horse2_horseName']]
dev_identifiers.to_csv(f"{BASE_DIR}/data/analysis/X_dev_identifiers.csv", index=False)

In [41]:
# Keep only the race id and the two horse names from the raw test table,
# then write them out (row order unchanged, index not written).
test_identifiers = X_test_everything[['rid', 'horse1_horseName', 'horse2_horseName']]
test_identifiers.to_csv(f"{BASE_DIR}/data/analysis/X_test_identifiers.csv", index=False)

---