# `make_labels.ipynb`

### Author: Anthony Hein

#### Last updated: 11/14/2021

# Overview:

Make labels for the datasets used in analysis.

---

## Setup

In [4]:
from datetime import datetime
import git
import os
import re
from typing import List
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
# Resolve the working directory of the enclosing git repository, searching
# upward from the current working directory, so the data paths below are built
# relative to the repository root rather than to this notebook's location.
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `X_train_everything.csv`

In [7]:
# Load the training split of the horse-pair table and preview the first rows.
# low_memory=False asks pandas to process the file as a whole instead of in
# internal chunks, which avoids per-chunk (mixed) dtype inference.
X_train_everything = pd.read_csv(f"{BASE_DIR}/data/analysis/X_train_everything.csv", low_memory=False)
X_train_everything.head()

Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4,entropy of odds
0,341451,Dance Design,3.0,6.0,0.181818,0,D K Weld,Mick Kinane,2,1.5,...,0,1,0,0,0,0,1,0,0,1.601872
1,341451,Idris,6.0,1.0,0.066667,0,J S Bolger,Kevin Manning,5,nk,...,0,1,0,0,0,0,1,0,0,1.601872
2,50025,Azra,2.0,11.0,0.090909,0,J S Bolger,Kevin Manning,3,1,...,0,1,0,0,0,0,1,0,0,2.103465
3,50025,Azra,2.0,11.0,0.090909,0,J S Bolger,Kevin Manning,3,1,...,0,1,0,0,0,0,1,0,0,2.103465
4,50025,Johan Cruyff,2.0,5.0,0.083333,0,A P O'Brien,Johnny Murtagh,5,nk,...,0,1,0,0,0,0,1,0,0,2.103465


In [8]:
# (rows, columns) of the training split.
X_train_everything.shape

(800666, 295)

---

## Load `X_dev_everything.csv`

In [9]:
# Load the dev split of the horse-pair table and preview the first rows.
# low_memory=False asks pandas to process the file as a whole instead of in
# internal chunks, which avoids per-chunk (mixed) dtype inference.
X_dev_everything = pd.read_csv(f"{BASE_DIR}/data/analysis/X_dev_everything.csv", low_memory=False)
X_dev_everything.head()

Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4,entropy of odds
0,159686,Gussy Goose,4.0,2.0,0.181818,0,David Wachman,Wayne Lordan,5,1.75,...,1,0,0,1,0,0,0,0,1,1.852846
1,159686,Mothers Finest,4.0,4.0,0.142857,0,Adrian Paul Keatley,Gary Carroll,7,2.5,...,1,0,0,1,0,0,0,0,1,1.852846
2,159686,Mothers Finest,4.0,4.0,0.142857,0,Adrian Paul Keatley,Gary Carroll,7,2.5,...,1,0,0,1,0,0,0,0,1,1.852846
3,159686,Mothers Finest,4.0,4.0,0.142857,0,Adrian Paul Keatley,Gary Carroll,7,2.5,...,1,0,0,1,0,0,0,0,1,1.852846
4,159686,Mothers Finest,4.0,4.0,0.142857,0,Adrian Paul Keatley,Gary Carroll,7,2.5,...,1,0,0,1,0,0,0,0,1,1.852846


In [10]:
# (rows, columns) of the dev split.
X_dev_everything.shape

(228766, 295)

---

## Load `X_test_everything.csv`

In [11]:
# Load the test split of the horse-pair table and preview the first rows.
# low_memory=False asks pandas to process the file as a whole instead of in
# internal chunks, which avoids per-chunk (mixed) dtype inference.
X_test_everything = pd.read_csv(f"{BASE_DIR}/data/analysis/X_test_everything.csv", low_memory=False)
X_test_everything.head()

Unnamed: 0,rid,horse1_horseName,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_trainerName,horse1_jockeyName,horse1_position,horse1_positionL,...,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4,entropy of odds
0,136782,Sestriere,3.0,7.0,0.029412,0,Kevin Prendergast,Chris Hayes,7,0.75,...,1,0,0,0,0,0,1,0,0,2.306015
1,136782,Sestriere,3.0,7.0,0.029412,0,Kevin Prendergast,Chris Hayes,7,0.75,...,1,0,0,0,0,0,1,0,0,2.306015
2,136782,Sestriere,3.0,7.0,0.029412,0,Kevin Prendergast,Chris Hayes,7,0.75,...,1,0,0,0,0,0,1,0,0,2.306015
3,136782,Sestriere,3.0,7.0,0.029412,0,Kevin Prendergast,Chris Hayes,7,0.75,...,1,0,0,0,0,0,1,0,0,2.306015
4,136782,Sestriere,3.0,7.0,0.029412,0,Kevin Prendergast,Chris Hayes,7,0.75,...,1,0,0,0,0,0,1,0,0,2.306015


In [12]:
# (rows, columns) of the test split.
X_test_everything.shape

(114392, 295)

---

## Pairwise Winner

If `horse1` placed higher (i.e. place is numerically lower) than `horse2`, the label will be **1**.

If `horse2` placed higher (i.e. place is numerically lower) than `horse1`, the label will be **0**.

Else (i.e. both horses are recorded with the same place), the label will be **0**.

In [14]:
# Narrow the training frame to the name and position of each horse in the
# pair; these are the columns used to develop and check the labeling below.
places = X_train_everything[['horse1_horseName', 'horse1_position', 'horse2_horseName', 'horse2_position']]
places.head()

Unnamed: 0,horse1_horseName,horse1_position,horse2_horseName,horse2_position
0,Dance Design,2,Idris,5
1,Idris,5,Dance Design,2
2,Azra,3,Johan Cruyff,5
3,Azra,3,Beautiful Fire,9
4,Johan Cruyff,5,Azra,3


In [15]:
# Pairs where both horses have the same recorded position (ties); these are
# the rows that fall into the "else" case of the labeling scheme.
places[places['horse1_position'] == places['horse2_position']]

Unnamed: 0,horse1_horseName,horse1_position,horse2_horseName,horse2_position
11432,Eloquent Way,2,Back To Bavaria,2
11485,Back To Bavaria,2,Eloquent Way,2
12006,Mr McKen,3,Clangigi,3
12007,Clangigi,3,Mr McKen,3
14987,Khatani,1,General Cloney,1
...,...,...,...,...
790555,Johann Bach,40,Elusive Heights,40
793598,Rainfall Radar,4,I'm All You Need,4
793607,I'm All You Need,4,Rainfall Radar,4
793872,Pullman Brown,5,Le Vagabond,5


In [35]:
def get_pairwise_winner(row: pd.Series) -> int:
    """
    Label a single horse-pair row with its pairwise winner.

    Implements the labeling scheme described above: the label is 1 only when
    `horse1` finished strictly ahead of `horse2` (numerically lower position).
    A worse finish and a tie both give 0.

    Parameters
    ----------
    row : pd.Series
        One row of a pair table, e.g. as passed by `DataFrame.apply(..., axis=1)`
        or obtained via `DataFrame.iloc[i]`. Only the keys `horse1_position`
        and `horse2_position` are read.

    Returns
    -------
    int
        1 if `row['horse1_position'] < row['horse2_position']`, otherwise 0.
    """
    # Losses and ties both map to 0, so one strict comparison covers every case.
    return int(row['horse1_position'] < row['horse2_position'])

In [36]:
# Spot-check the labeling against rows displayed above.
# Rows 0 and 1 are the same pair in both orders (positions 2 vs 5 and 5 vs 2).
assert get_pairwise_winner(places.iloc[0]) == 1
assert get_pairwise_winner(places.iloc[1]) == 0
# Rows 11432 and 11485 are a tie (both horses at position 2) in both orders;
# a tie must be labeled 0 regardless of which horse is listed first.
assert get_pairwise_winner(places.iloc[11432]) == 0
assert get_pairwise_winner(places.iloc[11485]) == 0

In [37]:
# Label every training pair: axis=1 calls get_pairwise_winner once per row.
pairwise_winner_labels = places.apply(get_pairwise_winner, axis=1)
pairwise_winner_labels

0         1
1         0
2         1
3         1
4         0
         ..
800661    0
800662    0
800663    1
800664    1
800665    1
Length: 800666, dtype: int64

Sanity checks:

In [38]:
# Class balance: number of pairs that received each label.
pairwise_winner_labels.value_counts()

0    400488
1    400178
dtype: int64

In [39]:
# Convert the labels to an integer NumPy array and show the first 20.
# NOTE: this rebinds `pairwise_winner_labels` from a pandas Series to an
# ndarray, so the `.value_counts()` cell above fails if re-run after this one.
pairwise_winner_labels = np.array(pairwise_winner_labels, dtype=int)
pairwise_winner_labels[:20]

array([1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0])

Compute for all three datasets and save to files:

In [40]:
# Label every row of the training split (in row order) and write the labels as
# integers ('%i'). The array is 1-D, so np.savetxt writes one label per line
# and the delimiter is never emitted.
np.savetxt(f"{BASE_DIR}/data/analysis/X_train_pairwise_winner_labels.csv",
           np.array(X_train_everything.apply(get_pairwise_winner, axis=1), dtype=int),
           fmt='%i',
           delimiter=',')

In [41]:
# Label every row of the dev split (in row order) and write the labels as
# integers ('%i'). The array is 1-D, so np.savetxt writes one label per line
# and the delimiter is never emitted.
np.savetxt(f"{BASE_DIR}/data/analysis/X_dev_pairwise_winner_labels.csv",
           np.array(X_dev_everything.apply(get_pairwise_winner, axis=1), dtype=int),
           fmt='%i',
           delimiter=',')

In [42]:
# Label every row of the test split (in row order) and write the labels as
# integers ('%i'). The array is 1-D, so np.savetxt writes one label per line
# and the delimiter is never emitted.
np.savetxt(f"{BASE_DIR}/data/analysis/X_test_pairwise_winner_labels.csv",
           np.array(X_test_everything.apply(get_pairwise_winner, axis=1), dtype=int),
           fmt='%i',
           delimiter=',')

---