# `decision_tree_without_race_pairwise_winners.ipynb`

### Author: Anthony Hein

#### Last updated: 11/20/2021

# Overview:

Use the datasets
* `X_train_preprocess_without_race.csv`
* `X_dev_preprocess_without_race.csv`
* `X_test_preprocess_without_race.csv`

and the targets in
* `X_train_pairwise_winner_labels.csv`
* `X_dev_pairwise_winner_labels.csv`
* `X_test_pairwise_winner_labels.csv`

to make a decision tree that tries to predict the pairwise winner between each pair of runners.

---

## Setup

In [1]:
from datetime import datetime
import git
import os
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Resolve the repository root by searching upward from the current working
# directory; every data path below is built relative to it.
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `X_train_preprocess_without_race.csv`

In [3]:
# Read the preprocessed training features and preview the first rows.
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like a
# row index written out by an earlier `to_csv`; it is kept as a regular column
# here and therefore reaches the model as a feature — confirm this is intended.
X_train = pd.read_csv(f"{BASE_DIR}/data/analysis/X_train_preprocess_without_race.csv", low_memory=False)
X_train.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,0.2,0.230769,0.19697,0.0,0.0,0.73125,0.692308,0.007631,0.236544,0.1,...,0.05,0.805104,0.780139,0.783438,0.1,0.025,0.025,0.805104,0.799642,0.780139
1,0.4,0.038462,0.072222,0.0,0.0,0.73125,0.74359,0.00505,0.236544,0.1,...,0.1,0.801416,0.781238,0.79416,0.075,0.05,0.075,0.801416,0.800487,0.785703
2,0.133333,0.423077,0.098485,0.0,0.0,0.65,0.705128,0.00074,0.237283,0.025,...,0.1,0.799909,0.780139,0.786816,0.025,0.1,0.025,0.799642,0.806486,0.780139
3,0.133333,0.423077,0.098485,0.0,0.0,0.65,0.705128,0.00074,0.237283,0.025,...,0.075,0.800176,0.781623,0.78187,0.05,0.075,0.05,0.801163,0.801416,0.780963
4,0.133333,0.192308,0.090278,0.0,0.0,0.6625,0.730769,0.00074,0.234655,0.05,...,0.1,0.799642,0.783601,0.785468,0.125,0.1,0.025,0.803191,0.805104,0.780139


In [4]:
# Show the (rows, columns) dimensions of the training features.
X_train.shape

(800666, 144)

---

## Load `X_dev_preprocess_without_race.csv`

In [5]:
# Read the preprocessed dev features and preview the first rows.
# NOTE(review): as the preview shows, an `Unnamed: 0` column (apparently a
# serialized row index) is loaded as a regular column — confirm it should be
# part of the feature matrix.
X_dev = pd.read_csv(f"{BASE_DIR}/data/analysis/X_dev_preprocess_without_race.csv", low_memory=False)
X_dev.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,0.285714,0.105263,0.2,0.0,0.0,0.608392,0.769231,0.001516,0.629407,0.25,...,0.2,0.828972,0.837641,0.843255,0.075,0.05,0.275,0.834583,0.828972,0.853796
1,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.075,0.826039,0.826039,0.842799,0.025,0.025,0.075,0.826039,0.826039,0.842799
2,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.125,0.826413,0.841758,0.844367,0.125,0.05,0.275,0.843955,0.826413,0.841758
3,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.05,0.832401,0.832114,0.828366,0.1,0.075,0.05,0.832401,0.837064,0.828519
4,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.175,0.857255,0.835266,0.834946,0.15,0.025,0.2,0.857255,0.826039,0.837392


In [6]:
# Show the (rows, columns) dimensions of the dev features.
X_dev.shape

(228766, 144)

---

## Load `X_test_preprocess_without_race.csv`

In [7]:
# Read the preprocessed test features and preview the first rows.
# NOTE(review): as the preview shows, an `Unnamed: 0` column (apparently a
# serialized row index) is loaded as a regular column — confirm it should be
# part of the feature matrix.
X_test = pd.read_csv(f"{BASE_DIR}/data/analysis/X_test_preprocess_without_race.csv", low_memory=False)
X_test.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.2,0.865544,0.847264,0.853154,0.025,0.2,0.3,0.840533,0.850856,0.864844
1,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.15,0.864781,0.840533,0.855519,0.025,0.025,0.025,0.840533,0.840533,0.840533
2,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.075,0.838534,0.843337,0.846422,0.2,0.1,0.125,0.853412,0.842583,0.847859
3,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.25,0.851685,0.849673,0.854624,0.15,0.25,0.2,0.849673,0.854624,0.858051
4,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.25,0.847892,0.842776,0.862423,0.175,0.075,0.275,0.853634,0.844529,0.862845


In [8]:
# Show the (rows, columns) dimensions of the test features.
X_test.shape

(114392, 144)

---

## Load Pairwise Winner Labels

In [9]:
# Training labels for the pairwise-winner task, parsed as integers.
# The file name uses the `X_train_...` capitalisation listed in the overview
# and used by the dev/test label files, so the load also resolves on
# case-sensitive filesystems.
X_train_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/X_train_pairwise_winner_labels.csv",
                                            dtype=int,
                                            delimiter=',')

In [10]:
# Dev labels for the pairwise-winner task, parsed as integers.
X_dev_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/X_dev_pairwise_winner_labels.csv",
                                          dtype=int,
                                          delimiter=',')

In [11]:
# Test labels for the pairwise-winner task, parsed as integers.
X_test_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/X_test_pairwise_winner_labels.csv",
                                           dtype=int,
                                           delimiter=',')

---

## Decision Tree Classifier Model

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import PredefinedSplit, GridSearchCV

## Fixing Random Seed for Reproducibility
# NOTE(review): the estimators below are constructed without `random_state`,
# so repeatability presumably depends on this global NumPy seed and on running
# the cells in order from a fresh kernel — confirm.
np.random.seed(0)

In [13]:
## Hyperparameter Space

# Candidate values handed to the grid search; every combination is evaluated
# (2 * 2 * 3 * 3 * 3 = 108 candidates).
space = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 4, 8],
)

In [14]:
# Stack the train rows ahead of the dev rows in a single frame; the predefined
# split built below separates them again during the search.
# The original row labels are kept (no `ignore_index`), which does not matter
# here because the frame is converted with `.to_numpy()` before fitting.
X_train_plus_dev = pd.concat([X_train, X_dev])
X_train_plus_dev.shape

(1029432, 144)

In [15]:
# Join the labels in the same order as the feature rows: train first, then dev.
X_train_plus_dev_pairwise_winner_labels = np.concatenate(
    [X_train_pairwise_winner_labels, X_dev_pairwise_winner_labels],
    axis=0,
)
X_train_plus_dev_pairwise_winner_labels

array([1, 0, 1, ..., 1, 1, 1])

In [16]:
# Row counts of the two splits; these determine the fold layout built next.
len(X_train), len(X_dev)

(800666, 228766)

In [17]:
# Per-row fold labels for PredefinedSplit, aligned with X_train_plus_dev:
# -1 for the train rows (never used for validation) and 0 for the dev rows
# (the single validation fold).
test_fold = np.concatenate([
    np.repeat(-1, len(X_train)),
    np.zeros(len(X_dev)),
])
test_fold

array([-1., -1., -1., ...,  0.,  0.,  0.])

In [18]:
## Predefined Split
# Wrap the fold labels in a CV splitter so the grid search validates on the
# fixed dev rows instead of drawing random folds.
ps = PredefinedSplit(test_fold)
ps

PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0]))

In [19]:
# Base estimator with default settings; the grid search supplies the
# hyperparameter values for each candidate.
dt = DecisionTreeClassifier()

In [20]:
# Exhaustive search over `space`, scored on the predefined train/dev split.
# `refit=False`: no final model is retrained by the search itself (a tree is
# fitted by hand further down). `verbose=25` prints per-candidate progress.
# Note that `dt` is rebound here from the bare estimator to the search object.
dt = GridSearchCV(dt, space, verbose=25, refit=False, cv=ps)

In [21]:
# Run the search on the combined train+dev rows; the split object decides
# which rows train and which validate each candidate.
search = dt.fit(X_train_plus_dev.to_numpy(), X_train_plus_dev_pairwise_winner_labels)

Fitting 1 folds for each of 108 candidates, totalling 108 fits
[CV 1/1; 1/108] START criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, splitter=best
[CV 1/1; 1/108] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, splitter=best;, score=0.841 total time= 1.2min
[CV 1/1; 2/108] START criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, splitter=random
[CV 1/1; 2/108] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, splitter=random;, score=0.786 total time=  13.2s
[CV 1/1; 3/108] START criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=5, splitter=best
[CV 1/1; 3/108] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=5, splitter=best;, score=0.841 total time=  44.2s
[CV 1/1; 4/108] START criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=5, splitter=random
[CV 1/1; 4/108] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=5, splitt

[CV 1/1; 34/108] END criterion=gini, max_depth=10, min_samples_leaf=8, min_samples_split=5, splitter=random;, score=0.820 total time=  20.8s
[CV 1/1; 35/108] START criterion=gini, max_depth=10, min_samples_leaf=8, min_samples_split=10, splitter=best
[CV 1/1; 35/108] END criterion=gini, max_depth=10, min_samples_leaf=8, min_samples_split=10, splitter=best;, score=0.904 total time= 1.1min
[CV 1/1; 36/108] START criterion=gini, max_depth=10, min_samples_leaf=8, min_samples_split=10, splitter=random
[CV 1/1; 36/108] END criterion=gini, max_depth=10, min_samples_leaf=8, min_samples_split=10, splitter=random;, score=0.780 total time=  20.5s
[CV 1/1; 37/108] START criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, splitter=best
[CV 1/1; 37/108] END criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, splitter=best;, score=0.814 total time= 1.9min
[CV 1/1; 38/108] START criterion=gini, max_depth=20, min_samples_leaf=1, min_samples_split=2, splitter=random


[CV 1/1; 67/108] END criterion=entropy, max_depth=5, min_samples_leaf=8, min_samples_split=2, splitter=best;, score=0.827 total time=  37.2s
[CV 1/1; 68/108] START criterion=entropy, max_depth=5, min_samples_leaf=8, min_samples_split=2, splitter=random
[CV 1/1; 68/108] END criterion=entropy, max_depth=5, min_samples_leaf=8, min_samples_split=2, splitter=random;, score=0.724 total time=  11.8s
[CV 1/1; 69/108] START criterion=entropy, max_depth=5, min_samples_leaf=8, min_samples_split=5, splitter=best
[CV 1/1; 69/108] END criterion=entropy, max_depth=5, min_samples_leaf=8, min_samples_split=5, splitter=best;, score=0.827 total time=  36.9s
[CV 1/1; 70/108] START criterion=entropy, max_depth=5, min_samples_leaf=8, min_samples_split=5, splitter=random
[CV 1/1; 70/108] END criterion=entropy, max_depth=5, min_samples_leaf=8, min_samples_split=5, splitter=random;, score=0.825 total time=  18.7s
[CV 1/1; 71/108] START criterion=entropy, max_depth=5, min_samples_leaf=8, min_samples_split=10, s

[CV 1/1; 100/108] END criterion=entropy, max_depth=20, min_samples_leaf=4, min_samples_split=5, splitter=random;, score=0.875 total time=  27.7s
[CV 1/1; 101/108] START criterion=entropy, max_depth=20, min_samples_leaf=4, min_samples_split=10, splitter=best
[CV 1/1; 101/108] END criterion=entropy, max_depth=20, min_samples_leaf=4, min_samples_split=10, splitter=best;, score=0.883 total time= 1.4min
[CV 1/1; 102/108] START criterion=entropy, max_depth=20, min_samples_leaf=4, min_samples_split=10, splitter=random
[CV 1/1; 102/108] END criterion=entropy, max_depth=20, min_samples_leaf=4, min_samples_split=10, splitter=random;, score=0.888 total time=  26.0s
[CV 1/1; 103/108] START criterion=entropy, max_depth=20, min_samples_leaf=8, min_samples_split=2, splitter=best
[CV 1/1; 103/108] END criterion=entropy, max_depth=20, min_samples_leaf=8, min_samples_split=2, splitter=best;, score=0.899 total time= 1.4min
[CV 1/1; 104/108] START criterion=entropy, max_depth=20, min_samples_leaf=8, min_s

In [22]:
# Report the best validation score and the hyperparameter combination that
# achieved it.
print(f"Best Score: {search.best_score_}")
print(f"Best Hyperparameters: {search.best_params_}")

Best Score: 0.9049334254216098
Best Hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'splitter': 'best'}


In [24]:
# Tabulate the per-candidate timings, parameters, scores and ranks.
pd.DataFrame(search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_splitter,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,74.290251,0.0,0.166583,0.0,gini,5,1,2,best,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.841292,0.841292,0.0,58
1,12.844604,0.0,0.403162,0.0,gini,5,1,2,random,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.785877,0.785877,0.0,91
2,43.962635,0.0,0.214883,0.0,gini,5,1,5,best,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.841292,0.841292,0.0,58
3,13.208908,0.0,0.198514,0.0,gini,5,1,5,random,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.780964,0.780964,0.0,93
4,41.974117,0.0,0.184403,0.0,gini,5,1,10,best,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.841292,0.841292,0.0,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,27.075179,0.0,0.189619,0.0,entropy,20,8,2,random,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.892327,0.892327,0.0,26
104,85.813122,0.0,0.179066,0.0,entropy,20,8,5,best,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.899032,0.899032,0.0,20
105,27.511585,0.0,0.185908,0.0,entropy,20,8,5,random,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.892550,0.892550,0.0,25
106,85.534676,0.0,0.177749,0.0,entropy,20,8,10,best,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.900300,0.900300,0.0,16


In [25]:
# Persist the full grid-search results table.
# The output directory is created first (no-op if it already exists) so that a
# long-running search is not lost to a missing folder, e.g. on a fresh clone
# where generated-data directories are absent.
os.makedirs(f"{BASE_DIR}/data/analysis/decision_tree", exist_ok=True)
pd.DataFrame(search.cv_results_).to_csv(
    f"{BASE_DIR}/data/analysis/decision_tree/without_race_with_weather.csv", index=False
)

In [26]:
import pickle

# Make sure the output directory exists before spending time on the fit, so
# the model can always be written afterwards (no-op if it already exists).
os.makedirs(f"{BASE_DIR}/data/analysis/decision_tree", exist_ok=True)

# Fit a single tree on the training split only, using the hyperparameters the
# grid search reported as best. `dt` is rebound from the search object to the
# fitted classifier.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=10,
    splitter='best',
).fit(X_train.to_numpy(), X_train_pairwise_winner_labels)

# Serialise the fitted model next to the saved search results.
with open(f"{BASE_DIR}/data/analysis/decision_tree/without_race_with_weather.pkl",'wb') as f:
    pickle.dump(dt, f)

---