# `random_forest_without_race_without_weather_pairwise_winners.ipynb`

### Author: Anthony Hein

#### Last updated: 11/20/2021

# Overview:

Use the datasets
* `X_train_preprocess_without_race.csv`
* `X_dev_preprocess_without_race.csv`
* `X_test_preprocess_without_race.csv`

and the targets in
* `X_train_pairwise_winner_labels.csv`
* `X_dev_pairwise_winner_labels.csv`
* `X_test_pairwise_winner_labels.csv`

to make a random forests model that tries to predict the pairwise winner between each pair of runners.

---

## Setup

In [1]:
from datetime import datetime
import git
import os
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Remove Features that Involve Weather

In [3]:
drop_cols = [
    'horse1_jockey_prev_1_position_temp',
    'horse1_jockey_prev_2_position_temp',
    'horse1_jockey_prev_3_position_temp',
    'horse1_jockey_prev_1_finishing_time_ratio_temp',
    'horse1_jockey_prev_2_finishing_time_ratio_temp',
    'horse1_jockey_prev_3_finishing_time_ratio_temp',
    'horse1_jockey_prev_1_position_msl',
    'horse1_jockey_prev_2_position_msl',
    'horse1_jockey_prev_3_position_msl',
    'horse1_jockey_prev_1_finishing_time_ratio_msl',
    'horse1_jockey_prev_2_finishing_time_ratio_msl',
    'horse1_jockey_prev_3_finishing_time_ratio_msl',
    'horse1_jockey_prev_1_position_rain',
    'horse1_jockey_prev_2_position_rain',
    'horse1_jockey_prev_3_position_rain',
    'horse1_jockey_prev_1_finishing_time_ratio_rain',
    'horse1_jockey_prev_2_finishing_time_ratio_rain',
    'horse1_jockey_prev_3_finishing_time_ratio_rain',
    'horse1_jockey_prev_1_position_rhum',
    'horse1_jockey_prev_2_position_rhum',
    'horse1_jockey_prev_3_position_rhum',
    'horse1_jockey_prev_1_finishing_time_ratio_rhum',
    'horse1_jockey_prev_2_finishing_time_ratio_rhum',
    'horse1_jockey_prev_3_finishing_time_ratio_rhum',
    'horse2_jockey_prev_1_position_temp',
    'horse2_jockey_prev_2_position_temp',
    'horse2_jockey_prev_3_position_temp',
    'horse2_jockey_prev_1_finishing_time_ratio_temp',
    'horse2_jockey_prev_2_finishing_time_ratio_temp',
    'horse2_jockey_prev_3_finishing_time_ratio_temp',
    'horse2_jockey_prev_1_position_msl',
    'horse2_jockey_prev_2_position_msl',
    'horse2_jockey_prev_3_position_msl',
    'horse2_jockey_prev_1_finishing_time_ratio_msl',
    'horse2_jockey_prev_2_finishing_time_ratio_msl',
    'horse2_jockey_prev_3_finishing_time_ratio_msl',
    'horse2_jockey_prev_1_position_rain',
    'horse2_jockey_prev_2_position_rain',
    'horse2_jockey_prev_3_position_rain',
    'horse2_jockey_prev_1_finishing_time_ratio_rain',
    'horse2_jockey_prev_2_finishing_time_ratio_rain',
    'horse2_jockey_prev_3_finishing_time_ratio_rain',
    'horse2_jockey_prev_1_position_rhum',
    'horse2_jockey_prev_2_position_rhum',
    'horse2_jockey_prev_3_position_rhum',
    'horse2_jockey_prev_1_finishing_time_ratio_rhum',
    'horse2_jockey_prev_2_finishing_time_ratio_rhum',
    'horse2_jockey_prev_3_finishing_time_ratio_rhum'
]

---

## Load `X_train_preprocess_without_race.csv`

In [4]:
X_train = pd.read_csv(f"{BASE_DIR}/data/analysis/X_train_preprocess_without_race.csv", low_memory=False)
X_train.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,0.2,0.230769,0.19697,0.0,0.0,0.73125,0.692308,0.007631,0.236544,0.1,...,0.05,0.805104,0.780139,0.783438,0.1,0.025,0.025,0.805104,0.799642,0.780139
1,0.4,0.038462,0.072222,0.0,0.0,0.73125,0.74359,0.00505,0.236544,0.1,...,0.1,0.801416,0.781238,0.79416,0.075,0.05,0.075,0.801416,0.800487,0.785703
2,0.133333,0.423077,0.098485,0.0,0.0,0.65,0.705128,0.00074,0.237283,0.025,...,0.1,0.799909,0.780139,0.786816,0.025,0.1,0.025,0.799642,0.806486,0.780139
3,0.133333,0.423077,0.098485,0.0,0.0,0.65,0.705128,0.00074,0.237283,0.025,...,0.075,0.800176,0.781623,0.78187,0.05,0.075,0.05,0.801163,0.801416,0.780963
4,0.133333,0.192308,0.090278,0.0,0.0,0.6625,0.730769,0.00074,0.234655,0.05,...,0.1,0.799642,0.783601,0.785468,0.125,0.1,0.025,0.803191,0.805104,0.780139


In [5]:
X_train.shape

(800666, 144)

In [6]:
X_train = X_train.drop(columns=drop_cols)
X_train.shape

(800666, 96)

---

## Load `X_dev_preprocess_without_race.csv`

In [7]:
X_dev = pd.read_csv(f"{BASE_DIR}/data/analysis/X_dev_preprocess_without_race.csv", low_memory=False)
X_dev.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,0.285714,0.105263,0.2,0.0,0.0,0.608392,0.769231,0.001516,0.629407,0.25,...,0.2,0.828972,0.837641,0.843255,0.075,0.05,0.275,0.834583,0.828972,0.853796
1,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.075,0.826039,0.826039,0.842799,0.025,0.025,0.075,0.826039,0.826039,0.842799
2,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.125,0.826413,0.841758,0.844367,0.125,0.05,0.275,0.843955,0.826413,0.841758
3,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.05,0.832401,0.832114,0.828366,0.1,0.075,0.05,0.832401,0.837064,0.828519
4,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.175,0.857255,0.835266,0.834946,0.15,0.025,0.2,0.857255,0.826039,0.837392


In [8]:
X_dev.shape

(228766, 144)

In [9]:
X_dev = X_dev.drop(columns=drop_cols)
X_dev.shape

(228766, 96)

---

## Load `X_test_preprocess_without_race.csv`

In [10]:
X_test = pd.read_csv(f"{BASE_DIR}/data/analysis/X_test_preprocess_without_race.csv", low_memory=False)
X_test.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.2,0.865544,0.847264,0.853154,0.025,0.2,0.3,0.840533,0.850856,0.864844
1,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.15,0.864781,0.840533,0.855519,0.025,0.025,0.025,0.840533,0.840533,0.840533
2,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.075,0.838534,0.843337,0.846422,0.2,0.1,0.125,0.853412,0.842583,0.847859
3,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.25,0.851685,0.849673,0.854624,0.15,0.25,0.2,0.849673,0.854624,0.858051
4,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.25,0.847892,0.842776,0.862423,0.175,0.075,0.275,0.853634,0.844529,0.862845


In [11]:
X_test.shape

(114392, 144)

In [12]:
X_test = X_test.drop(columns=drop_cols)
X_test.shape

(114392, 96)

---

## Load Pairwise Winner Labels

In [13]:
X_train_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/x_train_pairwise_winner_labels.csv",
                                            dtype=int,
                                            delimiter=',')

In [14]:
X_dev_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/X_dev_pairwise_winner_labels.csv",
                                          dtype=int,
                                          delimiter=',')

In [15]:
X_test_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/X_test_pairwise_winner_labels.csv",
                                           dtype=int,
                                           delimiter=',')

---

## Random Forest Model

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import PredefinedSplit, GridSearchCV

## Fixing Random Seed for Reproducibility
np.random.seed(0)

In [17]:
## Hyperparameter Space

space = {
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 5, 10, 50],
}

In [18]:
X_train_plus_dev = pd.concat([X_train, X_dev])
X_train_plus_dev.shape

(1029432, 96)

In [19]:
X_train_plus_dev_pairwise_winner_labels = np.concatenate(
    (X_train_pairwise_winner_labels, X_dev_pairwise_winner_labels)
)
X_train_plus_dev_pairwise_winner_labels

array([1, 0, 1, ..., 1, 1, 1])

In [20]:
len(X_train), len(X_dev)

(800666, 228766)

In [21]:
test_fold = np.concatenate(
    (np.full(len(X_train), -1),
     np.zeros(len(X_dev)))
)
test_fold

array([-1., -1., -1., ...,  0.,  0.,  0.])

In [22]:
## Predefined Split
ps = PredefinedSplit(test_fold)
ps

PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0]))

In [23]:
rf = RandomForestClassifier()

In [24]:
rf = GridSearchCV(rf, space, verbose=25, refit=False, cv=ps)

In [25]:
search = rf.fit(X_train_plus_dev.to_numpy(), X_train_plus_dev_pairwise_winner_labels)

Fitting 1 folds for each of 32 candidates, totalling 32 fits
[CV 1/1; 1/32] START max_features=auto, min_samples_leaf=1, n_estimators=10.....
[CV 1/1; 1/32] END max_features=auto, min_samples_leaf=1, n_estimators=10;, score=0.743 total time= 2.0min
[CV 1/1; 2/32] START max_features=auto, min_samples_leaf=1, n_estimators=50.....
[CV 1/1; 2/32] END max_features=auto, min_samples_leaf=1, n_estimators=50;, score=0.814 total time= 5.7min
[CV 1/1; 3/32] START max_features=auto, min_samples_leaf=1, n_estimators=100....
[CV 1/1; 3/32] END max_features=auto, min_samples_leaf=1, n_estimators=100;, score=0.841 total time=11.5min
[CV 1/1; 4/32] START max_features=auto, min_samples_leaf=1, n_estimators=200....
[CV 1/1; 4/32] END max_features=auto, min_samples_leaf=1, n_estimators=200;, score=0.857 total time=24.0min
[CV 1/1; 5/32] START max_features=auto, min_samples_leaf=5, n_estimators=10.....
[CV 1/1; 5/32] END max_features=auto, min_samples_leaf=5, n_estimators=10;, score=0.805 total time= 1.2m

In [26]:
print(f"Best Score: {search.best_score_}")
print(f"Best Hyperparameters: {search.best_params_}")

Best Score: 0.8621342332339595
Best Hyperparameters: {'max_features': 'sqrt', 'min_samples_leaf': 10, 'n_estimators': 200}


In [27]:
pd.DataFrame(search.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_min_samples_leaf,param_n_estimators,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
27,913.31785,0.0,5.937805,0.0,sqrt,10,200,"{'max_features': 'sqrt', 'min_samples_leaf': 1...",0.862134,0.862134,0.0,1
23,1156.029455,0.0,7.634989,0.0,sqrt,5,200,"{'max_features': 'sqrt', 'min_samples_leaf': 5...",0.860788,0.860788,0.0,2
7,1256.915178,0.0,8.259788,0.0,auto,5,200,"{'max_features': 'auto', 'min_samples_leaf': 5...",0.860351,0.860351,0.0,3
22,577.610196,0.0,3.908439,0.0,sqrt,5,100,"{'max_features': 'sqrt', 'min_samples_leaf': 5...",0.859608,0.859608,0.0,4
19,1217.028372,0.0,8.599396,0.0,sqrt,1,200,"{'max_features': 'sqrt', 'min_samples_leaf': 1...",0.859087,0.859087,0.0,5
6,646.568997,0.0,4.70729,0.0,auto,5,100,"{'max_features': 'auto', 'min_samples_leaf': 5...",0.85879,0.85879,0.0,6
3,1425.515649,0.0,16.528174,0.0,auto,1,200,"{'max_features': 'auto', 'min_samples_leaf': 1...",0.856976,0.856976,0.0,7
11,1123.404186,0.0,7.243047,0.0,auto,10,200,"{'max_features': 'auto', 'min_samples_leaf': 1...",0.85657,0.85657,0.0,8
26,459.502765,0.0,2.95261,0.0,sqrt,10,100,"{'max_features': 'sqrt', 'min_samples_leaf': 1...",0.851923,0.851923,0.0,9
10,559.194858,0.0,3.619974,0.0,auto,10,100,"{'max_features': 'auto', 'min_samples_leaf': 1...",0.851621,0.851621,0.0,10


In [28]:
pd.DataFrame(search.cv_results_).to_csv(
    f"{BASE_DIR}/data/analysis/random_forest/without_race_without_weather.csv", index=False
)

In [29]:
import pickle

rf = RandomForestClassifier(
    max_features='sqrt',
    min_samples_leaf=10,
    n_estimators=200,
).fit(X_train.to_numpy(), X_train_pairwise_winner_labels)

with open(f"{BASE_DIR}/data/analysis/random_forest/without_race_without_weather.pkl",'wb') as f:
    pickle.dump(rf, f)

---