# `neural_network_with_race_pairwise_winners.ipynb`

### Author: Anthony Hein

#### Last updated: 11/14/2021

# Overview:

Use the datasets
* `X_train_preprocess_with_race.csv`
* `X_dev_preprocess_with_race.csv`
* `X_test_preprocess_with_race.csv`

and the targets in
* `X_train_pairwise_winner_labels.csv`
* `X_dev_pairwise_winner_labels.csv`
* `X_test_pairwise_winner_labels.csv`

to make a neural network model that tries to predict the pairwise winner between each pair of runners.

---

## Setup

In [1]:
from datetime import datetime
import git
import os
import re
from typing import List
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `X_train_preprocess_with_race.csv`

In [3]:
X_train = pd.read_csv(f"{BASE_DIR}/data/analysis/X_train_preprocess_with_race.csv", low_memory=False)
X_train.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,pressure_level_1,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4
0,0.2,0.230769,0.19697,0.0,0.0,0.73125,0.692308,0.007631,0.236544,0.1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.4,0.038462,0.072222,0.0,0.0,0.73125,0.74359,0.00505,0.236544,0.1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.133333,0.423077,0.098485,0.0,0.0,0.65,0.705128,0.00074,0.237283,0.025,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.133333,0.423077,0.098485,0.0,0.0,0.65,0.705128,0.00074,0.237283,0.025,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.133333,0.192308,0.090278,0.0,0.0,0.6625,0.730769,0.00074,0.234655,0.05,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
X_train.shape

(800666, 245)

---

## Load `X_dev_preprocess_with_race.csv`

In [5]:
X_dev = pd.read_csv(f"{BASE_DIR}/data/analysis/X_dev_preprocess_with_race.csv", low_memory=False)
X_dev.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,pressure_level_1,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4
0,0.285714,0.105263,0.2,0.0,0.0,0.608392,0.769231,0.001516,0.629407,0.25,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [6]:
X_dev.shape

(228766, 245)

---

## Load `X_test_preprocess_with_race.csv`

In [7]:
X_test = pd.read_csv(f"{BASE_DIR}/data/analysis/X_test_preprocess_with_race.csv", low_memory=False)
X_test.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,pressure_level_1,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4
0,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.0,1.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0
1,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.0,1.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0
2,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.0,1.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0
3,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.0,1.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0
4,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.0,1.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0


In [8]:
X_test.shape

(114392, 245)

---

## Load Pairwise Winner Labels

In [9]:
X_train_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/x_train_pairwise_winner_labels.csv",
                                            dtype=int,
                                            delimiter=',')

In [10]:
X_dev_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/X_dev_pairwise_winner_labels.csv",
                                          dtype=int,
                                          delimiter=',')

In [11]:
X_test_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/X_test_pairwise_winner_labels.csv",
                                           dtype=int,
                                           delimiter=',')

## Neural Network Architecture

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Set the device to use
# CUDA refers to the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Fixing Random Seed for Reproducibility
torch.manual_seed(0)
np.random.seed(0) 

In [13]:
X_train_tensor = torch.from_numpy(X_train.to_numpy()).float().to(device)

y_train_tensor = torch.from_numpy(X_train_pairwise_winner_labels).float().to(device)
y_train_tensor = torch.reshape(y_train_tensor, (-1, 1))

X_dev_tensor = torch.from_numpy(X_dev.to_numpy()).float().to(device)

y_dev_tensor = torch.from_numpy(X_dev_pairwise_winner_labels).float().to(device)
y_dev_tensor = torch.reshape(y_dev_tensor, (-1, 1))

In [14]:
X_train_dataset = list(zip(X_train_tensor, y_train_tensor))
X_dev_dataset = list(zip(X_dev_tensor, y_dev_tensor))

In [15]:
class Net(nn.Module):
    def __init__(self, input_size, num_hidden_layers, hidden_layer_sizes):
        super(Net, self).__init__()
        self.hidden_layers = [nn.Linear(input_size, hidden_layer_sizes[0])]
        self.hidden_layers += [
            nn.Linear(hidden_layer_sizes[i-1], hidden_layer_sizes[i])
            for i
            in range(1, num_hidden_layers)
        ]
        self.activation = nn.ReLU()
        self.output = nn.Linear(hidden_layer_sizes[-1], 1)
    
    def forward(self, x):
        z = x
        for hidden_layer in self.hidden_layers:
            z = self.activation(hidden_layer(z))

        return self.output(z)

In [16]:
class Trainer():
    def __init__(self, net, optim, loss_function, train_loader):
        self.net = net
        self.optim = optim
        self.loss_function = loss_function
        self.train_loader = train_loader

    def train(self, epochs):
        losses = []
        for epoch in range(epochs):
            epoch_loss = 0.0
            epoch_steps = 0
            for data in self.train_loader:
                
                X = data[0].to(device)
                y = data[1].to(device)
                
                self.optim.zero_grad()

                preds = self.net.forward(X)

                loss = self.loss_function(preds, y)

                loss.backward()

                self.optim.step()

                epoch_loss += loss.item()
                epoch_steps += 1

            losses.append(epoch_loss / epoch_steps)
            print("epoch [%d]: loss %.3f" % (epoch+1, losses[-1]))
        return losses

In [25]:
## Hyperparameters

num_epochs = 10
batch_size = 256
learning_rate = 0.025

In [26]:
train_loader = DataLoader(dataset=X_train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
dev_loader = DataLoader(dataset=X_dev_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

In [27]:
net = Net(X_train_tensor.shape[1], 1, [600])
net = net.to(device)
opt = optim.Adam(net.parameters(), lr=learning_rate)
loss_function = nn.BCEWithLogitsLoss()

trainer = Trainer(net=net, optim=opt, loss_function=loss_function, train_loader=train_loader)

losses = trainer.train(num_epochs)

epoch [1]: loss 0.410
epoch [2]: loss 0.331
epoch [3]: loss 0.312
epoch [4]: loss 0.301
epoch [5]: loss 0.294
epoch [6]: loss 0.288
epoch [7]: loss 0.284
epoch [8]: loss 0.280
epoch [9]: loss 0.278
epoch [10]: loss 0.275


Good hyperparameters:

* num_epochs = 10, batch_size = 256, learning_rate = 0.02, Net(X_train_tensor.shape[1], 1, [600]), 

In [179]:
err = 0
tot = 0
with torch.no_grad():
    for data in train_loader:
        
        X = data[0].to(device)
        y = data[1].to(device)

        # raw output of network for X
        preds = torch.round(torch.sigmoid(net(X)))

        tot += y.size(0)

        err += torch.sum(y != preds)

acc_percent = 100 - (100 * err / tot)
print('Accuracy of NN prediction on training set: %5.2f%%' % acc_percent)

Accuracy of NN prediction on training set: 92.90%


In [180]:
err = 0
tot = 0
with torch.no_grad():
    for data in dev_loader:
        
        X = data[0].to(device)
        y = data[1].to(device)

        # raw output of network for X
        preds = torch.round(torch.sigmoid(net(X)))

        tot += y.size(0)

        err += torch.sum(y != preds)

acc_percent = 100 - (100 * err / tot)
print('Accuracy of NN prediction on dev set: %5.2f%%' % acc_percent)

Accuracy of NN prediction on dev set: 92.54%


---