In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from horse.data.load_data import DataSet

perform = DataSet()
train, val, test = perform.train_val_test_split()

train.shape, val.shape, test.shape

((43157, 18), (5558, 18), (5721, 18))

In [48]:
import torch.nn as nn
from torch import Tensor, LongTensor
from torch import autograd, device
from torch import optim
import torch
import time
import numpy as np


class LinearRegWEmb(nn.Module):
    """ Simple Concat + Linear txfm for all embeddings """
    def __init__(self, n_num_feats, k_dim_field, k_dim_id) -> None:
        super(LinearRegWEmb, self).__init__()
        self.n_num_feats = n_num_feats
        self.sigmoid = torch.sigmoid
        # init embedding for ids
        self.emb_field = nn.Embedding(len(field2ix), k_dim_field)
        self.emb_jockey = nn.Embedding(len(jockey2ix), k_dim_id)
        self.emb_horse = nn.Embedding(len(horse2ix), k_dim_id)
        self.emb_trainer = nn.Embedding(len(trainer2ix), k_dim_id)
        # output layer
        out_dim = n_num_feats + k_dim_field + 3*k_dim_id
        self.Linear = nn.Linear(out_dim, 1)
        # init all dims
        nn.init.normal_(self.Linear.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_field.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_jockey.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_horse.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_trainer.weight, mean=0, std=0.1)

    def forward(self, x, field, jockey, horse, trainer):
        emb_f = self.emb_field(field)
        emb_j = self.emb_jockey(jockey)
        emb_h = self.emb_horse(horse)
        emb_t = self.emb_trainer(trainer)

        out = self.Linear(torch.concat([x, emb_f, emb_j, emb_h, emb_t], 1))
        return self.sigmoid(out)


def SSE(input, target):
    return (target-input)**2

def BCELoss(input, target):
    return nn.BCEWithLogitsLoss()(input, target)

# setting
batch_size = 20
epochs = 10
# model
K_DIM_F = 4
K_DIM_IX = 16
# optimizer
learning_rate = 5e-5
weight_decay = 1e-3

compute_device = device('cuda') if use_cuda else device('cpu')

model = LinearRegWEmb(n_num_feats=len(numerical_cols), k_dim_field=K_DIM_F, k_dim_id=K_DIM_IX).to(compute_device)
opt = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)


# data prep
X_train, y_train = get_feats(perform_train, numerical_cols, y_col, use_cuda)
# Loss
train_loss_by_ep = []
# Average Percision
val_ap_by_ep = []
test_ap_by_ep = []

for ep in range(epochs):
    t0 = time.time()
    ep_loss = []
    for batch_data in horse_data_loader(X_train, y_train, batch_size, shuffle=True):
        x, f, j, h, t, y = batch_data
        model.zero_grad()

        x = autograd.Variable(x)
        f = autograd.Variable(f)
        j = autograd.Variable(j)
        h = autograd.Variable(h)
        t = autograd.Variable(t)

        y_pred = model(x, f, j, h, t)
        loss = BCELoss(y_pred, y)
        loss.mean().backward()
        opt.step()

        ep_loss.append(loss.data.to(compute_device).tolist())
#         print(ep_loss)

    train_loss_by_ep.append(np.sqrt(np.mean(ep_loss)))
    
    # compute AP
    val_ap_by_ep.append(computeAP(perform_val, model, way='max', use_cuda=use_cuda))
    test_ap_by_ep.append(computeAP(perform_test, model, way='max', use_cuda=use_cuda))
    
    t1 = time.time()
    print(f'[{round(t1-t0, 3)}s] Iter={ep}, train loss={round(train_loss_by_ep[-1], 3)}')
    print(f'\t [VAL] AP={round(val_ap_by_ep[-1], 3)}; [TEST] AP={round(test_ap_by_ep[-1], 3)}')

[5.87s] Iter=0, train loss=0.929
	 [VAL] AP=0.092; [TEST] AP=0.102
[5.7s] Iter=1, train loss=0.92
	 [VAL] AP=0.103; [TEST] AP=0.102
[5.974s] Iter=2, train loss=0.912
	 [VAL] AP=0.107; [TEST] AP=0.105
[6.125s] Iter=3, train loss=0.904
	 [VAL] AP=0.107; [TEST] AP=0.106
[6.134s] Iter=4, train loss=0.896
	 [VAL] AP=0.111; [TEST] AP=0.108
[5.927s] Iter=5, train loss=0.889
	 [VAL] AP=0.114; [TEST] AP=0.108
[6.008s] Iter=6, train loss=0.882
	 [VAL] AP=0.114; [TEST] AP=0.11
[6.074s] Iter=7, train loss=0.876
	 [VAL] AP=0.111; [TEST] AP=0.113
[5.852s] Iter=8, train loss=0.871
	 [VAL] AP=0.114; [TEST] AP=0.115
[5.814s] Iter=9, train loss=0.866
	 [VAL] AP=0.116; [TEST] AP=0.114


In [49]:
class LinearRegWEmbv1(nn.Module):
    """ Dot All Embs into 1d for scale reduction """
    def __init__(self, n_num_feats, k_dim_field, k_dim_id) -> None:
        super(LinearRegWEmbv1, self).__init__()
        self.n_num_feats = n_num_feats
        self.sigmoid = torch.sigmoid
        # init embedding for ids
        self.emb_field = nn.Embedding(len(field2ix), k_dim_field)
        self.emb_jockey = nn.Embedding(len(jockey2ix), k_dim_id)
        self.emb_horse = nn.Embedding(len(horse2ix), k_dim_id)
        self.emb_trainer = nn.Embedding(len(trainer2ix), k_dim_id)
        # output layer
        out_dim = n_num_feats + 3
        self.Linear = nn.Linear(out_dim, 1)
        self.Linear_field = nn.Linear(k_dim_field, 1)
        # init all dims
        nn.init.normal_(self.Linear.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_field.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_jockey.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_horse.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_trainer.weight, mean=0, std=0.1)

    def forward(self, x, field, jockey, horse, trainer):
        emb_f = self.emb_field(field)
        emb_j = self.emb_jockey(jockey)
        emb_h = self.emb_horse(horse)
        emb_t = self.emb_trainer(trainer)
        f_val = self.Linear_field(emb_f)
        hj_val = torch.matmul(emb_h, emb_j.T).sum(1).unsqueeze(1)
        ht_val = torch.matmul(emb_h, emb_t.T).sum(1).unsqueeze(1)
        
        out = self.Linear(torch.concat([x, f_val, hj_val, ht_val],1))
        return self.sigmoid(out)
    
# setting
batch_size = 20
epochs = 10
# model
K_DIM_F = 4
K_DIM_IX = 16
# optimizer
learning_rate = 5e-5
weight_decay = 1e-3
use_cuda= True

compute_device = device('cuda') if use_cuda else device('cpu')

model = LinearRegWEmbv1(n_num_feats=len(numerical_cols), k_dim_field=K_DIM_F, k_dim_id=K_DIM_IX).to(compute_device)
opt = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)


# data prep
X_train, y_train = get_feats(perform_train, numerical_cols, y_col, use_cuda)
# Loss
train_loss_by_ep = []
# Average Percision
val_ap_by_ep = []
test_ap_by_ep = []

for ep in range(epochs):
    t0 = time.time()
    ep_loss = []
    for batch_data in horse_data_loader(X_train, y_train, batch_size, shuffle=True):
        x, f, j, h, t, y = batch_data
        model.zero_grad()

        x = autograd.Variable(x)
        f = autograd.Variable(f)
        j = autograd.Variable(j)
        h = autograd.Variable(h)
        t = autograd.Variable(t)

        y_pred = model(x, f, j, h, t)
        loss = BCELoss(y_pred, y)
        loss.mean().backward()
        opt.step()

        ep_loss.append(loss.data.to(compute_device).tolist())
#         print(ep_loss)

    train_loss_by_ep.append(np.sqrt(np.mean(ep_loss)))
    
    # compute AP
    val_ap_by_ep.append(computeAP(perform_val, model, way='max', use_cuda=use_cuda))
    test_ap_by_ep.append(computeAP(perform_test, model, way='max', use_cuda=use_cuda))
    
    t1 = time.time()
    print(f'[{round(t1-t0, 3)}s] Iter={ep}, train loss={round(train_loss_by_ep[-1], 3)}')
    print(f'\t [VAL] AP={round(val_ap_by_ep[-1], 3)}; [TEST] AP={round(test_ap_by_ep[-1], 3)}')

[6.287s] Iter=0, train loss=0.939
	 [VAL] AP=0.094; [TEST] AP=0.084
[6.297s] Iter=1, train loss=0.931
	 [VAL] AP=0.096; [TEST] AP=0.085
[6.292s] Iter=2, train loss=0.924
	 [VAL] AP=0.092; [TEST] AP=0.086
[6.304s] Iter=3, train loss=0.917
	 [VAL] AP=0.089; [TEST] AP=0.087
[6.217s] Iter=4, train loss=0.911
	 [VAL] AP=0.097; [TEST] AP=0.087
[6.308s] Iter=5, train loss=0.905
	 [VAL] AP=0.103; [TEST] AP=0.087
[6.163s] Iter=6, train loss=0.9
	 [VAL] AP=0.099; [TEST] AP=0.086
[6.269s] Iter=7, train loss=0.896
	 [VAL] AP=0.095; [TEST] AP=0.087
[6.314s] Iter=8, train loss=0.891
	 [VAL] AP=0.093; [TEST] AP=0.087
[6.3s] Iter=9, train loss=0.887
	 [VAL] AP=0.088; [TEST] AP=0.088


In [63]:
class LinearRegWEmbv2(nn.Module):
    """ Element wise multiplication of embs, and linear comb """
    def __init__(self, n_num_feats, k_dim_field, k_dim_id) -> None:
        super(LinearRegWEmbv2, self).__init__()
        self.n_num_feats = n_num_feats
        self.sigmoid = torch.sigmoid
        # init embedding for ids
        self.emb_field = nn.Embedding(len(field2ix), k_dim_field)
        self.emb_jockey = nn.Embedding(len(jockey2ix), k_dim_id)
        self.emb_horse = nn.Embedding(len(horse2ix), k_dim_id)
        self.emb_trainer = nn.Embedding(len(trainer2ix), k_dim_id)
        # output layer
        out_dim = n_num_feats + k_dim_field + 2*k_dim_id
        self.Linear = nn.Linear(out_dim, 1)
        # init all dims
        nn.init.normal_(self.Linear.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_field.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_jockey.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_horse.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_trainer.weight, mean=0, std=0.1)

    def forward(self, x, field, jockey, horse, trainer):
        emb_f = self.emb_field(field)
        emb_j = self.emb_jockey(jockey)
        emb_h = self.emb_horse(horse)
        emb_t = self.emb_trainer(trainer)
        hj_val = torch.mul(emb_h, emb_j)
        ht_val = torch.mul(emb_h, emb_t)
        
        out = self.Linear(torch.concat([x, emb_f, hj_val, ht_val], 1))
        return self.sigmoid(out)
    
# setting
batch_size = 20
epochs = 10
# model
K_DIM_F = 4
K_DIM_IX = 64
# optimizer
learning_rate = 5e-5
weight_decay = 1e-5
use_cuda= True

compute_device = device('cuda') if use_cuda else device('cpu')

model = LinearRegWEmbv2(n_num_feats=len(numerical_cols), k_dim_field=K_DIM_F, k_dim_id=K_DIM_IX).to(compute_device)
opt = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)


# data prep
X_train, y_train = get_feats(perform_train, numerical_cols, y_col, use_cuda)
# Loss
train_loss_by_ep = []
# Average Percision
val_ap_by_ep = []
test_ap_by_ep = []

for ep in range(epochs):
    t0 = time.time()
    ep_loss = []
    for batch_data in horse_data_loader(X_train, y_train, batch_size, shuffle=True):
        x, f, j, h, t, y = batch_data
        model.zero_grad()

        x = autograd.Variable(x)
        f = autograd.Variable(f)
        j = autograd.Variable(j)
        h = autograd.Variable(h)
        t = autograd.Variable(t)

        y_pred = model(x, f, j, h, t)
        loss = BCELoss(y_pred, y)
        loss.mean().backward()
        opt.step()

        ep_loss.append(loss.data.to(compute_device).tolist())
#         print(ep_loss)

    train_loss_by_ep.append(np.sqrt(np.mean(ep_loss)))
    
    # compute AP
    val_ap_by_ep.append(computeAP(perform_val, model, way='max', use_cuda=use_cuda))
    test_ap_by_ep.append(computeAP(perform_test, model, way='max', use_cuda=use_cuda))
    
    t1 = time.time()
    print(f'[{round(t1-t0, 3)}s] Iter={ep}, train loss={round(train_loss_by_ep[-1], 5)}')
    print(f'\t [VAL] AP={round(val_ap_by_ep[-1], 5)}; [TEST] AP={round(test_ap_by_ep[-1], 5)}')

[5.971s] Iter=0, train loss=0.92947
	 [VAL] AP=0.10262; [TEST] AP=0.10737
[5.859s] Iter=1, train loss=0.92199
	 [VAL] AP=0.09825; [TEST] AP=0.10847
[5.885s] Iter=2, train loss=0.91505
	 [VAL] AP=0.10262; [TEST] AP=0.11096
[6.173s] Iter=3, train loss=0.90867
	 [VAL] AP=0.10699; [TEST] AP=0.11179
[6.484s] Iter=4, train loss=0.90281
	 [VAL] AP=0.11572; [TEST] AP=0.11261
[6.057s] Iter=5, train loss=0.8974
	 [VAL] AP=0.12227; [TEST] AP=0.11565
[5.993s] Iter=6, train loss=0.89241
	 [VAL] AP=0.13537; [TEST] AP=0.11896
[6.04s] Iter=7, train loss=0.88784
	 [VAL] AP=0.131; [TEST] AP=0.12117
[6.099s] Iter=8, train loss=0.88361
	 [VAL] AP=0.131; [TEST] AP=0.122
[5.689s] Iter=9, train loss=0.87973
	 [VAL] AP=0.13319; [TEST] AP=0.1231


In [76]:
class GmfMlp(nn.Module):
    """ Dot All Embs into 1d for scale reduction """
    def __init__(self, n_num_feats, k_dim_field, k_dim_id, num_layers, p_dropout=0.05) -> None:
        super(GmfMlp, self).__init__()
        self.n_num_feats = n_num_feats
        self.sigmoid = torch.sigmoid
        # init embedding for ids
        self.emb_field = nn.Embedding(len(field2ix), k_dim_field)
        self.emb_jockey = nn.Embedding(len(jockey2ix), k_dim_id)
        self.emb_horse = nn.Embedding(len(horse2ix), k_dim_id)
        self.emb_trainer = nn.Embedding(len(trainer2ix), k_dim_id)
        # MLP layer
        feat_dim = n_num_feats + k_dim_field + 2*k_dim_id
        MLP_sizes = [int(feat_dim*(0.5**i)) for i in range(num_layers+1)]
        MLP_Layer=[]
        for i in range(num_layers):
            MLP_Layer.append(nn.Dropout(p_dropout))
            MLP_Layer.append(nn.Linear(MLP_sizes[i], MLP_sizes[i+1]))
            MLP_Layer.append(nn.ReLU())  
        self.MLP_Layer = nn.Sequential(*MLP_Layer)
        self.Linear = nn.Linear(MLP_sizes[-1], 1)
        # init all dims
        nn.init.normal_(self.Linear.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_field.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_jockey.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_horse.weight, mean=0, std=0.1)
        nn.init.normal_(self.emb_trainer.weight, mean=0, std=0.1)

    def forward(self, x, field, jockey, horse, trainer):
        emb_f = self.emb_field(field)
        emb_j = self.emb_jockey(jockey)
        emb_h = self.emb_horse(horse)
        emb_t = self.emb_trainer(trainer)
        hj_val = torch.mul(emb_h, emb_j)
        ht_val = torch.mul(emb_h, emb_t)
        
        hidden_input = torch.concat([x, emb_f, hj_val, ht_val], 1)
        out = self.MLP_Layer(hidden_input)
        out = self.Linear(out)
        return self.sigmoid(out)
    
model = GmfMlp(n_num_feats=6, k_dim_field=4, k_dim_id=16, num_layers=3)
model

GmfMlp(
  (emb_field): Embedding(9, 4)
  (emb_jockey): Embedding(131, 16)
  (emb_horse): Embedding(4399, 16)
  (emb_trainer): Embedding(129, 16)
  (MLP_Layer): Sequential(
    (0): Dropout(p=0.05, inplace=False)
    (1): Linear(in_features=42, out_features=21, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.05, inplace=False)
    (4): Linear(in_features=21, out_features=10, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.05, inplace=False)
    (7): Linear(in_features=10, out_features=5, bias=True)
    (8): ReLU()
  )
  (Linear): Linear(in_features=5, out_features=1, bias=True)
)

In [86]:

# setting
batch_size = 25
epochs = 10
# model
K_DIM_F = 4
K_DIM_IX = 64
num_layers = 2
# optimizer
learning_rate = 8e-6
weight_decay = 1e-5
use_cuda= True

compute_device = device('cuda') if use_cuda else device('cpu')

model = GmfMlp(n_num_feats=len(numerical_cols), k_dim_field=K_DIM_F, k_dim_id=K_DIM_IX, num_layers=num_layers, p_dropout=0.1).to(compute_device)
opt = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)


# data prep
X_train, y_train = get_feats(perform_train, numerical_cols, y_col, use_cuda)
# Loss
train_loss_by_ep = []
# Average Percision
val_ap_by_ep = []
test_ap_by_ep = []

for ep in range(epochs):
    t0 = time.time()
    ep_loss = []
    for batch_data in horse_data_loader(X_train, y_train, batch_size, shuffle=True):
        x, f, j, h, t, y = batch_data
        model.zero_grad()

        x = autograd.Variable(x)
        f = autograd.Variable(f)
        j = autograd.Variable(j)
        h = autograd.Variable(h)
        t = autograd.Variable(t)

        y_pred = model(x, f, j, h, t)

        loss = BCELoss(y_pred, y)
        loss.mean().backward()
        opt.step()

        ep_loss.append(loss.data.to(compute_device).tolist())
#         print(ep_loss)

    train_loss_by_ep.append(np.sqrt(np.mean(ep_loss)))
    
    # compute AP
    val_ap_by_ep.append(computeAP(perform_val, model, way='max', use_cuda=use_cuda))
    test_ap_by_ep.append(computeAP(perform_test, model, way='max', use_cuda=use_cuda))
    
    t1 = time.time()
    print(f'[{round(t1-t0, 3)}s] Iter={ep}, train loss={round(train_loss_by_ep[-1], 3)}')
    print(f'\t [VAL] AP={round(val_ap_by_ep[-1], 3)}; [TEST] AP={round(test_ap_by_ep[-1], 3)}')

[6.134s] Iter=0, train loss=0.959
	 [VAL] AP=0.087; [TEST] AP=0.094
[6.2s] Iter=1, train loss=0.956
	 [VAL] AP=0.087; [TEST] AP=0.105
[6.224s] Iter=2, train loss=0.953
	 [VAL] AP=0.085; [TEST] AP=0.111
[6.21s] Iter=3, train loss=0.95
	 [VAL] AP=0.105; [TEST] AP=0.125
[6.01s] Iter=4, train loss=0.946
	 [VAL] AP=0.127; [TEST] AP=0.116
[6.158s] Iter=5, train loss=0.942
	 [VAL] AP=0.12; [TEST] AP=0.106
[6.336s] Iter=6, train loss=0.937
	 [VAL] AP=0.092; [TEST] AP=0.106
[6.131s] Iter=7, train loss=0.933
	 [VAL] AP=0.118; [TEST] AP=0.114
[6.283s] Iter=8, train loss=0.927
	 [VAL] AP=0.12; [TEST] AP=0.111
[6.32s] Iter=9, train loss=0.922
	 [VAL] AP=0.109; [TEST] AP=0.109


### 3.2 Assumption 2

Since all horses within a race provide us with precious information, telling us which one is faster.

Hence, it is reasonable to calculate the pairwise loss with log-sigmoid loss.