In [22]:
import datetime
import os
import pickle
import random
import time
# Keep this after `import datetime`: it rebinds the name `datetime` to the
# class, which is what `datetime.now()` further down relies on.
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [23]:
class MLP(nn.Module):
    """Fully connected regression network that emits one value per sample.

    The module stack is built as::

        Flatten -> Linear(input_size, layers[0])
                -> [Linear(layers[i], layers[i + 1]) -> ReLU] for each consecutive pair
                -> Linear(layers[-1], 1)

    Args:
        input_size: number of features per sample once flattened.
        layers: non-empty sequence of hidden layer widths.

    Raises:
        ValueError: if ``layers`` is empty.
    """

    def __init__(self, input_size, layers):
        super(MLP, self).__init__()
        if len(layers) == 0:
            raise ValueError("layers must contain at least one hidden layer size")
        # NOTE(review): there is no activation between the first Linear and the
        # next one, so those two layers compose into a single linear map. The
        # layout is kept as-is so that state-dict keys stay unchanged; confirm
        # whether a ReLU was intended here.
        self.layers = nn.ModuleList([nn.Flatten(), nn.Linear(input_size, layers[0])])
        for idx, size in enumerate(layers[:-1]):
            self.layers.append(nn.Linear(size, layers[idx + 1]))
            self.layers.append(nn.ReLU())
        self.layers.append(nn.Linear(layers[-1], 1))

    def forward(self, x):
        """Apply every sub-module in order and return the final output."""
        # nn.ModuleList is only a container (it has no forward of its own), so
        # it cannot be called directly; the modules are applied one by one.
        for layer in self.layers:
            x = layer(x)
        return x

In [24]:
def _train_one_epoch(model, dataloader, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss."""
    # Training mode matters for dropout / batch-norm layers; harmless otherwise.
    model.train()
    loss_sum = 0.0
    num_batches = 0
    for X, y in dataloader:
        pred = model(X)
        # Add a trailing dimension to the targets so they line up with the
        # prediction tensor passed to the loss.
        loss = loss_fn(pred, torch.unsqueeze(y, dim=1))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_sum += loss.item()
        num_batches += 1
    return loss_sum / num_batches


def _evaluate(model, dataloader, loss_fn):
    """Return the mean batch loss of ``model`` over ``dataloader`` without tracking gradients."""
    model.eval()
    loss_sum = 0.0
    with torch.no_grad():
        for X, y in dataloader:
            loss_sum += loss_fn(model(X), torch.unsqueeze(y, dim=1)).item()
    return loss_sum / len(dataloader)


def train_model(x_tensor, y_tensor, layers, learning_rate=1e-3, batch_size=64,
                epochs=5000, patience=250, out_dir=None):
    """Train an ``MLP`` regressor with early stopping and save its best weights.

    The data is split 80/20 into training and validation sets with a fixed
    seed (42). Training uses Adam and MSE loss, and stops once the validation
    loss has not improved for ``patience`` consecutive epochs or after
    ``epochs`` epochs. The state dict with the lowest validation loss is
    written with ``torch.save``.

    Args:
        x_tensor: feature tensor; the first dimension indexes samples.
        y_tensor: target tensor with one value per sample.
        layers: hidden layer widths forwarded to ``MLP``.
        learning_rate: Adam learning rate.
        batch_size: mini-batch size for both data loaders.
        epochs: maximum number of epochs (must be >= 1).
        patience: number of non-improving epochs tolerated before stopping.
        out_dir: directory for the saved state dict. Defaults to
            ``drive/MyDrive/Colab Notebooks/HKJC-ML/model_configs/hkjc5``.

    Returns:
        The file name (not the full path) under which the state dict was saved.

    Raises:
        ValueError: if ``epochs < 1`` or either split ends up empty.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )

    if out_dir is None:
        out_dir = os.path.join("drive","MyDrive","Colab Notebooks","HKJC-ML", "model_configs", "hkjc5")
    # Create the destination up front so a missing folder cannot make the
    # final torch.save fail after a long training run.
    os.makedirs(out_dir, exist_ok=True)

    # The model parameters are float32 and live on `device`; tensors built from
    # numpy are typically float64 on the CPU, so align both here.
    x_tensor = x_tensor.to(device=device, dtype=torch.float32)
    y_tensor = y_tensor.to(device=device, dtype=torch.float32)

    dataset = torch.utils.data.TensorDataset(x_tensor, y_tensor)

    rng = torch.Generator().manual_seed(42)
    training_set, val_set = torch.utils.data.random_split(dataset, [0.8, 0.2], generator=rng)
    if len(training_set) == 0 or len(val_set) == 0:
        raise ValueError("need enough samples for non-empty training and validation splits")

    # MLP flattens each sample, so its input width is the element count per
    # sample (equal to x_tensor.shape[1] for 2-D input).
    input_size = x_tensor[0].numel()
    model = MLP(input_size, layers).to(device)

    train_dataloader = torch.utils.data.DataLoader(training_set, batch_size=batch_size, shuffle=True)
    # Validation order does not need shuffling; a fixed order keeps the
    # per-batch average reproducible from epoch to epoch.
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, shuffle=False)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.MSELoss()

    # Loss histories are kept so the curves can be plotted while debugging.
    train_loss_plot = []
    val_loss_plot = []
    epochs_plot = []

    best_val_loss = float('inf')
    best_model_state = None
    epochs_without_improvement = 0
    last_epoch = 0

    for last_epoch in range(epochs):
        train_loss_plot.append(_train_one_epoch(model, train_dataloader, loss_fn, optimizer))

        val_loss = _evaluate(model, val_dataloader, loss_fn)
        val_loss_plot.append(val_loss)
        epochs_plot.append(last_epoch + 1)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
            # state_dict() hands back references to the live parameters, which
            # later optimizer steps overwrite in place; clone to freeze a snapshot.
            best_model_state = {
                key: value.detach().clone() for key, value in model.state_dict().items()
            }
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                break

    if best_model_state is None:
        # Only reachable when the validation loss was never below inf (e.g. NaN);
        # fall back to the final weights so the run is still saved.
        best_model_state = {
            key: value.detach().clone() for key, value in model.state_dict().items()
        }

    file_name = f"{datetime.now().strftime('%Y_%m_%d_%H_%M')}_{batch_size}_{last_epoch}_{str(f'{best_val_loss:.2g}').split('.')[-1]}"

    out_path = os.path.join(out_dir, file_name)
    torch.save(best_model_state, out_path)

    print(file_name, 'saved', 'best val loss', best_val_loss)

    return file_name


In [25]:
# Names of the rank feature columns used by the experiments.
rank_cols = [
    'total_stakes_rank',
    'horse_weight_rank',
    'horse_handicap_rank',
    'horse_odds_rank',
    'horse_rating_rank',
    'days_since_import_rank',
    'jockey_age_rank',
    'jockey_rides_rank',
    'jockey_stakes_rank',
    'jockey_same_race_wins_rank',
]

# Candidate hidden-layer width configurations, one list per architecture.
layers_to_try = [
    [64, 32, 16, 8],
    [64, 32, 16, 8, 4],
    [32, 64, 32, 16, 8],
    [64, 128, 64, 32, 16, 8],
    [64, 128, 256, 128, 64, 32, 16, 8],
    [64, 128, 256, 512, 256, 128, 64, 32, 16, 8],
    [64, 128, 256, 512, 1024, 256, 128, 64, 32, 16, 8],
]


In [26]:
# dict = {'cols_kept':[],'layers':[],'file':[]}

# df = pd.DataFrame(dict)
# df.to_csv(os.path.join('data','5_ordinal_mean_tensor','model_names.csv'))

In [27]:
in_path = os.path.join('data','5_ordinal_mean_tensor','evaluation')
model_names_path = os.path.join('data','5_ordinal_mean_tensor','model_names.csv')
NUM_RUNS = 50  # maximum number of new models to train in this cell


def _combo_key(col, layers):
    """Return a hashable key identifying one (kept column, layers) experiment.

    ``layers`` is a list for new candidates but a string once it has been
    round-tripped through the CSV, so both are normalised to a string with
    the spaces removed before comparison.
    """
    return (str(col), str(layers).replace(' ', ''))


# Read every evaluation file once and concatenate in a single call. The names
# are sorted because os.listdir order is arbitrary and the row order feeds the
# seeded train/validation split.
eval_files = sorted(f for f in os.listdir(in_path) if not f.startswith("."))
if not eval_files:
    raise FileNotFoundError(f"no evaluation files found in {in_path}")
entire_df = pd.concat(
    [pd.read_csv(os.path.join(in_path, f), index_col=0) for f in eval_files]
)

# Registry of the models trained so far; start empty if it does not exist yet.
if os.path.exists(model_names_path):
    model_names = pd.read_csv(model_names_path, index_col=0)
else:
    model_names = pd.DataFrame({'cols_kept': [], 'layers': [], 'file': []})

model_records = model_names.to_dict('records')
tried = {_combo_key(r['cols_kept'], r['layers']) for r in model_records}

for _ in range(NUM_RUNS):
    # Choose uniformly among the combinations that have not been trained yet.
    untried = [
        (col, cfg)
        for col in rank_cols
        for cfg in layers_to_try
        if _combo_key(col, cfg) not in tried
    ]
    if not untried:
        print('every (column, layers) combination has already been trained')
        break
    c, layers = random.choice(untried)

    # Keep only the chosen rank column; drop the other rank columns plus the
    # non-feature columns (finish_time is the regression target).
    cols_to_drop = [col for col in rank_cols if col != c]
    cols_to_drop.extend(['race_index','place','finish_time'])

    x_df = entire_df.drop(cols_to_drop, axis=1)

    # finish time in seconds
    y_df = entire_df['finish_time']
    # float32 matches the dtype of the model parameters.
    x = x_df.to_numpy(dtype=np.float32)
    y = y_df.to_numpy(dtype=np.float32)
    x_tensor = torch.from_numpy(x)
    y_tensor = torch.from_numpy(y)

    model_config_name = train_model(x_tensor, y_tensor, layers)

    # Record the run and persist the registry immediately, so an interrupted
    # session keeps every model that finished training.
    model_records.append({'cols_kept': c, 'layers': layers, 'file': model_config_name})
    tried.add(_combo_key(c, layers))
    model_names = pd.DataFrame(model_records)
    model_names.to_csv(model_names_path)

KeyboardInterrupt: 