In [48]:
import torch
import torch.nn as nn
import os

import numpy as np 
import pandas as pd
import datetime
import pickle
import time
import random
import math

In [49]:
# Choose the torch backend in order of preference: CUDA, then Apple MPS,
# falling back to the CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [50]:
class MLP(nn.Module):
    """Fully connected network that maps a flattened input to one value.

    The stack is ``Flatten``, then a ``Linear -> ReLU`` pair for every width
    in ``layers``, then a final ``Linear`` projecting to a single output.
    """

    def __init__(self, input_size, layers):
        """Assemble the layer stack.

        Args:
            input_size: Number of features per sample once flattened.
            layers: Non-empty sequence of hidden-layer widths, in order.
        """
        super().__init__()
        # The position of each submodule inside ``self.layers`` fixes its
        # state-dict key (``layers.1.weight`` ...), so the order is preserved.
        stack = [nn.Flatten()]
        widths = [input_size, *layers]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(layers[-1], 1))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.layers(x)

In [51]:
def load_model (input_size, layers, file, model_dir='/Users/alexlee/My Drive/Colab Notebooks/HKJC-ML/model_configs/hkjc5'):
    """Rebuild an ``MLP`` and load its saved weights from disk.

    Args:
        input_size: Number of input features; coerced with ``int``.
        layers: Hidden-layer widths, either a sequence of ints or the string
            form of a list such as ``"[64, 128, 64]"`` (spacing after the
            commas is optional).
        file: File name of the saved state dict inside ``model_dir``.
        model_dir: Directory holding the saved state dicts. Defaults to the
            location this notebook was written against.

    Returns:
        The ``MLP`` in evaluation mode, placed on the first available device
        out of CUDA, MPS and CPU.

    Raises:
        ValueError: If ``layers`` holds no widths or a non-integer entry.
    """
    model_path = os.path.join(model_dir, file)

    input_size = int(input_size)
    if isinstance(layers, str):
        # Split on bare commas so "[64,128]" parses as well as "[64, 128]";
        # int() ignores the whitespace left around each token.
        tokens = [tok for tok in layers.strip().strip('][').split(',') if tok.strip()]
    else:
        tokens = list(layers)
    layers_int = [int(tok) for tok in tokens]
    if not layers_int:
        raise ValueError(f"layers must contain at least one width, got {layers!r}")

    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )
    model = MLP(input_size, layers_int).to(device)

    # The checkpoint is deserialised onto the CPU; load_state_dict then copies
    # the tensors into the model's own parameters.
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.eval()
    return model

In [52]:
def ordinal_normalise (df, cols_to_keep, in_path=None):
    """Ordinal-encode, rank and standardise a race frame using saved statistics.

    The frame is modified in place (columns are overwritten / added) and the
    same object is returned.

    Steps:
      1. For every ``<col>_ordinal.csv`` in ``<in_path>/ordinals`` whose
         ``<col>`` exists in ``df``, replace the column's values using the
         mapping stored in that file (header = original value, first row =
         replacement).
      2. For every name in ``cols_to_keep`` containing ``rank``, add a
         ``<base>_rank`` column holding the dense descending rank of ``<base>``.
      3. Coerce every column to numeric and standardise it with the ``mean``
         and ``std`` rows of ``<in_path>/mean_std.csv``.

    Args:
        df: Race frame to transform.
        cols_to_keep: Feature names; only used to discover which rank columns
            must be derived.
        in_path: Directory containing ``ordinals/`` and ``mean_std.csv``.
            Defaults to the location this notebook was written against.

    Returns:
        ``df`` after transformation.

    Raises:
        KeyError: If a column of ``df`` has no entry in ``mean_std.csv``.
    """
    if in_path is None:
        in_path = os.path.join('/Users/alexlee/Documents/Coding/HKJC-ML/hkjc5','data','5_ordinal_mean_tensor')
    ordinal_dir = os.path.join(in_path, 'ordinals')

    # List the ordinals folder itself: the files are read from there, so
    # scanning the current working directory would miss them.
    for ordinal_file in sorted(f for f in os.listdir(ordinal_dir) if '.csv' in f):
        c = ordinal_file.replace('_ordinal.csv','')
        if c not in df.columns:
            continue

        ordinal_dict = pd.read_csv(os.path.join(ordinal_dir, ordinal_file), index_col=0).to_dict(orient='list')
        mapping = {key: values[0] for key, values in ordinal_dict.items()}
        # Assign the result back instead of a chained in-place replace, which
        # does not update the parent frame under pandas copy-on-write.
        # Values without a mapping are kept unchanged.
        df[c] = df[c].map(lambda value: mapping.get(value, value))

    cols_to_rank = [str(c).replace('_rank','') for c in cols_to_keep if 'rank' in c]

    for c in cols_to_rank:
        c = c.replace('"','').replace("'",'')
        df[c] = pd.to_numeric(df[c], errors='coerce')
        df[f'{c}_rank'] = df[c].rank(method='dense', ascending=False)

    mean_std_df = pd.read_csv(os.path.join(in_path,'mean_std.csv'), index_col=0)
    for c in df.columns:
        mean = mean_std_df.loc['mean', c]
        std = mean_std_df.loc['std', c]
        df[c] = pd.to_numeric(df[c], errors='coerce')
        df[c] = (df[c] - mean) / std
        # NOTE(review): missing values are filled with the *raw* mean after
        # standardising (the standardised mean would be 0). Kept as-is because
        # the saved models may have been trained on data prepared this way;
        # confirm against the training pipeline.
        df[c] = df[c].fillna(mean)
    return df

In [53]:
def key_func (x):
    """Convert a race file name into an integer usable as a sort key.

    The text after the last ``_`` is treated as the race number; a
    one-character race number is left-padded with ``0`` so that race 2 sorts
    before race 10. The padded number is appended directly to the remaining
    prefix and the result is parsed with ``int``.
    """
    stem = x.replace('.csv','')
    prefix, _, race_no = stem.rpartition('_')
    if len(race_no) == 1:
        race_no = '0' + race_no
    # Underscores left inside the prefix are accepted by int() as digit
    # separators, so they do not need to be stripped first.
    return int(prefix + race_no)

In [54]:

def evaluate (cols_to_keep, layers, file):
    """Score a saved model on every race file in the evaluation folder.

    For each race with at least six runners the frame is passed through
    ``ordinal_normalise``, sorted by ``place`` and reduced to ``cols_to_keep``.
    The model then predicts one value per runner (treated as a finish time,
    lower is better):

      * ``win`` is 1 when the first row's prediction is strictly lower than
        every other runner's prediction.
      * ``trio_win`` is 1 when the largest prediction among the first three
        rows is strictly lower than every remaining runner's prediction.

    Args:
        cols_to_keep: Ordered feature columns fed to the model.
        layers: Hidden-layer description forwarded to ``load_model``.
        file: Saved model file name forwarded to ``load_model``.

    Returns:
        DataFrame with columns ``race`` (file name without ``.csv``), ``win``
        and ``trio_win``, one row per evaluated race, in sorted file order.
    """
    in_path = os.path.join('/Users/alexlee/Documents/Coding/HKJC-ML/hkjc5',"data","5_ordinal_mean_tensor",'evaluation')

    model = load_model(len(cols_to_keep), layers, file)
    # load_model may place the model on CUDA/MPS; inputs must live on the same
    # device or the forward pass fails with a device mismatch.
    model_device = next(model.parameters()).device

    race_files = sorted((f for f in os.listdir(in_path) if not f.startswith(".")), key=key_func)

    # Collect plain records and build the frame once at the end instead of
    # appending to a DataFrame row by row.
    records = []
    for race_file_name in race_files:
        race_df = pd.read_csv(os.path.join(in_path, race_file_name), index_col=0)

        # ignore if race has less than 6 horses
        if len(race_df.index) < 6:
            continue

        race_df = ordinal_normalise(race_df, cols_to_keep)
        race_df = race_df.sort_values('place')
        # Drop the target before selecting features so it can never leak into
        # the model input, even if it is listed in cols_to_keep.
        features = pd.DataFrame(race_df.drop(["place"], axis=1), columns=cols_to_keep)

        # One forward pass per runner; gradients are not needed for scoring.
        finish_times = []
        with torch.no_grad():
            for _, runner in features.iterrows():
                runner_np = pd.to_numeric(runner).to_numpy()
                batch = torch.as_tensor(runner_np, dtype=torch.float32).unsqueeze(dim=0).to(model_device)
                # .item() copies the scalar back to the host as a Python float.
                finish_times.append(model(batch).item())

        # check for single win bet win: first row must have the strictly
        # shortest predicted time (ties do not count as a win)
        lost = any(finish_times[0] > t for t in finish_times[1:])
        win = (not lost) and finish_times[0] not in finish_times[1:]

        # check if trio win
        trio_win = all(max(finish_times[:3]) < t for t in finish_times[3:])

        records.append({
            'race': race_file_name.replace('.csv',''),
            'win': int(win),
            'trio_win': int(trio_win),
        })

    return pd.DataFrame(records, columns=['race','win','trio_win'])

In [55]:
def get_race_info (df, in_path=None):
    """Attach race-level metadata to each evaluated race.

    For every row of ``df`` the file ``<in_path>/<race>.csv`` is read and the
    first value of each race-info column is taken from it.

    Args:
        df: Frame with ``race``, ``win`` and ``trio_win`` columns, where
            ``race`` is the file name (without ``.csv``) of a race file.
        in_path: Directory holding the race files. Defaults to the location
            this notebook was written against.

    Returns:
        New DataFrame with a fresh 0..n-1 index and the columns
        ``race_location``, ``race_class``, ``race_going``, ``race_distance``,
        ``race_surface``, ``race_course``, ``win`` and ``trio_win``; one row
        per row of ``df``, in the same order.

    Raises:
        IndexError: If a race file contains no rows.
    """
    if in_path is None:
        in_path = os.path.join('/Users/alexlee/Documents/Coding/HKJC-ML/hkjc5',"data","4_races")

    race_info_cols = ['race_location','race_class','race_going','race_distance','race_surface','race_course']
    new_df_cols = race_info_cols + ['win','trio_win']

    records = []
    for _, row in df.iterrows():
        race_df = pd.read_csv(os.path.join(in_path, f"{row['race']}.csv"), index_col=0)
        # The first row's value is used for each race-info column.
        record = {col: race_df[col].iloc[0] for col in race_info_cols}
        record['win'] = row['win']
        record['trio_win'] = row['trio_win']
        records.append(record)

    # Passing the column list keeps the schema stable even when df is empty.
    return pd.DataFrame(records, columns=new_df_cols)

In [56]:
# Evaluate the top-ranked model for each money column and store its per-race
# results under data/1_analyse.
models_df = pd.read_csv(os.path.join('/Users/alexlee/Documents/Coding/HKJC-ML/hkjc5','data','5_ordinal_mean_tensor','model_names.csv'), index_col=0)
out_path = os.path.join('data','1_analyse')
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(out_path, exist_ok=True)

# (ranking column, label used in the progress message)
for money_col, label in (('money_st', 'st'), ('money_hv', 'hv')):
    models_df = models_df.sort_values(money_col, ascending=False)
    best_model = models_df.iloc[0]

    # cols_kept is stored as the string form of a list; turn it back into names.
    cols_to_keep = str(best_model['cols_kept']).strip('][').replace('"','').replace("'",'').split(', ')
    layers = best_model['layers']
    file = best_model['file']

    model_data_path = os.path.join(out_path,f'{file}.csv')

    # Results are cached on disk: skip models whose CSV already exists.
    if not os.path.exists(model_data_path):
        print(f'{label} model', file)

        model_wins_df = evaluate (cols_to_keep, layers, file)
        model_wins_df = get_race_info (model_wins_df)
        model_wins_df.to_csv(model_data_path)
    

st model 2023_11_30_09_05_32_64_128_256_512_256_128_64_32_16_8_64_255_723


hv model 2024_01_03_21_04_64_128_256_256_128_64_32_16_8_4_64_313_00337


In [63]:
# Ordinal-encode and standardise every analysed model CSV, saving both the
# transformed frame and the per-column mean/std that were used.
in_path = os.path.join('data','1_analyse')
oridnal_path = os.path.join('/Users/alexlee/Documents/Coding/HKJC-ML/hkjc5','data','5_ordinal_mean_tensor','ordinals')
mean_std_path = os.path.join('data','2_ordinal_normalise','mean_std')
out_path = os.path.join('data','2_ordinal_normalise')

# to_csv does not create missing directories; mean_std_path lives inside
# out_path, so creating it creates both.
os.makedirs(mean_std_path, exist_ok=True)

for file_name in [f for f in os.listdir(in_path) if 'csv' in f]:
    df = pd.read_csv(os.path.join(in_path, file_name), index_col=0)

    # Indicator columns derived from the raw win flag before it is standardised.
    df['won'] = pd.to_numeric(df['win'] == 1)
    df['not_won'] = pd.to_numeric(df['win'] == 0)

    for ordinal_file in [f for f in os.listdir(oridnal_path) if '.csv' in f]:
        c = ordinal_file.replace('_ordinal.csv','')
        if not c in df.columns:
            continue

        ordinal_dict = pd.read_csv(os.path.join(oridnal_path,ordinal_file), index_col=0).to_dict(orient='list')
        mapping = {key: values[0] for key, values in ordinal_dict.items()}
        # Assign the result back instead of a chained in-place replace, which
        # does not update the parent frame under pandas copy-on-write.
        # Values without a mapping are kept unchanged.
        df[c] = df[c].map(lambda value: mapping.get(value, value))

    mean_std_dict = {}

    for c in df.columns:
        if c in ['won','not_won']:
            continue

        df[c] = pd.to_numeric(df[c], errors='coerce')

        mean = df[c].mean()
        std = df[c].std()

        df[c] = (df[c] - mean) / std
        # Missing values (including 0/0 from constant columns) become 0.
        df[c] = df[c].fillna(0)

        mean_std_dict[c] = {'mean': mean, 'std': std}

    mean_std_df = pd.DataFrame(mean_std_dict)
    # NOTE(review): replace("csv","") leaves the dot behind, producing names
    # like "<model>._mean_std.csv". Kept byte-identical in case other
    # notebooks read that name; confirm before cleaning it up.
    mean_std_df.to_csv(os.path.join(mean_std_path, f'{file_name.replace("csv","")}_mean_std.csv'))

    df.to_csv(os.path.join(out_path, file_name))