In [55]:
import torch
import torch.nn as nn
import os
import time
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
import numpy as np 
import pandas as pd
import datetime
import pickle

In [56]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [57]:
# model_path = os.path.join("/Volumes/GoogleDrive/My Drive/Colab Notebooks/HKJC-ML/model_configs/hkjc5","2023_11_11_15_05_64_2770_026")
# cols_to_keep = ['total_stakes_rank', 'horse_handicap_rank', 'jockey_rides_rank']
# input_size = 25 + len(cols_to_keep) # 26 to 35

In [58]:
class MLP(nn.Module):
    def __init__(self, input_size, layers):
        super(MLP, self).__init__()
        modules = nn.ModuleList([nn.Flatten(), nn.Linear(input_size, layers[0]), nn.ReLU()])
        for idx, size in enumerate(layers[:-1]):
            modules.append(nn.Linear(size, layers[idx + 1]))
            modules.append(nn.ReLU())
        modules.append(nn.Linear(layers[-1], 1))
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        return self.layers(x)

In [59]:
def load_model (input_size, layers, file):
    model_path = os.path.join('/Users/alexlee/My Drive/Colab Notebooks/HKJC-ML','model_configs','hkjc5', file)
    
    input_size = int(input_size)
    layers = layers.strip('][').split(', ')
    layers_int = []
    for l in layers:
        layers_int.append(int(l))
    # print(input_size, layers_int)
    # print(type(input_size), type(layers_int))
    
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )
    # print(f"Using {device} device")
    model = MLP(input_size, layers_int).to(device)
    # model = ConvNet(input_size, hidden_size, output_size).to(device)

    if os.path.exists(model_path):
        model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.eval()
    return model

In [60]:
def ordinal_normalise (df, cols_to_keep):    
    for ordinal_file in [f for f in os.listdir(os.path.join('data','5_ordinal_mean_tensor','ordinals')) if '.csv' in f]:
        ordinal_dict = pd.read_csv(os.path.join('data','5_ordinal_mean_tensor','ordinals',ordinal_file), index_col=0).to_dict(orient='list')
        for key in ordinal_dict:
            ordinal_dict[key] = ordinal_dict[key][0]

        c = ordinal_file.replace('_ordinal.csv','')
        df[c].replace(ordinal_dict, inplace=True)

    cols_to_rank = [str(c).replace('_rank','') for c in cols_to_keep if 'rank' in c]

    for c in cols_to_rank:
        c = c.replace('"','').replace("'",'')
        df[c] = pd.to_numeric(df[c], errors='coerce')
        df[f'{c}_rank'] = df[c].rank(method='dense', ascending=False)

    mean_std_df = pd.read_csv(os.path.join('data','5_ordinal_mean_tensor','mean_std.csv'), index_col=0)
    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')
        df[c] = (df[c] - mean_std_df.loc['mean', c]) / mean_std_df.loc['std', c]
        df[c] = df[c].replace(np.nan, mean_std_df.loc['mean', c])
    return df

In [61]:
def key_func (x):
    x = x.replace('.csv','')
    race_no = x.split('_')[-1]
    if len(str(race_no)) == 1:
        race_no = '0' + str(race_no)
    x = '_'.join(x.split('_')[:-1])
    x += str(race_no)
    return int(x)

def evaluate (cols_to_keep, layers, file):
    wins = 0
    total = 0
    money = 0
    money_hv = 0
    money_st = 0
    trios = 0
    # triple_trios = 0
    # triple_trio_counter = 0
    # race_date = 0

    in_path = os.path.join("data","6_prediction",'archive')

    cols_to_drop = ['total_stakes','horse_weight','horse_handicap','horse_odds','horse_rating','horse_import_type','horse_sex','horse_colour',
        'horse_age','horse_origin','horse_gear','days_since_import',
        'draw',
        'jockey_age','jockey_nationality','jockey_wins','jockey_rides','jockey_stakes','jockey_same_race_wins',
        'race_location','race_class','race_going','race_distance','race_surface','race_course',
        'total_stakes_rank','horse_weight_rank','horse_handicap_rank','horse_odds_rank','horse_rating_rank','days_since_import_rank',
        'jockey_age_rank','jockey_rides_rank','jockey_stakes_rank','jockey_same_race_wins_rank']
    cols_to_keep = str(cols_to_keep).strip('][').split(', ')
    if isinstance(cols_to_keep, list):
        for c in cols_to_keep:
            c = "".join(s for s in c if s.isalpha() or s == '_')
            # print(c, cols_to_drop)
            cols_to_drop.remove(c)
    else:
        cols_to_drop.remove(cols_to_keep)

    mean_std_df = pd.read_csv(os.path.join('data','5_ordinal_mean_tensor','mean_std.csv'), index_col=0)
    
    for file_name in sorted([f for f in os.listdir(os.path.join(in_path)) if not f.startswith(".")], key=key_func):
        df = pd.read_csv(os.path.join(in_path,file_name), index_col=0)
        race_location = df['race_location'].unique()[0]
        df = ordinal_normalise(df, cols_to_keep)
        df.sort_values('place', inplace=True)
        odds_df = df['horse_odds']
        df.drop(["place"], axis=1, inplace=True)
        # df.drop(["place", "finish_time"], axis=1, inplace=True)
        for c in cols_to_drop:
            if c in df.columns:
                df.drop(c, axis=1, inplace=True)
        
        model = load_model(df.shape[1], layers, file)

        finish_times = []
        for index, data in df.iterrows():
            # if index == 0:
            #     winning_bet_odds = (float(data["horse_odds"]) * means_std.loc["std", "horse_odds"]) + means_std.loc["mean", "horse_odds"]
            data = pd.to_numeric(data)
            input_np = data.to_numpy()
            input = torch.Tensor(input_np).unsqueeze(dim=0)
            finish_times.append(model(input).detach().numpy()[0])
        lost = False
        for finish_time in finish_times[1:]:
            if finish_times[0] > finish_time: # if first not shortest time
                lost = True
                break

        # # check triple trio
        # new_race_date = ''.join(file_name.split('_')[:3])
        # if new_race_date != race_date:
        #     triple_trio_counter = 0

        if all(max(finish_times[:3]) < t for t in finish_times[3:]):
            # print(list(finish_times[:3]), list(finish_times), 'trio')
            # print(file_name, 'trio')
            trios += 1
            # triple_trio_counter += 1
            # if triple_trio_counter == 3:
            #     # print('triple trio')
            #     triple_trios += 1
            #     triple_trio_counter -= 1
        # else:
        #     triple_trio_counter = 0

        if not lost:
            if not finish_times[0] in finish_times[1:]:
            # if not float(finish_times[0]) - float(finish_times[1]) > -0.001:
                # print(file_name, float((odds_df.iloc[0] * mean_std_df.loc['std', 'horse_odds']) + mean_std_df.loc['mean', 'horse_odds'])*10)
                # print(file_name, float(finish_times[1]) - float(finish_times[0]))
                money += float((odds_df.iloc[0] * mean_std_df.loc['std', 'horse_odds']) + mean_std_df.loc['mean', 'horse_odds'])*10
                wins += 1
                if race_location == 'sha tin':
                    money_st += float((odds_df.iloc[0] * mean_std_df.loc['std', 'horse_odds']) + mean_std_df.loc['mean', 'horse_odds'])*10
                elif race_location == 'happy valley':
                    money_hv += float((odds_df.iloc[0] * mean_std_df.loc['std', 'horse_odds']) + mean_std_df.loc['mean', 'horse_odds'])*10
        else:
            money -= 10
            if race_location == 'sha tin':
                money_st -=10
            elif race_location == 'happy valley':
                money_hv -=10
        total += 1
        # print(file, file_name, '{:.2f}'.format(100 * wins/total), '%')
    # return '{:.2f}'.format(100 * wins/total)
    return '{:.2f}'.format(money), '{:.2f}'.format(100 * wins/total), '{:.2f}'.format(money_st), '{:.2f}'.format(money_hv), int(trios)
    

In [62]:
def get_results (driver, target_race_date, target_race_no):
    results_dict = {}

    driver.get("https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx")
    WebDriverWait(driver, 10, 1).until(lambda x: x.find_element(By.CSS_SELECTOR, "option").is_displayed())
    dates = driver.find_elements(By.CSS_SELECTOR, "option")
    num_of_dates = len(dates)

    for date_idx in range(num_of_dates):
        date_dropdown = driver.find_element(by = By.XPATH, value = "//select[@id = 'selectId']")
        date_dropdown.click()
        dates = driver.find_elements(By.CSS_SELECTOR, "option")
        date = dates[date_idx]
        race_date = datetime.datetime.strptime(date.text, '%d/%m/%Y')

        if race_date != target_race_date:
            continue

        date.click()
        submit_button = driver.find_element(By.ID, "submitBtn")
        submit_button.click()
        current_url = driver.current_url
        if not "local" in current_url.lower():
            driver.back()
            continue
        WebDriverWait(driver, 10, 1).until(lambda x: x.find_element(By.XPATH, '//*[@id="innerContent"]/div[2]/div[2]/table/tbody/tr[1]/td').is_displayed())

        race_buttons_indices = []

        for cell_idx, cell in enumerate(driver.find_elements(By.XPATH, '//*[@id="innerContent"]/div[2]/div[2]/table/tbody/tr[1]/td')):
            if len(cell.find_elements(By.TAG_NAME, 'img')) > 0:
                race_buttons_indices.append(cell_idx + 1)

        for race_button_idx in race_buttons_indices[:-1]:
            driver.find_element(By.XPATH, f'//*[@id="innerContent"]/div[2]/div[2]/table/tbody/tr[1]/td[{race_button_idx}]').click()
            WebDriverWait(driver, 10, 1).until(lambda x: x.find_element(By.XPATH, '//*[@id="innerContent"]/div[2]/div[4]').is_displayed())
            time.sleep(0.5)
            if not driver.find_elements(By.XPATH, '//*[@id="innerContent"]/div[2]/div[4]/table'):
                if 'abandoned' in driver.find_element(By.XPATH,'//*[@id="innerContent"]/div[2]/div[4]').text:
                    continue
            else:
                race_no = int(driver.find_element(By.XPATH, '//*[@id="innerContent"]/div[2]/div[4]/table/thead/tr/td[1]').text[5:7].replace(" ",""))
                # print(race_no, target_race_no, int(race_no) == int(target_race_no))
            if int(race_no) != int(target_race_no):
                continue
            
            # horse table
            num_horses = len(driver.find_elements(By.XPATH, "/html/body/div[1]/div[3]/div[2]/div[2]/div[2]/div[5]/table/tbody/tr"))
            for horse_idx in range(num_horses):
                horse_no = driver.find_element(By.XPATH, f'/html/body/div[1]/div[3]/div[2]/div[2]/div[2]/div[5]/table/tbody/tr[{horse_idx + 1}]/td[2]').text
                if horse_no == '':
                    continue
                place = driver.find_element(By.XPATH, f'/html/body/div[1]/div[3]/div[2]/div[2]/div[2]/div[5]/table/tbody/tr[{horse_idx + 1}]/td[1]').text
                results_dict[horse_no] = place
            return results_dict

In [63]:
new_raceday = False

driver = Chrome()

In [64]:
in_path = os.path.join('data','6_prediction','archive')

for file_name in [f.replace('.csv','') for f in os.listdir(in_path) if '.csv' in f]:
    df = pd.read_csv(os.path.join(in_path, f'{file_name}.csv'), index_col=0)
    if 'place' in df.columns:
        continue

    new_raceday = True
    race_date = datetime.datetime.strptime('/'.join(file_name.split('_')[:3]), '%Y/%m/%d')
    race_no = file_name.split('_')[-1]

    results = get_results (driver, race_date, race_no)
    results = {int(k): v for k, v in results.items()}
    results = dict(sorted(results.items()))
    results = list(results.values())

    if len(results) > len(df.index):
        results = [ x for x in results if x.strip().isdigit() ]
        # results.remove('')

    # try:
    df['place'] = results
    # except:
    #     results = {k: v for k, v in results.items() if v != ''}
    #     df['place'] = results
    df = df[df['place'].notna()]
    df.to_csv(os.path.join(in_path, f'{file_name}.csv'))
    print(file_name, 'updated with place')

driver.close()

In [66]:
in_path = os.path.join('/Users/alexlee/My Drive/Colab Notebooks/HKJC-ML/hkjc5')
out_path = os.path.join('data','5_ordinal_mean_tensor')

in_df = pd.read_csv(os.path.join(in_path,'model_names.csv'), index_col=0)
if not new_raceday:
    out_df = pd.read_csv(os.path.join(out_path,'model_names.csv'), index_col=0)
else:
    out_df = pd.DataFrame({'cols_kept':[],'layers':[],'file':[],'score':[],'money':[],'money_st':[],'money_hv':[],'trios':[]})
# model_names = pd.read_csv(os.path.join('data','5_ordinal_mean_tensor','model_names.csv'), index_col=0)
# dict = {'cols_kept':[],'layers':[],'file':[]}
scores = []

print('{:<70} {:<7} {:<7} {:}'.format('file','st','hv','trios'))

for idx, row in in_df.iterrows():
    # print(out_df[(out_df['cols_kept'] == row['cols_kept']) & (out_df['layers'] == row['layers'])])

    # if row['file'] != '2023_11_30_09_05_32_64_128_256_512_256_128_64_32_16_8_64_255_723':
    # if len(out_df[(out_df['cols_kept'] == row['cols_kept']) & (out_df['layers'] == row['layers'])]) > 0:
    if row['file'] in out_df['file'].unique():
        continue

    money, score, money_st, money_hv, trios = evaluate(row['cols_kept'], row['layers'], row['file'])

    # # scores.append(score)
    print('{:<70} ${:<7} ${:<7} {:d}'.format(row['file'], money_st, money_hv, trios))

    if float(score) == 0 or (float(money_st) <= 0 and float(money_hv) <= 0):
        in_df.drop(idx, axis=0, inplace=True)
        in_df.to_csv(os.path.join(in_path,'model_names.csv'))
        continue

    out_df.loc[-1] = [row['cols_kept'], row['layers'], row['file'], score, money, money_st, money_hv, trios]
    out_df['score'] = pd.to_numeric(out_df['score'])
    out_df['money'] = pd.to_numeric(out_df['money'])
    # out_df = out_df[out_df['score'] != 0]
    out_df.sort_values('money', ascending=False, inplace=True)
    out_df.reset_index(drop=True, inplace=True)
    out_df.to_csv(os.path.join(out_path,'model_names.csv'))
    out_df.to_csv(os.path.join(in_path,'model_names_scored.csv'))

new_raceday = False

file                                                                   st      hv      trios


ValueError: invalid literal for int() with base 10: '  16   32   64  128  256  512 1024 1024  512  256  128   64   32   16\n    8'

In [None]:
num_to_keep = 20 # top models for each scoring

In [None]:
in_path = os.path.join('/Users/alexlee/My Drive/Colab Notebooks/HKJC-ML')
out_path = os.path.join('data','5_ordinal_mean_tensor')

in_df = pd.read_csv(os.path.join(in_path,'model_names.csv'), index_col=0)
out_df = pd.read_csv(os.path.join(out_path,'model_names.csv'), index_col=0)

filtered_out_df = pd.DataFrame()
filter_cols = ['money','money_st','money_hv','trios']

quantile = min([num_to_keep/len(out_df.index), 1]) # above which to keep
for col in filter_cols:
    # filtered_out_df = pd.concat([filtered_out_df, out_df[out_df[col] > out_df[col].quantile(quantile)]])
    filtered_out_df = pd.concat([filtered_out_df, out_df.sort_values(col, ascending=False).head(num_to_keep)])
    # print(out_df[col].quantile(quantile))

filtered_out_df.drop_duplicates('file', inplace=True)
filtered_in_df = filtered_out_df[['cols_kept','layers','file']]

filtered_out_df['money'] = pd.to_numeric(filtered_out_df['money'])
filtered_out_df.sort_values('money', ascending=False, inplace=True)
filtered_out_df.reset_index(drop=True, inplace=True)

# save model csvs
filtered_out_df.to_csv(os.path.join(out_path,'model_names.csv'))
filtered_out_df.to_csv(os.path.join(in_path,'model_names_scored.csv'))
filtered_in_df.to_csv(os.path.join(in_path,'model_names.csv'))

# delete model config files
for file in os.listdir('/Users/alexlee/My Drive/Colab Notebooks/HKJC-ML/model_configs/hkjc5'):
    if file not in filtered_out_df['file'].unique():
        # print(file)
        os.remove(os.path.join('/Users/alexlee/My Drive/Colab Notebooks/HKJC-ML/model_configs/hkjc5', file))


In [None]:
# df = pd.read_csv(os.path.join('data','5_ordinal_mean_tensor','model_names.csv'), index_col=0)
df = pd.read_csv(os.path.join('/Users/alexlee/My Drive/Colab Notebooks/HKJC-ML','model_names_scored.csv'), index_col=0)


df.sort_values('money_st', ascending=False, inplace=True)
df.head(3)

Unnamed: 0,cols_kept,layers,file,score,money,money_st,money_hv,trios
75,"['horse_import_type', 'jockey_nationality', 'h...","[32, 64, 128, 256, 512, 256, 128, 64, 32, 16, 8]",2023_11_30_09_05_32_64_128_256_512_256_128_64_...,17.3,965.0,1076.0,-111.0,2
9,"['race_going', 'jockey_nationality', 'jockey_r...","[64, 128, 256, 512, 512, 256, 128, 64, 32, 16, 8]",2024_01_03_18_57_64_128_256_512_512_256_128_64...,13.51,1331.0,1029.0,302.0,2
151,"['horse_gear', 'race_going', 'total_stakes_ran...",[16 32 64 64 32 16 8 4],2024_01_13_09_10_16_32_64_64_32_16_8_4_64_564_...,16.75,718.0,1013.0,-295.0,0


In [None]:
df.sort_values('money_hv', ascending=False, inplace=True)
df.head(3)

Unnamed: 0,cols_kept,layers,file,score,money,money_st,money_hv,trios
4,"['horse_weight_rank', 'horse_handicap', 'race_...",[32 64 64 64 32 16 8],2024_01_13_02_44_32_64_64_64_32_16_8_64_260_854,19.21,1517.0,-172.0,1689.0,1
1,"['horse_weight_rank', 'horse_handicap', 'race_...",[32 64 64 64 32 16 8],2024_01_13_02_44_32_64_64_64_32_16_8_64_260_854,20.0,1555.0,-72.0,1627.0,1
5,"['horse_weight_rank', 'horse_handicap', 'race_...",[32 64 64 64 32 16 8],2024_01_13_02_44_32_64_64_64_32_16_8_64_260_854,18.97,1455.0,-172.0,1627.0,1


In [None]:
df.sort_values('trios', ascending=False, inplace=True)
df.head(3)

Unnamed: 0,cols_kept,layers,file,score,money,money_st,money_hv,trios
783,"['total_stakes', 'jockey_rides', 'total_stakes...","[64, 128, 256, 256, 128, 64, 32, 16, 8]",2024_01_06_18_54_64_128_256_256_128_64_32_16_8...,17.24,62.0,-214.0,276.0,12
687,"['total_stakes', 'jockey_rides', 'total_stakes...","[64, 128, 256, 256, 128, 64, 32, 16, 8]",2024_01_06_18_54_64_128_256_256_128_64_32_16_8...,17.95,142.0,-214.0,356.0,11
517,"['total_stakes', 'jockey_rides', 'total_stakes...","[64, 128, 256, 128, 64, 32, 16, 8]",2024_01_06_11_45_64_128_256_128_64_32_16_8_64_...,18.23,283.0,181.0,102.0,11
