In [48]:
import torch
import torch.nn as nn
import os

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
import numpy as np 
import pandas as pd
import datetime
import pickle
import time

In [49]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS,
# and finally plain CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [50]:
class MLP(nn.Module):
    """Fully connected network with a single output value.

    The stack is ``Flatten -> Linear(input_size, layers[0]) -> ReLU``, then one
    ``Linear -> ReLU`` pair for every consecutive pair of widths in ``layers``,
    and finally ``Linear(layers[-1], 1)``.

    Args:
        input_size: Number of features per sample after flattening.
        layers: Non-empty sequence of hidden-layer widths.
    """

    def __init__(self, input_size, layers):
        super().__init__()
        # Input projection; indexing layers[0] fails fast on an empty spec.
        stack = [nn.Flatten(), nn.Linear(input_size, layers[0]), nn.ReLU()]
        # Each hidden width feeds the next one.
        for fan_in, fan_out in zip(layers, layers[1:]):
            stack.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        # Output head: one value per sample, no activation.
        stack.append(nn.Linear(layers[-1], 1))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the whole stack and return the raw output."""
        return self.layers(x)

In [51]:
def load_model (input_size, layers, file, model_dir=None):
    """Build an ``MLP`` and load trained weights into it, ready for inference.

    Args:
        input_size: Number of input features; coerced with ``int()``.
        layers: Hidden-layer widths. Either a sequence of ints, or the string
            form of a list as stored in a CSV cell, where the numbers may be
            separated by commas, spaces and/or newlines (e.g. ``"[64, 32, 16]"``).
        file: File name of the saved ``state_dict`` inside ``model_dir``.
        model_dir: Directory that holds the saved models. When ``None`` the
            original hard-coded location is used, so existing calls are unchanged.

    Returns:
        The ``MLP`` with weights loaded, switched to eval mode, on the best
        available device (cuda, then mps, then cpu).

    Raises:
        ValueError: If ``layers`` contains no sizes, or a size is not an integer.
    """
    if model_dir is None:
        model_dir = '/Users/alexlee/My Drive/Colab Notebooks/HKJC-ML/hkjc5/model_configs/hkjc5'
    model_path = os.path.join(model_dir, file)

    input_size = int(input_size)

    if isinstance(layers, str):
        # Treat newlines and spaces as commas so any of the list spellings
        # splits on one delimiter; empty tokens are discarded.
        tokens = layers.strip('][').replace('\n', ',').replace(' ', ',').split(',')
        layers_int = [int(token) for token in tokens if token]
    else:
        layers_int = [int(size) for size in layers]

    if not layers_int:
        # MLP indexes layers[0]; fail here with a message that names the input.
        raise ValueError(f'no hidden layer sizes found in layers={layers!r}')

    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )
    model = MLP(input_size, layers_int).to(device)

    # The checkpoint is deserialised onto the CPU first; load_state_dict then
    # copies the values into the parameters on `device`.
    # NOTE(review): torch.load unpickles the file - only load checkpoints from
    # a trusted source (consider weights_only=True where the torch version allows).
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.eval()
    print(file, 'loaded')
    return model

In [52]:
# models_df = pd.read_csv(os.path.join('data','5_ordinal_mean_tensor','model_names.csv'), index_col=0)
# Table of scored models; the first CSV column is used as the index.
scored_models_csv = os.path.join('/Users/alexlee/My Drive/Colab Notebooks/HKJC-ML/hkjc5', 'model_names_scored.csv')
models_df = pd.read_csv(scored_models_csv, index_col=0)

# df = models_df
# df = models_df[models_df['score'] < 65]
# df.sort_values('money', ascending=False, inplace=True)
# df = df.loc[df['file'] == '2023_11_14_10_26_32_64_128_256_512_256_128_64_32_16_8_4_64_448_0588']

# models_df.sort_values('money_hv', ascending=False, inplace=True)
# cols_to_keep = str(models_df.iloc[0]['cols_kept']).strip('][').replace('"','').replace("'",'').split(', ')
# layers = models_df.iloc[0]['layers']
# file = models_df.iloc[0]['file']

# model = load_model(len(cols_to_keep), layers, file)

# for idx, row in df.iterrows():
#     if row['file'] != '2023_11_28_19_31_64_32_16_8_64_348_0261':
#         continue
#     cols_to_keep = str(row['cols_kept']).strip('][').replace('"','').replace("'",'').split(', ')
#     layers = row['layers']
#     file = row['file']

#     model = load_model(len(cols_to_keep), layers, file)

In [53]:
# state_dict = torch.load(model_path, map_location="cpu")
# # Define the input and output sizes
# input_size = 25
# output_size = 1

# model = MLP(input_size, output_size).to(device)
# # model = ConvNet(input_size, hidden_size, output_size).to(device)

# model.load_state_dict(torch.load(model_path, map_location="cpu"))
# model.eval()

In [54]:
def key_func (x):
    """Return an integer sort key for a race file name.

    The text after the last underscore is the race number; a one-character
    race number is left-padded with ``'0'`` and glued onto the rest of the
    name, e.g. ``'2024_01_28_3.csv'`` -> ``int('2024_01_2803')``. ``int``
    accepts single underscores between digits, so that is ``2024012803``
    and race 10 sorts after race 9.
    """
    stem = x.replace('.csv', '')
    prefix, _, race_no = stem.rpartition('_')
    if len(race_no) == 1:
        race_no = f'0{race_no}'
    return int(prefix + race_no)

in_path = os.path.join('data','6_prediction')

# For each race course: the models_df column used to rank the models, and the
# position (after sorting that column in descending order) of the model to load.
# NOTE(review): 'sha tin' takes position 1 (second-ranked) while 'happy valley'
# takes position 0 (top-ranked). Kept exactly as before - confirm it is intended.
MODEL_CHOICE_BY_LOCATION = {
    'happy valley': ('money_hv', 0),
    'sha tin': ('money_st', 1),
}

race_files = sorted([f for f in os.listdir(in_path) if '.csv' in f], key=key_func)

# Only the first race file (in key_func order) is inspected to decide which
# model to load; presumably every file in the folder is from the same meeting.
# With no race files nothing is loaded, matching the previous behaviour.
if race_files:
    file_name = race_files[0]
    df = pd.read_csv(os.path.join(in_path, file_name), index_col=0)
    race_location = df['race_location'].unique()[0]

    if race_location not in MODEL_CHOICE_BY_LOCATION:
        # Previously an unknown location fell through both branches and the
        # model was built from undefined (or stale kernel) variables.
        raise ValueError(
            f"unsupported race_location {race_location!r} in {file_name}; "
            f"expected one of {sorted(MODEL_CHOICE_BY_LOCATION)}"
        )

    score_col, position = MODEL_CHOICE_BY_LOCATION[race_location]
    # In-place sort kept on purpose: models_df stays ordered as it was before
    # this rewrite, in case other cells read it afterwards.
    models_df.sort_values(score_col, ascending=False, inplace=True)
    chosen = models_df.iloc[position]

    # 'cols_kept' holds the string form of a list of column names; strip the
    # brackets and quotes to recover the names themselves.
    cols_to_keep = str(chosen['cols_kept']).strip('][').replace('"','').replace("'",'').split(', ')
    layers = chosen['layers']
    file = chosen['file']

    model = load_model(len(cols_to_keep), layers, file)

# Score every race file in `in_path` with the loaded `model` and write one CSV
# of predictions per race to data/7_predicted.
ordinal_dir = os.path.join('data','5_ordinal_mean_tensor','ordinals')
out_path = os.path.join('data','7_predicted')

prediction_files = sorted([f for f in os.listdir(in_path) if '.csv' in f], key=key_func)

if prediction_files:
    # Everything in this branch is the same for every race, so it is loaded
    # once here instead of once per race file.

    # (column name, {raw value -> encoded value}) for each ordinal table; the
    # column name is the file name without the '_ordinal.csv' suffix and the
    # encoded value is the first row of each column of the table.
    ordinal_maps = []
    for ordinal_file in [f for f in os.listdir(ordinal_dir) if '.csv' in f]:
        ordinal_dict = pd.read_csv(os.path.join(ordinal_dir, ordinal_file), index_col=0).to_dict(orient='list')
        ordinal_dict = {key: values[0] for key, values in ordinal_dict.items()}
        ordinal_maps.append((ordinal_file.replace('_ordinal.csv',''), ordinal_dict))

    mean_std_df = pd.read_csv(os.path.join('data','5_ordinal_mean_tensor','mean_std.csv'), index_col=0)

    # Base columns from which the '<name>_rank' features are derived.
    cols_to_rank = [str(c).replace('_rank','') for c in cols_to_keep if 'rank' in c]

    # load_model may have placed the model on cuda/mps; inputs must follow it.
    model_device = next(model.parameters()).device

    os.makedirs(out_path, exist_ok=True)

for file_name in prediction_files:

    df = pd.read_csv(os.path.join(in_path, file_name), index_col=0)
    if 'place' in df.columns:
        df = df.drop(columns='place')

    # Encode categorical columns. Assigning the result back (instead of a
    # chained `df[c].replace(..., inplace=True)`) keeps this working under
    # pandas Copy-on-Write, where the chained form no longer updates `df`.
    for column, mapping in ordinal_maps:
        df[column] = df[column].replace(mapping)

    # Within-race dense rank of each base column, largest value ranked 1.
    for c in cols_to_rank:
        c = c.replace('"','').replace("'",'')
        df[c] = pd.to_numeric(df[c], errors='coerce')
        df[f'{c}_rank'] = df[c].rank(method='dense', ascending=False)

    # The model's first layer expects len(cols_to_keep) inputs, so a missing
    # feature would only surface later as a tensor shape error; report it here.
    missing = [c for c in cols_to_keep if c not in df.columns]
    if missing:
        raise ValueError(f'{file_name} is missing feature columns: {missing}')

    # Keep only the model features, in the order they appear in the frame.
    # NOTE(review): the order follows the frame's columns, not cols_to_keep -
    # confirm this matches the order used when the model was trained.
    df = df.loc[:, df.columns.isin(cols_to_keep)].copy()

    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')
        df[c] = (df[c] - mean_std_df.loc['mean', c]) / mean_std_df.loc['std', c]
        # NOTE(review): missing values are filled with the *raw* mean after
        # standardising (the standardised mean would be 0). Left unchanged so
        # inference stays as before - verify against the training pipeline.
        df[c] = df[c].replace(np.nan, mean_std_df.loc['mean', c])

    # One forward pass per runner; keys are the row index + 1.
    predicted_race = {}
    with torch.no_grad():
        for index, row in df.iterrows():
            features = torch.Tensor(row.to_numpy()).unsqueeze(dim=0).to(model_device)
            predicted_race[index + 1] = (model(features).detach().cpu().numpy()[0])[0]

    pd.DataFrame(predicted_race, index=[0]).transpose().to_csv(os.path.join(out_path, file_name))

    # Runner keys ordered by ascending model output.
    print(file_name.replace('.csv',''), [k for k, v in sorted(predicted_race.items(), key=lambda item: item[1])])
    

2024_01_03_18_57_64_128_256_512_512_256_128_64_32_16_8_64_305_00334 loaded


2024_01_28_1 [7, 9, 5, 1, 4, 2, 3, 10, 6, 8]
2024_01_28_2 [6, 8, 10, 12, 13, 2, 4, 5, 7, 3, 1, 9, 11, 14]
2024_01_28_3 [5, 1, 7, 14, 8, 9, 3, 2, 10, 6, 12, 4, 13, 11]
2024_01_28_4 [2, 8, 13, 6, 5, 7, 9, 14, 11, 10, 12, 3, 4, 1]
2024_01_28_5 [7, 4, 3, 1, 5, 10, 8, 9, 13, 14, 2, 12, 6, 11]
2024_01_28_6 [1, 12, 8, 2, 6, 14, 7, 11, 5, 4, 9, 13, 3, 10]
2024_01_28_7 [1, 7, 4, 9, 8, 5, 2, 3, 12, 10, 6, 13, 14, 11]
2024_01_28_8 [5, 6, 2, 1, 4, 12, 10, 3, 8, 7, 9, 11, 13]
2024_01_28_9 [2, 5, 6, 7, 14, 8, 13, 1, 12, 3, 11, 9, 4, 10]
2024_01_28_10 [11, 1, 10, 6, 3, 2, 12, 7, 8, 9, 4, 14, 13, 5]
