In [2]:
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import numpy as np
import os
import torch
import pickle
import datetime
import time



In [25]:
def extract_columns(df, columns):
    """Return a new frame holding exactly ``columns``, in the order given.

    Columns that ``df`` does not have are created and filled with NaN, so the
    result always has the requested schema. The row index of ``df`` is always
    kept, including when none of the requested columns exist (assembling the
    result column by column from an empty ``pd.DataFrame()`` yields zero rows
    in that case).

    Args:
        df: Source DataFrame.
        columns: Iterable of column labels to keep or create.

    Returns:
        A new DataFrame with one column per entry of ``columns``; ``df`` is
        left unmodified.
    """
    # reindex selects existing columns and NaN-fills the missing ones in one
    # step, without growing a frame column by column.
    return df.reindex(columns=list(columns))

In [None]:
in_path = os.path.join("data","2_cleaned",'horses')
out_path = os.path.join("data","3_extracted")
jockey_dir = os.path.join('data','1_scrape','jockeys')

# The jockey age stored in the profile is rolled back to the race year
# relative to this year.
# NOTE(review): presumably the year the jockey profiles were scraped - confirm.
JOCKEY_AGE_REFERENCE_YEAR = 2023
# Offset (in days) added to "days since the horse's earliest race" when the
# horse file has no 'Import Date' column.
# NOTE(review): the origin of 177 is not visible in this notebook - confirm.
DEFAULT_DAYS_BEFORE_FIRST_RACE = 177

JOCKEY_FEATURES = ['jockey_age','jockey_nationality','jockey_wins','jockey_rides','jockey_stakes','jockey_same_race_wins']

OUTPUT_COLUMNS = ['race_index','total_stakes','horse_weight','horse_handicap','horse_odds','horse_rating','horse_import_type','horse_sex','horse_colour',
                'horse_age','horse_origin','horse_gear','days_since_import',
                'draw',
                'jockey_age','jockey_nationality','jockey_wins','jockey_rides','jockey_stakes','jockey_same_race_wins',
                'race_location','race_class','race_going','race_distance','race_surface','race_course',
                'finish_time','place']


def _load_jockey_profile(jockey_name, cache):
    """Return the jockey's profile frame, or None when no CSV exists for that name.

    Results (including "missing") are memoised in ``cache`` so each jockey file
    is read from disk at most once instead of once per race row.
    """
    if jockey_name not in cache:
        jockey_file_path = os.path.join(jockey_dir, f'{jockey_name}.csv')
        cache[jockey_name] = (
            pd.read_csv(jockey_file_path, index_col=0)
            if os.path.exists(jockey_file_path) else None
        )
    return cache[jockey_name]


def _same_race_wins(row, jockey_df):
    """Look up the jockey's win count for this race's location/surface and distance.

    For Sha Tin races the column is keyed by race surface, with the
    'sha tin_all weather' column as a fallback; other locations are keyed by
    location (with "ch" expanded to "conghua"). Returns NaN when the profile
    has none of the candidate columns, instead of raising.
    """
    race_distance = str(row["race_distance"]).replace('.0','')
    if 'sha tin' in row["race_location"]:
        candidates = [
            f'{row["race_surface"]}_{race_distance}_win',
            f'sha tin_all weather_{race_distance}_win',
        ]
    else:
        candidates = [f'{row["race_location"].replace("ch","conghua")}_{race_distance}_win']

    for column in candidates:
        if column in jockey_df.columns and len(jockey_df[column]) > 0:
            return jockey_df[column].unique()[0]
    return np.nan


def _jockey_features(row, jockey_df):
    """Build the jockey_* feature values for one race row.

    ``jockey_df`` is None when no profile file exists; every feature is NaN then.
    """
    if jockey_df is None:
        return dict.fromkeys(JOCKEY_FEATURES, np.nan)

    try:
        age = int(jockey_df['jockey_age'].unique()[0]) - (JOCKEY_AGE_REFERENCE_YEAR - int(row['race_date'].year))
    except (KeyError, IndexError, ValueError, TypeError):
        # Missing/empty/non-numeric age in the profile, or an unparseable race date.
        age = np.nan

    return {
        'jockey_age': age,
        'jockey_nationality': jockey_df['nationality'].unique()[0],
        'jockey_wins': jockey_df['no. of wins'].unique()[0],
        'jockey_rides': jockey_df['total rides'].unique()[0],
        'jockey_stakes': str(jockey_df['stakes won'].unique()[0]).replace('$','').replace(',','').strip(),
        'jockey_same_race_wins': _same_race_wins(row, jockey_df),
    }


# Sorted so the processing order does not depend on the file system.
horse_file_names = sorted(f for f in os.listdir(in_path) if not f.startswith("."))
os.makedirs(out_path, exist_ok=True)

jockey_cache = {}

for file_name in horse_file_names:

    horse_df = pd.read_csv(os.path.join(in_path, file_name), index_col=0)
    if 'horse_finish_time' not in horse_df.columns:
        continue

    # Normalise every cell to a lower-case string (missing values become 'nan').
    for c in horse_df.columns:
        horse_df[c] = horse_df[c].apply(lambda x: str(x).lower())

    horse_df['finish_time'] = horse_df['horse_finish_time']
    horse_df['race_date'] = pd.to_datetime(horse_df['race_date'], format='%Y/%m/%d', errors='coerce')
    # race_index is "<year>_<Race Index>".
    horse_df['race_index'] = horse_df['race_date'].apply(lambda x: x.strftime('%Y_')) + pd.to_numeric(horse_df['Race Index']).astype(int).astype(str)

    # Keep one row per race date, then drop rows whose finish time is '--'.
    # Building a fresh frame (instead of mutating a slice in place) keeps the
    # column assignments below from targeting a view of the filtered frame.
    horse_df = horse_df.drop_duplicates('race_date')
    horse_df = horse_df[horse_df['horse_finish_time'] != '--'].reset_index(drop=True)

    if 'Import Date' in horse_df.columns:
        horse_df['Import Date'] = pd.to_datetime(horse_df['Import Date'], format='%d/%m/%Y', errors='coerce')
        horse_df['days_since_import'] = (horse_df['race_date'] - horse_df['Import Date']).dt.days
    else:
        earliest_race_date = horse_df['race_date'].min()
        horse_df['days_since_import'] = (horse_df['race_date'] - earliest_race_date).dt.days + DEFAULT_DAYS_BEFORE_FIRST_RACE

    # 1 when any gear is recorded, 0 when the field is '--'.
    horse_df['horse_gear'] = (horse_df['Gear'] != '--').astype(int)
    horse_df['total_stakes'] = horse_df['Total Stakes*'].apply(lambda x: x.replace('$','').replace(',',''))

    # One feature dict per race row, in row order, so every jockey_* column
    # always has exactly len(horse_df) entries.
    jockey_rows = [
        _jockey_features(row, _load_jockey_profile(row["Jockey"], jockey_cache))
        for _, row in horse_df.iterrows()
    ]
    for feature in JOCKEY_FEATURES:
        horse_df[feature] = [features[feature] for features in jockey_rows]

    df = extract_columns(horse_df, OUTPUT_COLUMNS)

    df.to_csv(os.path.join(out_path, file_name))
    print(file_name, 'saved')
    

In [8]:
in_path = os.path.join("data","3_extracted")

# Read every extracted horse file first and concatenate once: calling
# pd.concat inside the loop re-copies the accumulated frame on every
# iteration. Files are visited in sorted order so the row order of
# entire_df is reproducible.
extracted_frames = [
    pd.read_csv(os.path.join(in_path, file_name), index_col=0)
    for file_name in sorted(os.listdir(in_path))
    if not file_name.startswith(".")
]

# pd.concat raises on an empty list, so fall back to an empty frame.
entire_df = pd.concat(extracted_frames) if extracted_frames else pd.DataFrame()


In [None]:
out_path = os.path.join("data","4_races")
os.makedirs(out_path, exist_ok=True)

# Numeric features that additionally get a within-race dense rank
# (rank 1 = largest value in the race).
cols_to_rank = ['total_stakes','horse_weight','horse_handicap','horse_odds','horse_rating','days_since_import',
    'jockey_age','jockey_rides','jockey_stakes','jockey_same_race_wins']

# groupby visits each race once (sort=False keeps first-appearance order)
# instead of re-filtering the whole frame per race. Rows with a missing
# race_index are skipped rather than written to an empty 'nan.csv'.
for race_index, race_rows in entire_df.groupby('race_index', sort=False):
    # sort_values returns a new frame, so the column assignments below do not
    # write into a slice of entire_df.
    df = race_rows.sort_values('place')

    for c in cols_to_rank:
        df[c] = pd.to_numeric(df[c], errors='coerce')
        df[f'{c}_rank'] = df[c].rank(method='dense', ascending=False)

    df.to_csv(os.path.join(out_path,f'{race_index}.csv'))

In [10]:
in_path = os.path.join("data","4_races")
out_path = os.path.join("data","5_ordinal_mean_tensor")

# Make sure every directory written to below exists.
for sub_dir in ('ordinals', 'finish_time'):
    os.makedirs(os.path.join(out_path, sub_dir), exist_ok=True)

# Load all per-race files and concatenate once (concat inside the loop
# re-copies the accumulated frame every iteration). Sorted for a reproducible
# row order in the saved tensors.
race_frames = [
    pd.read_csv(os.path.join(in_path, file_name), index_col=0)
    for file_name in sorted(os.listdir(in_path))
    if not file_name.startswith(".")
]
entire_df = pd.concat(race_frames) if race_frames else pd.DataFrame()

mean_std_dict = {}

# Fold the textual race classes into the numeric class scale.
entire_df['race_class'] = entire_df['race_class'].replace(['g1','g2','g3','3r','4r'],[1,2,3,3,4])

ordinal_cols = ['horse_origin','horse_colour','horse_sex','horse_import_type',
                'race_location','race_class','race_surface','race_course','race_going',
                'jockey_nationality']

# Winner mask, computed once. 'place' is compared numerically so that a
# column holding the string '1' (mixed dtypes after concat) still counts.
is_winner = (pd.to_numeric(entire_df['place'], errors='coerce') == 1).to_numpy()

for c in ordinal_cols:
    entire_df[c] = entire_df[c].astype(str)

    # Number of first places per category value; categories are then ranked
    # 1..k by descending win count (ties keep first-appearance order).
    win_counts = entire_df.loc[is_winner, c].value_counts()
    wins_by_value = {
        v: int(win_counts.get(v, 0))
        for v in entire_df[c].unique()
        if v not in ['','nan']
    }
    ordinal_dict = {key: rank for rank, key in enumerate(sorted(wins_by_value, key=wins_by_value.get, reverse=True), 1)}

    # Assign the result back: an in-place replace on entire_df[c] acts on a
    # temporary Series and does not update the frame under Copy-on-Write.
    if ordinal_dict:
        entire_df[c] = entire_df[c].replace(ordinal_dict)
    ordinal_df = pd.DataFrame(ordinal_dict, index=[0])
    ordinal_df.to_csv(os.path.join(out_path, 'ordinals', f'{c}_ordinal.csv'))

# Standardise every column except race_index (this includes 'place' and
# 'finish_time'); the statistics are saved so the scaling can be re-applied
# or inverted later.
for c in entire_df.columns:
    if c == 'race_index':
        continue
    entire_df[c] = pd.to_numeric(entire_df[c], errors='coerce')

    mean = entire_df[c].mean()
    std = entire_df[c].std()

    # Missing values become 0, i.e. the column mean after standardisation.
    entire_df[c] = ((entire_df[c] - mean) / std).fillna(0)

    mean_std_dict[c] = {'mean': mean, 'std': std}

mean_std_df = pd.DataFrame(mean_std_dict)
mean_std_df.to_csv(os.path.join(out_path, 'mean_std.csv'))

x_df = entire_df.drop(['race_index','place','finish_time'], axis=1)

# Target: the standardised finish_time column (not raw seconds); use the
# 'finish_time' mean/std stored in mean_std.csv to undo the scaling.
y_df = entire_df['finish_time']
x = x_df.to_numpy()
y = y_df.to_numpy()
x_tensor = torch.from_numpy(x)
y_tensor = torch.from_numpy(y)
torch.save(x_tensor, os.path.join(out_path, 'finish_time', "x_tensor"))
torch.save(y_tensor, os.path.join(out_path, 'finish_time', "y_tensor"))


In [11]:
in_path = os.path.join("data","4_races")
out_path = os.path.join("data","5_ordinal_mean_tensor")
ordinal_dir = os.path.join(out_path, 'ordinals')
evaluation_dir = os.path.join(out_path, 'evaluation')
os.makedirs(evaluation_dir, exist_ok=True)

# The ordinal tables and the standardisation statistics do not depend on the
# race being processed, so load them once instead of once per race file.
# Each ordinal CSV has one row: column name = category value, cell = its rank.
ordinal_maps = {}
for ordinal_file in [f for f in os.listdir(ordinal_dir) if '.csv' in f]:
    ordinal_table = pd.read_csv(os.path.join(ordinal_dir, ordinal_file), index_col=0).to_dict(orient='list')
    ordinal_maps[ordinal_file.replace('_ordinal.csv','')] = {key: values[0] for key, values in ordinal_table.items()}

mean_std_df = pd.read_csv(os.path.join(out_path, 'mean_std.csv'), index_col=0)

for file_name in [f for f in os.listdir(in_path) if not f.startswith(".")]:
    df = pd.read_csv(os.path.join(in_path, file_name), index_col=0)
    df = df.drop('race_index', axis=1)

    # Mirror the preprocessing used when the ordinal tables were built:
    # textual race classes are folded into the numeric scale, and categories
    # are compared as strings (the table keys are strings, so a numeric
    # race_class would otherwise never match).
    if 'race_class' in df.columns:
        df['race_class'] = df['race_class'].replace(['g1','g2','g3','3r','4r'],[1,2,3,3,4])

    for c, ordinal_dict in ordinal_maps.items():
        as_text = df[c].astype(str)
        # Assign back instead of an in-place replace on df[c], which would
        # act on a temporary Series under Copy-on-Write.
        df[c] = as_text.replace(ordinal_dict) if ordinal_dict else as_text

    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')
        df[c] = (df[c] - mean_std_df.loc['mean', c]) / mean_std_df.loc['std', c]
        # After standardisation the column mean is 0, so missing values are
        # filled with 0 (the raw mean would be on the wrong scale).
        df[c] = df[c].fillna(0)

    df.to_csv(os.path.join(evaluation_dir, file_name))

In [4]:
in_path = os.path.join('data','5_ordinal_mean_tensor','evaluation')
out_path = os.path.join('data','5_ordinal_mean_tensor')

# Read all standardised per-race files, then concatenate once (concat inside
# the loop re-copies the accumulated frame on every iteration). Sorted so the
# row order of all_races.csv is reproducible.
evaluation_frames = [
    pd.read_csv(os.path.join(in_path, file_name), index_col=0)
    for file_name in sorted(os.listdir(in_path))
    if not file_name.startswith(".")
]

# pd.concat raises on an empty list, so fall back to an empty frame.
entire_df = pd.concat(evaluation_frames) if evaluation_frames else pd.DataFrame()

entire_df.to_csv(os.path.join(out_path, 'all_races.csv'))

In [12]:
in_path = os.path.join("data","5_ordinal_mean_tensor")
tensor_dir = os.path.join(in_path, 'finish_time')

# Reload the saved feature/target tensors and view them as NumPy arrays.
x_tensor = torch.load(os.path.join(tensor_dir, "x_tensor"))
y_tensor = torch.load(os.path.join(tensor_dir, "y_tensor"))

x_np, y_np = x_tensor.numpy(), y_tensor.numpy()

# Sanity check: print whether any NaN is present, features first, then target.
for array in (x_np, y_np):
    print(np.isnan(array).any())


False
False
