# Libraries

In [34]:
import joblib
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Prepare Data

In [35]:
# Load from pkl
xall, yall, harai = joblib.load("./data.pkl")

In [36]:
# Input features
xall

Unnamed: 0,raceid,horseid,racedate,futan,umaban,wakuban,blinker,age,bataijyu,zogen,...,grade_5,wintime_5,lap_s3_5,lap_s4_5,lap_l3_5,lap_l4_5,f,ff,m,mf
0,2020010506010101,2017103291,20200105,54.0,1,1,0,3.0,432.0,4.0,...,,,,,,,1.120002e+09,1.120002e+09,1.220057e+09,1.120002e+09
1,2020010506010101,2017101861,20200105,54.0,2,1,0,3.0,424.0,-4.0,...,,,,,,,1.120002e+09,1.140004e+09,1.220063e+09,1.120002e+09
2,2020010506010101,2017103186,20200105,54.0,3,2,0,3.0,458.0,0.0,...,,,,,,,1.120002e+09,1.120002e+09,1.220061e+09,1.140006e+09
3,2020010506010101,2017102095,20200105,53.0,4,2,0,3.0,464.0,10.0,...,,,,,,,1.120002e+09,1.120002e+09,1.220062e+09,1.120002e+09
4,2020010506010101,2017103287,20200105,52.0,5,3,1,3.0,460.0,-2.0,...,,,,,,,1.120002e+09,1.140006e+09,1.220062e+09,1.140005e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143318,2022122809060912,2018105223,20221228,53.0,12,6,0,4.0,452.0,0.0,...,E,69.4,33.6,45.3,35.8,46.9,1.140007e+09,1.140005e+09,1.220065e+09,1.140006e+09
143319,2022122809060912,2017106137,20221228,56.0,13,7,0,5.0,492.0,6.0,...,E,67.3,32.8,44.1,34.5,45.4,1.120002e+09,1.140004e+09,1.220050e+09,1.120002e+09
143320,2022122809060912,2017100461,20221228,56.0,14,7,0,5.0,504.0,4.0,...,E,80.4,35.6,47.1,33.3,44.8,1.120002e+09,1.140004e+09,1.220055e+09,1.120002e+09
143321,2022122809060912,2018106389,20221228,52.0,15,8,0,4.0,434.0,2.0,...,E,68.4,33.7,45.0,34.7,45.7,1.120002e+09,1.140005e+09,1.220063e+09,1.120002e+09


In [37]:
# # Output feature: finishing position (0: race not finished)
yall

0          2
1         12
2          3
3         11
4          8
          ..
143318    12
143319    16
143320    11
143321     7
143322     8
Name: jyuni, Length: 143323, dtype: int64

In [38]:
# Payout
harai

Unnamed: 0,RaceID,TorokuTosu,SyussoTosu,FuseirituFlag,TokubaraiFlag,HenkanFlag,HenkanUma,HenkanWaku,HenkanDoWaku,PayTansyo,...,PayReserved1,PayUmatan,PaySanrenpuku,PaySanrentan,Year,MonthDay,JyoCD,Kaiji,Nichiji,RaceNum
0,2020010606010203,16,16,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]","[{'Umaban': '09', 'Pay': '000000200', 'Ninki':...",...,"[{'Kumi': ' ', 'Pay': ' ', 'Ninki':...","[{'Kumi': '0913', 'Pay': '000000510', 'Ninki':...","[{'Kumi': '080913', 'Pay': '000001270', 'Ninki...","[{'Kumi': '091308', 'Pay': '000003840', 'Ninki...",2020,0106,06,01,02,03
1,2022010507010112,16,16,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]","[{'Umaban': '13', 'Pay': '000001530', 'Ninki':...",...,"[{'Kumi': ' ', 'Pay': ' ', 'Ninki':...","[{'Kumi': '1315', 'Pay': '000004070', 'Ninki':...","[{'Kumi': '061315', 'Pay': '000002720', 'Ninki...","[{'Kumi': '131506', 'Pay': '000026590', 'Ninki...",2022,0105,07,01,01,12
2,2020010506010101,16,16,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]","[{'Umaban': '07', 'Pay': '000000360', 'Ninki':...",...,"[{'Kumi': ' ', 'Pay': ' ', 'Ninki':...","[{'Kumi': '0701', 'Pay': '000003650', 'Ninki':...","[{'Kumi': '010307', 'Pay': '000098210', 'Ninki...","[{'Kumi': '070103', 'Pay': '000280650', 'Ninki...",2020,0105,06,01,01,01
3,2020010606010204,16,16,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]","[{'Umaban': '12', 'Pay': '000000280', 'Ninki':...",...,"[{'Kumi': ' ', 'Pay': ' ', 'Ninki':...","[{'Kumi': '1211', 'Pay': '000003040', 'Ninki':...","[{'Kumi': '031112', 'Pay': '000000920', 'Ninki...","[{'Kumi': '121103', 'Pay': '000008340', 'Ninki...",2020,0106,06,01,02,04
4,2020010606010205,16,16,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]","[{'Umaban': '07', 'Pay': '000000380', 'Ninki':...",...,"[{'Kumi': ' ', 'Pay': ' ', 'Ninki':...","[{'Kumi': '0716', 'Pay': '000001730', 'Ninki':...","[{'Kumi': '070916', 'Pay': '000002130', 'Ninki...","[{'Kumi': '071609', 'Pay': '000009690', 'Ninki...",2020,0106,06,01,02,05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10363,2022122806050903,16,16,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]","[{'Umaban': '01', 'Pay': '000000600', 'Ninki':...",...,"[{'Kumi': ' ', 'Pay': ' ', 'Ninki':...","[{'Kumi': '0107', 'Pay': '000004600', 'Ninki':...","[{'Kumi': '010307', 'Pay': '000023600', 'Ninki...","[{'Kumi': '010703', 'Pay': '000094870', 'Ninki...",2022,1228,06,05,09,03
10364,2022122806050902,16,16,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]","[{'Umaban': '04', 'Pay': '000000750', 'Ninki':...",...,"[{'Kumi': ' ', 'Pay': ' ', 'Ninki':...","[{'Kumi': '0406', 'Pay': '000030980', 'Ninki':...","[{'Kumi': '040611', 'Pay': '000013110', 'Ninki...","[{'Kumi': '040611', 'Pay': '000138570', 'Ninki...",2022,1228,06,05,09,02
10365,2022122809060911,13,13,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]","[{'Umaban': '04', 'Pay': '000000520', 'Ninki':...",...,"[{'Kumi': ' ', 'Pay': ' ', 'Ninki':...","[{'Kumi': '0410', 'Pay': '000006990', 'Ninki':...","[{'Kumi': '041012', 'Pay': '000004590', 'Ninki...","[{'Kumi': '041012', 'Pay': '000027420', 'Ninki...",2022,1228,09,06,09,11
10366,2022122806050911,18,18,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]","[{'Umaban': '11', 'Pay': '000009060', 'Ninki':...",...,"[{'Kumi': ' ', 'Pay': ' ', 'Ninki':...","[{'Kumi': '1108', 'Pay': '000175230', 'Ninki':...","[{'Kumi': '081115', 'Pay': '000232970', 'Ninki...","[{'Kumi': '110815', 'Pay': '002466010', 'Ninki...",2022,1228,06,05,09,11


In [39]:
'''
Preprocess data
'''

# Convert string values to float
for col in xall.columns:
    if xall[col].dtype == "object":
        # Convert to numeric where possible
        xall[col] = pd.to_numeric(xall[col], errors='coerce')
        # Replace NaN (from non-numeric values) with 0
        xall[col] = xall[col].fillna(0)
        print(f"Converted column: {col}")

Converted column: KisyuCode
Converted column: TozaiCD
Converted column: ChokyosiCode
Converted column: BanusiCode
Converted column: course
Converted column: cls
Converted column: grade
Converted column: BreederCode
Converted column: KisyuCode_1
Converted column: course_1
Converted column: cls_1
Converted column: grade_1
Converted column: KisyuCode_2
Converted column: course_2
Converted column: cls_2
Converted column: grade_2
Converted column: KisyuCode_3
Converted column: course_3
Converted column: cls_3
Converted column: grade_3
Converted column: KisyuCode_4
Converted column: course_4
Converted column: cls_4
Converted column: grade_4
Converted column: KisyuCode_5
Converted column: course_5
Converted column: cls_5
Converted column: grade_5


In [40]:
'''
Split into Train-Test data
'''
# Number of days to use of testing
TEST_DAYS = 90

# Date at which the data splits
rd = xall["racedate"].unique()[-TEST_DAYS] 

# split data
xtrain = xall[xall['racedate']<rd].reset_index(drop=True).copy()
ytrain = yall[xtrain.index].reset_index(drop=True).copy()
xtest = xall[xall['racedate']>=rd].reset_index(drop=True).copy()
ytest = yall[xtest.index].reset_index(drop=True).copy()

In [41]:
print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

(104436, 224) (104436,) (38887, 224) (38887,)


# Winner prediction

## Models

In [None]:
model_prediction_winner = {}

### Light Gradient Boosting Machine (Binary)

In [10]:
# Binarize outputs (simplify ranking to win / lose)
ytrain_lgb = ytrain.apply(lambda x: 1 if x==1 else 0)
ytest_lgb = ytest.apply(lambda x: 1 if x==1 else 0)

In [None]:
# Training parameters
num_boost_round = 100
early_stopping_round = 10
lgb_train_parameters = {
 'max_depth': 10,
 'min_data_in_leaf': 50,
 'learning_rate': 0.01,
 'seed': 1,
 'objective': 'binary',
 'metric': 'binary_logloss',
}

In [12]:
# Validation data (training_data=validation_data）
xtrain_val = xtrain.copy()
ytrain_val_lgb = ytrain_lgb.copy()
xtest_val = xtrain.copy()
ytest_val_lgb = ytrain_lgb.copy()

In [13]:
# Prepare dataset for LGB model
train_data_lgb = lgb.Dataset(xtrain_val, label=ytrain_val_lgb) 
valid_data_lgb = lgb.Dataset(xtest_val, label=ytest_val_lgb)

In [None]:
# Train a LGB model on our dataset
gbm = lgb.train(lgb_train_parameters, 
                train_data_lgb,
                valid_sets=[valid_data_lgb],
                num_boost_round=num_boost_round,
                callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_round, 
                            verbose=1)]
               )

[LightGBM] [Info] Number of positive: 7532, number of negative: 96904
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034184 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17338
[LightGBM] [Info] Number of data points in the train set: 104436, number of used features: 212
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.072121 -> initscore=-2.554560
[LightGBM] [Info] Start training from score -2.554560
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.210151


In [None]:
# Predict
ypred = gbm.predict(xtest)
model_prediction_winner['LightGBM'] = ypred

### XGBoost (Binary)

In [58]:
xgb_train_param = {
    "n_estimators":2,
    "max_depth":2,
    "learning_rate":1,
    "objective":'binary:logistic'
}

In [None]:
bst = xgb.XGBClassifier(xgb_train_param)
# fit model
bst.fit(xtrain, ytrain)
# make predictions
ypred = bst.predict(xtest)
model_prediction_winner['XGBoost'] = ypred



### Catboost (Binary)

### Random Forest Classifier

### Logistic Regression

## Evaluation

### Tansyo

In [49]:
def calc_mrr(ypred, ytrue):
    '''
    Args:
        ypred (numpy.array): predicted ranks (1 or higher) [n, MAX_HORSE]
        ytrue (numpy.array): final confirmed ranks [n, MAX_HORSE]
    
    Returns:
        mrr (int)
    '''
    res = []
    for i in range(ytrue.shape[0]):
        # Which rank did we predict for the horse that finished 1st?
        # If there is a tie, take the smallest value
        indices = np.where(ytrue[i, :] == 1) 
        if indices[0].size > 0:  # There may be races without a 1st-place horse before the start
            res.append(1 / ypred[i, np.min(indices)])
    return np.mean(res)


In [48]:
def get_reports(df_res):
    """
    Summary: 
        Outputs a report of the win predictions.

    Args:
        df_res (pd.DataFrame): Aggregated results from get_win_results

    Returns:
        None

    Note:
        Races with refunds are excluded from the aggregation
    """

    # Hit rate (accuracy)
    tekichu = df_res.apply(lambda x: sum([y in x["true"] for y in x["pred"]]) >= 1, axis=1).astype(int).values
    print("Number of races", df_res.shape[0])
    print("Number of hits", np.sum(tekichu))
    print("Hit rate", np.mean(tekichu), np.std(tekichu, ddof=1), np.std(tekichu, ddof=1) / np.sqrt(df_res.shape[0]))

    # Return rate (profitability)
    modoshi = df_res.apply(lambda x: sum([x["pay"][y] if y in x["true"] else 0 for y in x["pred"]]) + x["henkan"]*100, axis=1).values
    harai = df_res.apply(lambda x: len(x["pred"]), axis=1).values * 100
    print("Total payout", np.sum(modoshi))
    print("Return rate", np.mean(modoshi/harai), np.std(modoshi/harai, ddof=1), np.std(modoshi/harai, ddof=1) / np.sqrt(df_res.shape[0]))
    print("※Mean, standard deviation, standard error")


In [50]:
def get_win_results(df_bet, df_kekka):
    """
    Summary:
        Takes single-win bets as input, merges them with payout data, and outputs the result.

    Args:
        df_bet (pd.DataFrame): Data containing raceid and win predictions
        df_kekka (pd.DataFrame): Payout data extracted from JRA-VAN

    Returns:
        df_results (pd.DataFrame): Aggregated data for each race

    Note:
        Because ties can occur, 'true' is a list and 'pay' is a dict
    """

    # Aggregate by raceid
    res = []
    for raceid in df_bet["raceid"].tolist():
        # Get predictions
        pred = df_bet[df_bet["raceid"]==raceid]["win"].values[0]  # list of predicted horses
        # Check win or loss
        try:
            tmp = df_kekka[df_kekka.raceid==raceid].iloc[0].to_dict()  # target race
        except:
            print("Payout data for this race does not exist.", raceid)
            continue

        # Actual winners (true) and their payouts (pay)
        true = [x["Umaban"] for x in tmp["PayTansyo"] if x["Umaban"] != '  ']  # list because ties possible
        pay = {x["Umaban"]: int(x["Pay"]) for x in tmp["PayTansyo"] if x["Umaban"] != '  '}
        henkan = sum([int(tmp["HenkanUma"][x-1]) for x in pred])  # refunded amounts
        pred = [f"{x:02}" for x in pred]  # convert to string

        res.append({"pred": pred, "true": true, "pay": pay, "henkan": henkan, "raceid": raceid})     

    df_res = pd.DataFrame(res)
    get_reports(df_res)

    return df_res


In [51]:
def evaluate(xtest, ytest, ypred, harai):
    """
    Evaluate predictions for horse races.

    Args:
        xtest (pd.DataFrame): Test data containing race and horse info
        ytest (np.array): True ranks of horses
        ypred (np.array): Predicted scores for horses
        harai (pd.DataFrame): Payout data

    Returns:
        df_bet (pd.DataFrame): Betting predictions per race
        df_res (pd.DataFrame): Aggregated results with payouts
    """

    # 'jyuni': store the true ranks in a matrix of shape (num_races × 18)
    tmp = xtest.copy()
    tmp["jyuni"] = ytest
    tmp = tmp.groupby("raceid", as_index=False).apply(lambda x: x["jyuni"].values)
    jyuni = np.array([[tmp[i][j] if j < len(tmp[i]) else j+1 for j in range(18)] 
                      for i in range(tmp.shape[0])])  # padding for missing horses
    print("jyuni\n", jyuni)

    # 'pred': store the predicted ranks in a matrix of shape (num_races × 18) 
    # sorted by descending score
    tmp = xtest.copy()
    tmp["pred"] = ypred
    tmp = tmp.groupby("raceid", as_index=False).apply(
        lambda x: np.argsort(np.argsort(-x["pred"].values)) + 1
    )
    pred = np.array([[tmp[i][j] if j < len(tmp[i]) else j+1 for j in range(18)] 
                     for i in range(tmp.shape[0])])  # padding
    print("pred\n", pred)

    # Compute MRR (Mean Reciprocal Rank)
    mrr = calc_mrr(pred, jyuni)
    print("mrr\n", mrr)
        
    # df_bet: extract unique raceid rows
    df_bet = xtest[["raceid"]].groupby("raceid", as_index=False).apply(lambda x: x.iloc[0])

    # win: buy the horse ranked #1 by predicted score (single win)
    kaime = [[np.where(pred[i, :] == 1)[0][0] + 1] for i in range(pred.shape[0])]
    df_bet["win"] = kaime

    # Align payout data
    harai = harai.copy()
    harai.columns = [x.replace("RaceID", "raceid") for x in harai.columns]
    harai.raceid = harai.raceid.astype(int)

    # Compute and display results
    df_res = get_win_results(df_bet, harai)

    return df_bet, df_res


In [None]:
# Evaluate every model
results = {}

for model_name, model_ypred in model_prediction_winner.items():
    print(f"---- Evaluating {model_name} ----")
    df_bet, df_res = evaluate(xtest, yall[xtest.index].reset_index(drop=True).copy(), model_ypred, harai)
    results[model_name] = {"df_bet": df_bet, "df_res": df_res}

---- Evaluating XGBoost ----


  tmp = tmp.groupby("raceid", as_index=False).apply(lambda x: x["jyuni"].values)


jyuni
 [[ 2 12  3 ... 14 17 18]
 [ 8  2  4 ...  3 17 18]
 [ 4 12  5 ...  3 17 18]
 ...
 [12 13  2 ... 16 17 18]
 [ 0 10  7 ... 16 17 18]
 [ 2  3  8 ... 12 17 18]]


  tmp = tmp.groupby("raceid", as_index=False).apply(
  df_bet = xtest[["raceid"]].groupby("raceid", as_index=False).apply(lambda x: x.iloc[0])


pred
 [[10  8 11 ... 13 17 18]
 [ 5  1 12 ...  9 17 18]
 [ 8  5  4 ... 13 17 18]
 ...
 [12 11  1 ... 16 17 18]
 [ 7  4 11 ... 16 17 18]
 [ 8 16  7 ... 13 17 18]]
mrr
 0.23836750996270778
Number of races 2856
Number of hits 17
Hit rate 0.005952380952380952 0.07693518441053837 0.0014396134567519108
Total payout 61060
Return rate 0.21379551820728293 3.3596094998969406 0.06286511278993824
※Mean, standard deviation, standard error


### Fukusyo

# Prediction - Ranking

## Models

In [None]:
model_prediction_ranking = {}

### LightGBM (rank:pair-wise)

### XGBoost (rank:pair-wise)

### Catboost (rank:pair-wise)

### Random Forest Classifier

## Evaluation

### Tansyo

### Fukusyo

### Wakuren

### Umaren

### Wide

# Prediction - Time

## Models

In [None]:
model_prediction_time = {}

### LightGBM Regressor

### XGBoost Regressor

### Catboost Regressor

### Neural Network Regressor

## Evaluation

### Tansyo 

### Fukusyo

### Wakuren

### Umaren

### Wide

# Results - Methods comparison