In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


# Load the cleaned per-runner performance table (one row per horse per race).
perform = pd.read_csv('./horse/data/perform_clean.csv', sep=',', encoding='utf-8')

# Regression target: race distance divided by the runner's finish time.
perform['speed'] = perform['distance'] / perform['finish_time']
# Classification target: 1 when the runner placed first, 0 otherwise.
# Vectorised comparison instead of a per-row Python lambda; int64 keeps the 0/1 encoding.
perform['is_champ'] = (perform['pla'] == 1).astype('int64')

# Columns that are targets (or derived from the race result) and must never be used as features.
y_cols = ['is_champ', 'pla', 'finish_time', 'speed']
# Race identifier / date columns: used for grouping and splitting, not as features.
date_cols = ['race_key', 'race_date']


def get_x_cols(x):
    """Return the feature columns contained in ``x``.

    Parameters
    ----------
    x : iterable of str
        Column names, e.g. ``DataFrame.columns``.

    Returns
    -------
    list of str
        The names from ``x``, in their original order, that appear in neither
        ``y_cols`` nor ``date_cols`` (both are looked up at call time).
    """
    excluded = set(y_cols) | set(date_cols)
    return [col for col in x if col not in excluded]


x_cols = get_x_cols(perform.columns)

perform.head(5)

Unnamed: 0,race_key,race_date,dr,distance,field_going,course_type,race_money,act_wt,declare_horse_wt,win_odds,horse,jockey,trainer,is_champ,pla,finish_time,speed
0,2015/04/22_1,2015/04/22,2,1000,好地至快地,草地,575000,120,1186,7.3,有情風(N139),薛寶力,吳定強,1,1,57.37,17.430713
1,2015/04/22_1,2015/04/22,3,1000,好地至快地,草地,575000,132,1022,6.1,樂趣(S150),韋達,霍利時,0,2,57.53,17.382235
2,2015/04/22_1,2015/04/22,5,1000,好地至快地,草地,575000,121,1085,48.0,穩佔先機(N359),連達文,苗禮德,0,3,57.78,17.307027
3,2015/04/22_1,2015/04/22,7,1000,好地至快地,草地,575000,127,1211,7.8,上浦勇將(P285),田泰安,徐雨石,0,4,57.82,17.295054
4,2015/04/22_1,2015/04/22,1,1000,好地至快地,草地,575000,124,1088,14.0,大地王者(L251),黎海榮,李易達,0,5,57.89,17.274141


In [2]:
# (number of rows, number of columns) of the loaded table
perform.shape

(54436, 17)

In [2]:
# Total of the 0/1 `is_champ` flag, i.e. how many rows are marked as race winners.
perform['is_champ'].sum()

4532

In [12]:
from horse.process import train_test_split

# One-hot encode every categorical column (going, course, horse, jockey, trainer).
dm_perform = pd.get_dummies(
    perform,
    columns=['field_going', 'course_type', 'horse', 'jockey', 'trainer'],
)

# Two successive splits keyed on 'race_date': first carve the test set out of the full
# table, then carve the validation set out of what remains.
# NOTE(review): presumably 0.1 is the held-out fraction and the split is chronological;
# confirm against horse.process.train_test_split.
dm_perform_train, dm_perform_test = train_test_split(dm_perform, 'race_date', 0.1)
dm_perform_train, dm_perform_val = train_test_split(dm_perform_train, 'race_date', 0.1)

# Shapes of the full / train / validation / test frames.
(dm_perform.shape, dm_perform_train.shape, dm_perform_val.shape, dm_perform_test.shape)

((54436, 4682), (43732, 4682), (4983, 4682), (5721, 4682))

In [13]:
# Loading toolkits

from horse.process import AveragePrecision, racing_champ

def remove_odds(x):
    """Return the entries of ``x`` as a list, leaving out every 'win_odds' entry."""
    return list(filter(lambda name: name != 'win_odds', x))

def comupte_champ(df, model, x_cols, kind='clf', way='max'):
    """Pick the predicted champion(s) of every race in ``df``.

    Each runner is scored with ``model``; within every ``race_key`` group the
    runner(s) whose score equals the group's extreme value are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'race_key'``, ``'dr'`` and every column in ``x_cols``.
        It is not modified.
    model : fitted estimator
        Needs ``predict_proba`` when ``kind='clf'`` and ``predict`` when
        ``kind='reg'``.
    x_cols : list of str
        Feature columns handed to the model.
    kind : {'clf', 'reg'}
        ``'clf'`` scores with column 1 of ``predict_proba`` (assumed to be the
        probability of the positive ``is_champ`` class); ``'reg'`` scores with
        ``predict``.
    way : {'max', 'min'}
        Whether the highest or the lowest score wins. The default is ``'max'``
        so that a classifier called with the defaults selects the most likely
        winner; pass ``'min'`` for targets where lower is better (finish time).

    Returns
    -------
    pandas.DataFrame
        Columns ``['race_key', 'dr']``, ordered by ``race_key`` with a fresh
        0..n-1 index. Ties are kept, so a race may contribute several rows.

    Raises
    ------
    ValueError
        If ``kind`` or ``way`` is not one of the supported values.
    """
    if kind not in ('clf', 'reg'):
        raise ValueError(f"kind must be 'clf' or 'reg', got {kind!r}")
    if way not in ('min', 'max'):
        raise ValueError(f"way must be 'min' or 'max', got {way!r}")

    X = df[x_cols]
    # Explicit copy: the score column is added to a private frame, never to a slice of ``df``.
    result = df[['race_key', 'dr']].copy()

    if kind == 'clf':
        result['win'] = model.predict_proba(X)[:, 1]
    else:
        result['win'] = model.predict(X)

    # Per-race extreme broadcast back onto every runner of that race. This avoids
    # groupby.apply, whose handling of the grouping column changes across pandas versions.
    best = result.groupby('race_key')['win'].transform(way)
    picked = result.loc[result['win'] == best, ['race_key', 'dr']]

    # Stable sort: races come out in key order, tied runners keep their original order.
    return picked.sort_values('race_key', kind='mergesort').reset_index(drop=True)

# 1 Racing as Classification

## 1.1 Considering odds

In [35]:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# data: every feature column, including the market odds
x_cols = get_x_cols(dm_perform_train.columns)

X, y = dm_perform_train[x_cols], dm_perform_train['is_champ']

# Fixed seed so the stochastic learners give the same scores on every run.
SEED = 42

# candidate models
models = {
    'LogisticRegression': LogisticRegression(random_state=SEED),
    'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=50, random_state=SEED),
    'RandomForestClassifier': RandomForestClassifier(max_depth=50, random_state=SEED),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=SEED),
}

# Reference champions for the validation races
# (presumably the actual winner of each race; see horse.process.racing_champ).
val_target = racing_champ(dm_perform_val)

for name, model in models.items():
    # train on training set
    model.fit(X, y)
    print(f'DONE {name}.')
    # eval on validation set: the predicted champion is the runner with the HIGHEST
    # win probability, so the direction is passed explicitly instead of depending on
    # whichever default the currently defined comupte_champ happens to have.
    pred = comupte_champ(dm_perform_val, model, x_cols, kind='clf', way='max')
    print(f'AP for {name}: {round(AveragePrecision(input=pred, target=val_target), 4)}')

DONE LogisticRegression.
AP for LogisticRegression: 0.0729
DONE DecisionTreeClassifier.
AP for DecisionTreeClassifier: 0.1913
DONE RandomForestClassifier.
AP for RandomForestClassifier: 0.3012
DONE AdaBoostClassifier.
AP for AdaBoostClassifier: 0.3085


## 1.2 Not considering odds

In [37]:


# data: same features as before, minus the market odds
x_cols = get_x_cols(dm_perform_train.columns)
# Computed once and reused for both fitting and evaluation so the two cannot drift apart.
x_cols_no_odds = remove_odds(x_cols)

X, y = dm_perform_train[x_cols_no_odds], dm_perform_train['is_champ']

# Fixed seed so the stochastic learners give the same scores on every run.
SEED = 42

# candidate models
models = {
    'LogisticRegression': LogisticRegression(random_state=SEED),
    'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=50, random_state=SEED),
    'RandomForestClassifier': RandomForestClassifier(max_depth=50, random_state=SEED),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=SEED),
}

# Reference champions for the validation races
# (presumably the actual winner of each race; see horse.process.racing_champ).
val_target = racing_champ(dm_perform_val)

for name, model in models.items():
    # train on training set
    model.fit(X, y)
    print(f'DONE {name}.')
    # eval on validation set: the predicted champion is the runner with the HIGHEST
    # win probability, so the direction is passed explicitly instead of depending on
    # whichever default the currently defined comupte_champ happens to have.
    pred = comupte_champ(dm_perform_val, model, x_cols_no_odds, kind='clf', way='max')
    print(f'AP for {name}: {round(AveragePrecision(input=pred, target=val_target), 4)}')

DONE LogisticRegression.
AP for LogisticRegression: 0.0612
DONE DecisionTreeClassifier.
AP for DecisionTreeClassifier: 0.1522
DONE RandomForestClassifier.
AP for RandomForestClassifier: 0.2612
DONE AdaBoostClassifier.
AP for AdaBoostClassifier: 0.2612


In [39]:
# Feature importances of the random forest currently stored in `models`, paired with
# the odds-free feature names it is expected to have been fitted on.
feature_names = remove_odds(x_cols)
importances = models['RandomForestClassifier'].feature_importances_

# zip() truncates silently, so a forest fitted on a different feature set (e.g. the
# with-odds run) would yield misaligned name/importance pairs without any error.
assert len(feature_names) == len(importances), (
    f'feature list has {len(feature_names)} names but the model reports '
    f'{len(importances)} importances; re-run the matching training cell first'
)

f_imp = list(zip(feature_names, importances))

# 15 most important features, highest importance first
sorted(f_imp, key=lambda x: x[1], reverse=True)[:15]

[('declare_horse_wt', 0.08764925930322659),
 ('dr', 0.08161732488194812),
 ('act_wt', 0.07377448163907029),
 ('race_money', 0.059149047024692925),
 ('jockey_莫雷拉', 0.043555496984477936),
 ('jockey_潘頓', 0.041526564416187746),
 ('distance', 0.03562557256518506),
 ('field_going_好地', 0.020147412997150265),
 ('field_going_好地至快地', 0.017774191899829983),
 ('horse_金鎗六十(C238)', 0.008236686048908834),
 ('trainer_蔡約翰', 0.006694206510603512),
 ('jockey_田泰安', 0.006171536962392108),
 ('field_going_好地至黏地', 0.005422213637029039),
 ('horse_美麗傳承(V380)', 0.00478976250798933),
 ('course_type_草地', 0.0044577475795249885)]

# 2 Racing as Regression

## 2.1 Regression on Finish Time

In [33]:
from horse.process import AveragePrecision, racing_champ

from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import time

# data: all features (odds included), target = finish time
x_cols = get_x_cols(dm_perform_train.columns)
X, y = dm_perform_train[x_cols], dm_perform_train['finish_time']

# Fixed seed so the stochastic learners give the same scores on every run.
SEED = 42

# candidate models
models2_1 = {
    'ridge_regression': Ridge(),
    'DecisionTreeRegressor': DecisionTreeRegressor(max_depth=15, random_state=SEED),
    'RandomForestRegressor': RandomForestRegressor(
        max_depth=15,
        n_estimators=25,
        min_samples_leaf=2,
        random_state=SEED,
    ),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=SEED),
}

# Reference champions for the validation races
# (presumably the actual winner of each race; see horse.process.racing_champ).
val_target = racing_champ(dm_perform_val)

for name, model in models2_1.items():
    # train on training set, timing the fit only
    t0 = time.time()
    model.fit(X, y)
    t1 = time.time()
    print(f'[{round(t1-t0, 3)} s] DONE {name}.')
    # eval on validation set: the lowest predicted finish time wins
    pred = comupte_champ(dm_perform_val, model, x_cols, kind='reg', way='min')
    print(f'AP for {name}: {round(AveragePrecision(input=pred, target=val_target), 4)}')
    

[15.375 s] DONE ridge_regression.
AP for ridge_regression: 0.1412
[8.986 s] DONE DecisionTreeRegressor.
AP for DecisionTreeRegressor: 0.1567
[185.004 s] DONE RandomForestRegressor.
AP for RandomForestRegressor: 0.2763
[1398.162 s] DONE AdaBoostRegressor.
AP for AdaBoostRegressor: 0.0921


In [34]:
from horse.process import AveragePrecision, racing_champ
import time
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# data: features without the market odds, target = finish time
x_cols = get_x_cols(dm_perform_train.columns)
# Computed once and reused for both fitting and evaluation so the two cannot drift apart.
x_cols_no_odds = remove_odds(x_cols)
X, y = dm_perform_train[x_cols_no_odds], dm_perform_train['finish_time']

# Fixed seed so the stochastic learners give the same scores on every run.
SEED = 42

# candidate models
models2_2 = {
    'ridge_regression': Ridge(),
    'DecisionTreeRegressor': DecisionTreeRegressor(max_depth=15, random_state=SEED),
    'RandomForestRegressor': RandomForestRegressor(
        max_depth=15,
        n_estimators=25,
        min_samples_leaf=2,
        random_state=SEED,
    ),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=SEED),
}

# Reference champions for the validation races
# (presumably the actual winner of each race; see horse.process.racing_champ).
val_target = racing_champ(dm_perform_val)

for name, model in models2_2.items():
    # train on training set, timing the fit only
    t0 = time.time()
    model.fit(X, y)
    t1 = time.time()
    print(f'[{round(t1-t0, 3)} s] DONE {name}.')
    # eval on validation set: the lowest predicted finish time wins
    pred = comupte_champ(dm_perform_val, model, x_cols_no_odds, kind='reg', way='min')
    print(f'AP for {name}: {round(AveragePrecision(input=pred, target=val_target), 4)}')
    

[12.693 s] DONE ridge_regression.
AP for ridge_regression: 0.1412
[6.841 s] DONE DecisionTreeRegressor.
AP for DecisionTreeRegressor: 0.0898
[94.198 s] DONE RandomForestRegressor.
AP for RandomForestRegressor: 0.1647
[1354.676 s] DONE AdaBoostRegressor.
AP for AdaBoostRegressor: 0.0889


In [30]:
def comupte_champ(df, model, x_cols, kind='clf', way='max'):
    """Pick the predicted champion(s) of every race in ``df``.

    Running this cell replaces any ``comupte_champ`` defined earlier in the
    kernel. Each runner is scored with ``model``; within every ``race_key``
    group the runner(s) whose score equals the group's extreme are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'race_key'``, ``'dr'`` and every column in ``x_cols``.
        It is not modified.
    model : fitted estimator
        Needs ``predict_proba`` when ``kind='clf'`` and ``predict`` when
        ``kind='reg'``.
    x_cols : list of str
        Feature columns handed to the model.
    kind : {'clf', 'reg'}
        ``'clf'`` scores with column 1 of ``predict_proba`` (assumed to be the
        probability of the positive ``is_champ`` class); ``'reg'`` scores with
        ``predict``.
    way : {'max', 'min'}
        Whether the highest (``'max'``, e.g. win probability or speed) or the
        lowest (``'min'``, e.g. finish time) score wins.

    Returns
    -------
    pandas.DataFrame
        Columns ``['race_key', 'dr']``, ordered by ``race_key`` with a fresh
        0..n-1 index. Ties are kept, so a race may contribute several rows.

    Raises
    ------
    ValueError
        If ``kind`` or ``way`` is not one of the supported values.
    """
    if kind not in ('clf', 'reg'):
        raise ValueError(f"kind must be 'clf' or 'reg', got {kind!r}")
    # A misspelt direction used to fall through to 'min' silently; fail loudly instead.
    if way not in ('min', 'max'):
        raise ValueError(f"way must be 'min' or 'max', got {way!r}")

    X = df[x_cols]
    # Explicit copy: the score column is added to a private frame, never to a slice of ``df``.
    result = df[['race_key', 'dr']].copy()

    if kind == 'clf':
        result['win'] = model.predict_proba(X)[:, 1]
    else:
        result['win'] = model.predict(X)

    # One code path for both directions: the per-race extreme is broadcast back onto
    # every runner of that race. This avoids groupby.apply, whose handling of the
    # grouping column changes across pandas versions.
    best = result.groupby('race_key')['win'].transform(way)
    picked = result.loc[result['win'] == best, ['race_key', 'dr']]

    # Stable sort: races come out in key order, tied runners keep their original order.
    return picked.sort_values('race_key', kind='mergesort').reset_index(drop=True)

In [37]:
from horse.process import AveragePrecision, racing_champ
import time
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# data: all features (odds included), target = speed
x_cols = get_x_cols(dm_perform_train.columns)
X, y = dm_perform_train[x_cols], dm_perform_train['speed']

# Fixed seed so the stochastic learners give the same scores on every run.
SEED = 42

# candidate models
models2_3 = {
    'ridge_regression': Ridge(),
    'DecisionTreeRegressor': DecisionTreeRegressor(max_depth=15, random_state=SEED),
    'RandomForestRegressor': RandomForestRegressor(
        max_depth=15,
        n_estimators=25,
        min_samples_leaf=2,
        random_state=SEED,
    ),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=SEED),
}

# Reference champions for the validation races
# (presumably the actual winner of each race; see horse.process.racing_champ).
val_target = racing_champ(dm_perform_val)

for name, model in models2_3.items():
    # train on training set, timing the fit only
    t0 = time.time()
    model.fit(X, y)
    t1 = time.time()
    print(f'[{round(t1-t0, 3)} s] DONE {name}.')
    # eval on validation set: the highest predicted speed wins
    pred = comupte_champ(dm_perform_val, model, x_cols, kind='reg', way='max')
    print(f'AP for {name}: {round(AveragePrecision(input=pred, target=val_target), 4)}')
    

[16.961 s] DONE ridge_regression.
AP for ridge_regression: 0.1318
[8.879 s] DONE DecisionTreeRegressor.
AP for DecisionTreeRegressor: 0.1473
[271.759 s] DONE RandomForestRegressor.
AP for RandomForestRegressor: 0.2974
[826.537 s] DONE AdaBoostRegressor.
AP for AdaBoostRegressor: 0.1018


In [36]:
from horse.process import AveragePrecision, racing_champ
import time
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# data: features without the market odds, target = speed
x_cols = get_x_cols(dm_perform_train.columns)
# Computed once and reused for both fitting and evaluation so the two cannot drift apart.
x_cols_no_odds = remove_odds(x_cols)
X, y = dm_perform_train[x_cols_no_odds], dm_perform_train['speed']

# Fixed seed so the stochastic learners give the same scores on every run.
SEED = 42

# candidate models
models2_4 = {
    'ridge_regression': Ridge(),
    'DecisionTreeRegressor': DecisionTreeRegressor(max_depth=15, random_state=SEED),
    'RandomForestRegressor': RandomForestRegressor(
        max_depth=15,
        n_estimators=25,
        min_samples_leaf=2,
        random_state=SEED,
    ),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=SEED),
}

# Reference champions for the validation races
# (presumably the actual winner of each race; see horse.process.racing_champ).
val_target = racing_champ(dm_perform_val)

for name, model in models2_4.items():
    # train on training set, timing the fit only
    t0 = time.time()
    model.fit(X, y)
    t1 = time.time()
    print(f'[{round(t1-t0, 3)} s] DONE {name}.')
    # eval on validation set: the highest predicted speed wins
    pred = comupte_champ(dm_perform_val, model, x_cols_no_odds, kind='reg', way='max')
    print(f'AP for {name}: {round(AveragePrecision(input=pred, target=val_target), 4)}')
    

[12.905 s] DONE ridge_regression.
AP for ridge_regression: 0.1341
[7.035 s] DONE DecisionTreeRegressor.
AP for DecisionTreeRegressor: 0.0899
[94.631 s] DONE RandomForestRegressor.
AP for RandomForestRegressor: 0.1846
[1110.578 s] DONE AdaBoostRegressor.
AP for AdaBoostRegressor: 0.0792
