## Horse racing prediction  

This is an experiment in predicting the outcome of horse races from each entry's past 5 race results, its jockey, and its trainer.



## Prepare data

In [1]:
# import pymongo
import os

import numpy as np
from pymongo import MongoClient
# Connect with PyMongo's default connection settings and take handles to the
# two collections used throughout the notebook.
client = MongoClient()

db = client['keiba']
training_data = db['training_data_Kisyu_Kyusya_1_race_5_with_odds']
data_models = db['data_models_Kisyu_Kyusya_1_race_5_with_odds']

In [2]:
# Collection statistics recorded from the mongo shell:
#
#   > db.training_data_Kisyu_Kyusya_1_race_5_with_odds.count({'input_x_count':105})
#   9245046
#   > db.training_data_Kisyu_Kyusya_1_race_5_with_odds.count()
#   9247112
#
# i.e. 2,066 records do not have all 105 features (reason unknown).
#
# The split is driven by the stored `random_index` field: documents with
# random_index > 19 feed training, documents with random_index < 20 feed
# validation. Adding 'input_x_count': 105 to these filters was tried and was
# too slow to use, so incomplete records are not filtered out here.
training_filter = {'random_index': {'$gt': 19}}
validation_filter = {'random_index': {'$lt': 20}}

# no_cursor_timeout=True keeps the server-side cursors alive during long runs.
training_data_cursor = training_data.find(training_filter, no_cursor_timeout=True)
validation_data_cursor = training_data.find(validation_filter, no_cursor_timeout=True)

In [3]:
# Number of documents behind each cursor; used later to derive the number of
# steps per epoch.
# NOTE(review): Cursor.count() is deprecated in newer PyMongo releases; when
# upgrading, switch to Collection.count_documents() with the same filters.
training_data_count = training_data_cursor.count()
validation_data_count = validation_data_cursor.count()

print("training_data_count: %s" % training_data_count)
print("validation_data_count: %s" % validation_data_count)

training_data_count: 7488116
validation_data_count: 1756930


In [4]:
# Load the normalisation statistics (means and standard deviations of the
# inputs and of the target). The batch generators read them from
# mean_and_std['mean_and_std'].
mean_and_std = data_models.find_one({})

# Fail fast: find_one() returns None when the collection holds no document,
# which would otherwise only surface later as a TypeError inside a generator.
if mean_and_std is None or 'mean_and_std' not in mean_and_std:
    raise RuntimeError(
        "no usable document with a 'mean_and_std' field found in the "
        "data_models collection")

In [5]:
# Mini-batch generator for Keras ``fit_generator`` (regression target).
#
# Error handling note: pymongo can raise
#   pymongo.errors.CursorNotFound: cursor id '…' not valid at server
# while iterating. Do not wrap the cursor reads in a broad try/except: such
# errors must stay visible. Only StopIteration (cursor exhausted) is handled
# below; every other exception propagates unchanged.


def data_generator(batch_size, data_type, n_features=105):
    """Yield normalised (inputs, targets) mini-batches indefinitely.

    Rows are read from the module-level ``validation_data_cursor`` when
    ``data_type == 'validation'`` and from ``training_data_cursor`` otherwise.
    Inputs and target are standardised with the statistics stored in the
    module-level ``mean_and_std`` document.

    Args:
        batch_size: number of rows per yielded batch.
        data_type: 'validation' selects the validation cursor; any other
            value selects the training cursor.
        n_features: width of the input matrix (defaults to 105).

    Yields:
        Tuple ``(input_X, target_Y)`` of float arrays with shapes
        ``(batch_size, n_features)`` and ``(batch_size, 1)``.

    Raises:
        ValueError: if the selected cursor contains no documents at all.
    """
    if data_type == 'validation':
        cursor = validation_data_cursor
    else:
        cursor = training_data_cursor

    stats = mean_and_std['mean_and_std']
    y_mean_value = stats['target_y_mean']
    y_std_value = stats['target_y_stddev']

    while True:
        # Allocate fresh arrays for every batch: batches already handed to the
        # consumer are never overwritten, and features missing from a record
        # stay at 0.0 instead of inheriting values from an earlier row.
        input_X = np.zeros(shape=(batch_size, n_features), dtype=float)
        target_Y = np.zeros(shape=(batch_size, 1), dtype=float)

        for row_idx in range(batch_size):
            try:
                row = next(cursor)
            except StopIteration:
                # End of the data: start over so the generator is endless, as
                # fit_generator expects. A batch may therefore mix the tail of
                # one pass with the head of the next.
                cursor.rewind()
                row = next(cursor, None)
                if row is None:
                    raise ValueError(
                        "no documents available for data_type={!r}".format(data_type))

            # Standardise every feature present in this record.
            for feature_key, raw_x in row['input_x_object'].items():
                mean_value = stats['input_x_avg_' + feature_key]
                std_value = stats['input_x_std_' + feature_key]
                input_X[row_idx, int(feature_key)] = (raw_x - mean_value) / std_value

            # Standardise the target.
            target_Y[row_idx, 0] = (row['target_y'] - y_mean_value) / y_std_value

        # Yield once per *completed* batch (not once per row).
        yield (input_X, target_Y)


In [6]:
# Mini-batch generator for Keras ``fit_generator`` with a binary target:
# y is 1 when the stored target_y is >= 0, otherwise 0.

def data_generator_binary(batch_size, data_type, n_features=105):
    """Yield (normalised inputs, binary targets) mini-batches indefinitely.

    Rows are read from the module-level ``validation_data_cursor`` when
    ``data_type == 'validation'`` and from ``training_data_cursor`` otherwise.
    Inputs are standardised with the statistics stored in the module-level
    ``mean_and_std`` document; the target is binarised, not standardised.

    Args:
        batch_size: number of rows per yielded batch.
        data_type: 'validation' selects the validation cursor; any other
            value selects the training cursor.
        n_features: width of the input matrix (defaults to 105).

    Yields:
        Tuple ``(input_X, target_Y)`` of float arrays with shapes
        ``(batch_size, n_features)`` and ``(batch_size, 1)``; ``target_Y``
        holds 1.0 where ``target_y >= 0`` and 0.0 elsewhere.

    Raises:
        ValueError: if the selected cursor contains no documents at all.
    """
    if data_type == 'validation':
        cursor = validation_data_cursor
    else:
        cursor = training_data_cursor

    stats = mean_and_std['mean_and_std']

    while True:
        # Allocate fresh arrays for every batch: batches already handed to the
        # consumer are never overwritten, and features missing from a record
        # stay at 0.0 instead of inheriting values from an earlier row.
        input_X = np.zeros(shape=(batch_size, n_features), dtype=float)
        target_Y = np.zeros(shape=(batch_size, 1), dtype=float)

        for row_idx in range(batch_size):
            try:
                row = next(cursor)
            except StopIteration:
                # End of the data: start over so the generator is endless, as
                # fit_generator expects. A batch may therefore mix the tail of
                # one pass with the head of the next.
                cursor.rewind()
                row = next(cursor, None)
                if row is None:
                    raise ValueError(
                        "no documents available for data_type={!r}".format(data_type))

            # Standardise every feature present in this record.
            for feature_key, raw_x in row['input_x_object'].items():
                mean_value = stats['input_x_avg_' + feature_key]
                std_value = stats['input_x_std_' + feature_key]
                input_X[row_idx, int(feature_key)] = (raw_x - mean_value) / std_value

            # Binarise the target.
            target_Y[row_idx, 0] = 1 if row['target_y'] >= 0 else 0

        # Yield once per *completed* batch (not once per row).
        yield (input_X, target_Y)


## Create model 

In [7]:
# import dependencies
import keras
from keras import metrics, initializers

from keras_tqdm import TQDMNotebookCallback
from keras.layers import Dropout, Dense, LeakyReLU, BatchNormalization, Activation
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD, Adam, RMSprop


Using TensorFlow backend.
  return f(*args, **kwds)


In [8]:
# Model 1: three hidden Dense layers (128 -> 256 -> 128), each followed by
# LeakyReLU(alpha=0.3) and Dropout(0.2), then a single linear output unit.
model_1 = Sequential([
    Dense(128, input_shape=(105,), activation=None),
    LeakyReLU(alpha=0.3),
    Dropout(0.2),

    Dense(256, activation=None),
    LeakyReLU(alpha=0.3),
    Dropout(0.2),

    Dense(128, activation=None),
    LeakyReLU(alpha=0.3),
    Dropout(0.2),

    Dense(1, activation=None),
])

# Regression set-up: mean squared error loss, MAE reported as a metric.
model_1.compile(
    loss='mean_squared_error',
    optimizer='rmsprop',
    metrics=[metrics.mae],
)

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               13568     
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 128)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               33024     
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               32896     
__________

In [9]:
# Train model_1.

# Save the model under results/ through a ModelCheckpoint callback. The
# directory is created up front so that a missing folder cannot abort the run
# when the first checkpoint is written.
save_model_name = "keiba_model_1.h5"
os.makedirs('results', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='results/'+save_model_name, verbose=0)

minibatch_size = 32

# Step counts are derived from the document counts (integer division).
steps_per_epoch = training_data_count // minibatch_size
validation_steps = validation_data_count // minibatch_size

model_1.fit_generator(generator=data_generator(batch_size=minibatch_size, data_type='training'),
                    steps_per_epoch=steps_per_epoch,
                    validation_data=data_generator(batch_size=minibatch_size, data_type='validation'),
                    validation_steps=validation_steps,
                    epochs=20,
                    callbacks=[checkpointer])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
{'_id': 'Kisyu_Kyusya_1_race_5_with_odds-20140126071120071027752007104975', 'data_name': 'Kisyu_Kyusya_1_race_5_with_odds', 'input_x': [1800, 3125, 7, 560, 500, -2, 0.1863905325443787, 0.10754414125200643, 1228, 7, 590, 502, -2, 15, 377, 10, 1800, 692, 6, 570, 504, 18, 12, 368, 7, 1800, 911, 6, 590, 486, -12, 12, 364, 8, 1800, 192, 6, 560, 498, -8, 15, 365, 9, 1700, 165, 6, 590, 506, 10, 16, 380, 8, 1700, 3125, 7, 560, 500, -2, 0.1683673469387755, 0.1669024045261669, 716, 7, 570, 518, -2, 22, 395, 13, 1800, 1664, 6, 560, 520, -2, 8, 343, 12, 1800, 978, 6, 560, 522, -8, 7, 331, 13, 1200, 128, 6, 570, 530, 4, 10, 351, 10, 1800, 814, 6, 570, 526, 6, 10, 364, 12, 1600], 'target_y': -2, 'input_x_object': {'0': 1800, '1': 3125, '2': 7, '3': 560, '4': 500, '5': -2, 

<keras.callbacks.History at 0x7fb05c083f28>

## Model 2  


In [14]:
# Model 2: a smaller network with two hidden Dense layers (64 -> 32), each
# followed by LeakyReLU(alpha=0.2) and Dropout(0.2), then a linear output unit.
model_2 = Sequential()

# The first layer uses small random-normal initial weights and biases.
# Alternative initializer kept for reference:
#   keras.initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=None)
model_2.add(Dense(64,
                  input_shape=(105,),
                  activation=None,
                  bias_initializer=initializers.RandomNormal(stddev=0.001),
                  kernel_initializer=initializers.RandomNormal(stddev=0.001)))
model_2.add(LeakyReLU(alpha=0.2))
model_2.add(Dropout(0.2))

model_2.add(Dense(32, activation=None))
model_2.add(LeakyReLU(alpha=0.2))
model_2.add(Dropout(0.2))

model_2.add(Dense(1, activation=None))

# The optimizer actually used is the Adam instance passed to compile() below.
# (A stand-alone SGD(lr=0.001, ...) instance used to be constructed here and
# immediately discarded; it had no effect on training and has been removed.)
# Other optimizer configurations kept for reference:
#   SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
#   Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
#   RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

model_2.compile(optimizer=Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
              loss='mean_squared_error',
              metrics=[metrics.mae])

model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 64)                6784      
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 64)                0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 32)                2080      
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU)    (None, 32)                0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total para

In [15]:
# Train model_2.

# Save the model under results/ through a ModelCheckpoint callback. The
# directory is created up front so that a missing folder cannot abort the run
# when the first checkpoint is written.
save_model_name = "keiba_model_2.h5"
os.makedirs('results', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='results/'+save_model_name, verbose=0)

minibatch_size = 32

# Step counts are derived from the document counts (integer division).
steps_per_epoch = training_data_count // minibatch_size
validation_steps = validation_data_count // minibatch_size

model_2.fit_generator(generator=data_generator(batch_size=minibatch_size, data_type='training'),
                    steps_per_epoch=steps_per_epoch,
                    validation_data=data_generator(batch_size=minibatch_size, data_type='validation'),
                    validation_steps=validation_steps,
                    epochs=10,
                    callbacks=[checkpointer])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb05587bdd8>

## Model 3

In [7]:
# Model 3: the small 64 -> 32 network with BatchNormalization inserted between
# each Dense layer and its LeakyReLU(alpha=0.2) / Dropout(0.2) pair.
# Alternative initializer kept for reference:
#   keras.initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=None)
model_3 = Sequential([
    Dense(
        64,
        input_shape=(105,),
        activation=None,
        kernel_initializer=initializers.TruncatedNormal(mean=0.0, stddev=0.01, seed=None),
        bias_initializer=initializers.TruncatedNormal(mean=0.0, stddev=0.01, seed=None),
    ),
    BatchNormalization(),
    LeakyReLU(alpha=0.2),
    Dropout(0.2),

    Dense(32, activation=None),
    BatchNormalization(),
    LeakyReLU(alpha=0.2),
    Dropout(0.2),

    Dense(1, activation=None),
])

# Regression set-up: mean squared error loss, MAE reported as a metric.
model_3.compile(
    loss='mean_squared_error',
    optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
    metrics=[metrics.mae],
)

model_3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                6784      
_________________________________________________________________
batch_normalization_1 (Batch (None, 64)                256       
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 64)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
batch_normalization_2 (Batch (None, 32)                128       
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 32)                0         
__________

In [8]:
# Train model_3.

# Save the model under results/ through a ModelCheckpoint callback. The
# directory is created up front so that a missing folder cannot abort the run
# when the first checkpoint is written.
save_model_name = "keiba_model_3.h5"
os.makedirs('results', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='results/'+save_model_name, verbose=0)

minibatch_size = 64

# Step counts are derived from the document counts (integer division).
steps_per_epoch = training_data_count // minibatch_size
validation_steps = validation_data_count // minibatch_size

model_3.fit_generator(generator=data_generator(batch_size=minibatch_size, data_type='training'),
                    steps_per_epoch=steps_per_epoch,
                    validation_data=data_generator(batch_size=minibatch_size, data_type='validation'),
                    validation_steps=validation_steps,
                    epochs=10,
                    callbacks=[checkpointer])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f74bd03e4a8>

## Model 4

In [9]:
# Model 4: three hidden Dense layers (64 -> 256 -> 128) with LeakyReLU(alpha=0.2)
# only. BatchNormalization and Dropout are disabled in this variant.
model_4 = Sequential([
    Dense(
        64,
        input_shape=(105,),
        activation=None,
        kernel_initializer=initializers.TruncatedNormal(mean=0.0, stddev=0.01, seed=None),
        bias_initializer=initializers.TruncatedNormal(mean=0.0, stddev=0.01, seed=None),
    ),
    LeakyReLU(alpha=0.2),

    Dense(256, activation=None),
    LeakyReLU(alpha=0.2),

    Dense(128, activation=None),
    LeakyReLU(alpha=0.2),

    Dense(1, activation=None),
])

# Optimizer: SGD with Nesterov momentum. Other configurations kept for reference:
#   SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
#   Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model_4.compile(
    loss='mean_squared_error',
    optimizer=SGD(lr=0.001, decay=1e-6, momentum=0.5, nesterov=True),
    metrics=[metrics.mae],
)

model_4.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 64)                6784      
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 256)               16640     
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU)    (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               32896     
_________________________________________________________________
leaky_re_lu_6 (LeakyReLU)    (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 129       
Total para

In [10]:
# Train model_4.

# Save the model under results/ through a ModelCheckpoint callback. The
# directory is created up front so that a missing folder cannot abort the run
# when the first checkpoint is written.
save_model_name = "keiba_model_4.h5"
os.makedirs('results', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='results/'+save_model_name, verbose=0)

minibatch_size = 64

# Step counts are derived from the document counts (integer division).
steps_per_epoch = training_data_count // minibatch_size
validation_steps = validation_data_count // minibatch_size

model_4.fit_generator(generator=data_generator(batch_size=minibatch_size, data_type='training'),
                    steps_per_epoch=steps_per_epoch,
                    validation_data=data_generator(batch_size=minibatch_size, data_type='validation'),
                    validation_steps=validation_steps,
                    epochs=10,
                    callbacks=[checkpointer])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fbd00ae8128>

## Model 5: binary target_Y (0 or 1)

In [23]:
# Model 5: same layer stack as model 4 (64 -> 256 -> 128, LeakyReLU(alpha=0.2),
# no BatchNormalization / Dropout) but trained on a binary target_Y (0 or 1),
# so the linear output unit is followed by a sigmoid activation.
model_5 = Sequential([
    Dense(
        64,
        input_shape=(105,),
        activation=None,
        kernel_initializer=initializers.TruncatedNormal(mean=0.0, stddev=0.01, seed=None),
        bias_initializer=initializers.TruncatedNormal(mean=0.0, stddev=0.01, seed=None),
    ),
    LeakyReLU(alpha=0.2),

    Dense(256, activation=None),
    LeakyReLU(alpha=0.2),

    Dense(128, activation=None),
    LeakyReLU(alpha=0.2),

    Dense(1, activation=None),
    Activation('sigmoid'),
])

# Optimizer notes from the experiments:
#   - lr=0.001 was tried as well; neither learning rate improved the loss
#     (original note: "lr=0.001 > lr=0.0001").
#   - other configurations kept for reference:
#       SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
#       SGD(lr=0.001, decay=1e-6, momentum=0.5, nesterov=True)
model_5.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
    metrics=[metrics.binary_accuracy, metrics.mae],
)

model_5.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_37 (Dense)             (None, 64)                6784      
_________________________________________________________________
leaky_re_lu_28 (LeakyReLU)   (None, 64)                0         
_________________________________________________________________
dense_38 (Dense)             (None, 256)               16640     
_________________________________________________________________
leaky_re_lu_29 (LeakyReLU)   (None, 256)               0         
_________________________________________________________________
dense_39 (Dense)             (None, 128)               32896     
_________________________________________________________________
leaky_re_lu_30 (LeakyReLU)   (None, 128)               0         
_________________________________________________________________
dense_40 (Dense)             (None, 1)                 129       
__________

In [24]:
# Train model_5 (binary target).

# Save the model under results/ through a ModelCheckpoint callback. The
# directory is created up front so that a missing folder cannot abort the run
# when the first checkpoint is written.
save_model_name = "keiba_model_5.h5"
os.makedirs('results', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='results/'+save_model_name, verbose=0)

minibatch_size = 64

# Step counts are derived from the document counts (integer division).
steps_per_epoch = training_data_count // minibatch_size
validation_steps = validation_data_count // minibatch_size

model_5.fit_generator(generator=data_generator_binary(batch_size=minibatch_size, data_type='training'),
                    steps_per_epoch=steps_per_epoch,
                    validation_data=data_generator_binary(batch_size=minibatch_size, data_type='validation'),
                    validation_steps=validation_steps,
                    epochs=10,
                    callbacks=[checkpointer])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fbcfede38d0>

## Model 6

In [8]:
# Model 6: binary target_Y (0 or 1) again, with BatchNormalization inserted
# after every hidden Dense layer (64 -> 256 -> 128) and before its
# LeakyReLU(alpha=0.2). Dropout stays disabled.
model_6 = Sequential([
    Dense(
        64,
        input_shape=(105,),
        activation=None,
        kernel_initializer=initializers.TruncatedNormal(mean=0.0, stddev=0.01, seed=None),
        bias_initializer=initializers.TruncatedNormal(mean=0.0, stddev=0.01, seed=None),
    ),
    BatchNormalization(),
    LeakyReLU(alpha=0.2),

    Dense(256, activation=None),
    BatchNormalization(),
    LeakyReLU(alpha=0.2),

    Dense(128, activation=None),
    BatchNormalization(),
    LeakyReLU(alpha=0.2),

    Dense(1, activation=None),
    Activation('sigmoid'),
])

# Optimizer: Adam with lr=0.001. Other configurations kept for reference:
#   SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
#   SGD(lr=0.001, decay=1e-6, momentum=0.5, nesterov=True)
model_6.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
    metrics=[metrics.binary_accuracy, metrics.mae],
)

model_6.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                6784      
_________________________________________________________________
batch_normalization_1 (Batch (None, 64)                256       
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               16640     
_________________________________________________________________
batch_normalization_2 (Batch (None, 256)               1024      
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               32896     
__________

In [9]:
# Train model_6 (binary target).

# Save the model under results/ through a ModelCheckpoint callback. The
# directory is created up front so that a missing folder cannot abort the run
# when the first checkpoint is written.
save_model_name = "keiba_model_6.h5"
os.makedirs('results', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='results/'+save_model_name, verbose=0)

minibatch_size = 64

# Step counts are derived from the document counts (integer division).
steps_per_epoch = training_data_count // minibatch_size
validation_steps = validation_data_count // minibatch_size

model_6.fit_generator(generator=data_generator_binary(batch_size=minibatch_size, data_type='training'),
                    steps_per_epoch=steps_per_epoch,
                    validation_data=data_generator_binary(batch_size=minibatch_size, data_type='validation'),
                    validation_steps=validation_steps,
                    epochs=10,
                    callbacks=[checkpointer])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3299ae0390>