# import Libraries
pandas: for data reading and preprocessing<br>
tensorflow: for neural network construction<br>
sklearn.preprocessing: for data encoding<br>
sklearn.model_selection: it has convenient method for training/test data spliting<br>
matplotlib.pyplot: to plot performance of the training process.<br>

In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation
from keras.layers.core import Dropout
from keras.layers.core import Dense
from keras.layers import Flatten
from keras.layers import Input
from keras.models import Model
from keras.optimizers import SGD
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

# Read Dataset

In [12]:
dataset = pd.read_csv('feature selection1.csv', index_col=False)

In [13]:
print(dataset.head())

   venuename  racedistance  horseid1  row1  trainer1  driver1  handicap1  \
0         28          1755    784858    13      3154      895          0   
1         51          2147    777950     3      1180      902          0   
2         28          2160    803659    13      2913      726          0   
3         51          1758    768857     7      2944      650          0   
4         28          2160    776812     1       290      954          0   

   age1  horseid2  row2  ...  place10  place11  place12  place13  place14  \
0     4    776424     6  ...       10        0        0        0        0   
1     5    761352    13  ...       10        0        0        0        0   
2     4    786617     1  ...       10        0        0        0        0   
3     6    220555     1  ...        0        0        0        0        0   
4     5    782182     6  ...       10        0        0        0        0   

   place15  place16  place17  place18  place19  
0        0        0        0   

# Prepare training/test data

Select right columns for X, y<br>
<ul>
    <li> select all the data except last 19 columns, because last 19 columns are is about 'place'</li>
    <li> Select last 19 columns for y</li>
</ul>
Split data into train/test sets
<ul>
    <li>80% for training</li>
    <li>20% for testing(validation)</li>
</ul>

In [14]:
X = dataset[dataset.columns[:-19]]
y = dataset[dataset.columns[-19:]]

In [16]:
print(X.shape)
print(y.shape)

# split data into train and test sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

(66804, 116)
(66804, 19)


In [25]:
print(y_train)

       place1  place2  place3  place4  place5  place6  place7  place8  place9  \
45506       1       2       3       4       5       6       7       8       9   
51526       1       2       3       4       5       6       7       8       9   
28679       1       2       3       4       5       6       0       0       0   
19589       1       2       3       4       5       6       7       8       9   
57          1       2       3       4       5       6       0       0       0   
...       ...     ...     ...     ...     ...     ...     ...     ...     ...   
49100       1       2       3       4       5       6       7       8       9   
20609       1       2       3       4       5       6       7       8       0   
21440       1       2       3       4       5       6       7       8       9   
50057       1       2       3       4       5       6       7       8       0   
5192        1       2       3       4       5       6       7       8       9   

       place10  place11  pl

# Build the model
Use keras to build the model with easy-to-use api Sequential<br>
Have to mention that input layer has 116 inputs. The calculation is following:
<ul>
    <li>2 features from race data - venuename, racedistance</li>
    <li>19 horses has 6 features - horseid,row, trainer, driver, handicap, age</li>
</ul>
Output layer has 14 nodes

In [57]:
# create model
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(256, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(128, kernel_initializer='normal', activation='relu'))
# model.add(Dropout(0.1))
# model.add(Dense(60, kernel_initializer='normal', activation='relu'))
# model.add(Dropout(0.1))
# model.add(Dense(30, kernel_initializer='normal', activation='relu'))
# model.add(Dense(60, kernel_initializer='normal', activation='relu'))
# model.add(Dense(30, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(64, kernel_initializer='normal', activation='relu'))
model.add(Dense(19))
# initialize the optimizer
opt = Adam(lr=0.0001)
# opt = SGD(lr=0.01, nesterov=True, momentum=0.9)
# Compile model
model.compile(loss='mse', metrics=['mae'], optimizer=opt)

In [58]:
print(model.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_21 (Dense)             (None, 128)               14976     
_________________________________________________________________
dropout_11 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 256)               33024     
_________________________________________________________________
dropout_12 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_13 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 64)               

# Train the model


In [59]:
from datetime import datetime
cur_date_time = datetime.strftime(datetime.now(), '%Y%m%d-%H-%M')
model_weight_name = 'model-128-256-128-64-19(all relu)-' + cur_date_time + '-' + 'epoch={epoch:02d}-val_loss={val_loss:.6f}.hdf5'
model_structure_name = 'model-128-256-128-64-19(all relu)-' + cur_date_time + '.json'
print(model_weight_name)

model-128-256-128-19(all relu)-20210112-07-56-epoch={epoch:02d}-val_loss={val_loss:.6f}.hdf5


In [60]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(model_weight_name, monitor='val_loss', mode='min', save_best_only=True, verbose=1)

In [62]:
#set early stopping monitor so the model stops training when it won't improve anymore
early_stopping_monitor = EarlyStopping(patience=5)
#train model
H = model.fit(X_train, y_train, validation_split=0.2, epochs=10000, callbacks=[checkpoint], batch_size=8)

Epoch 1/10000

Epoch 00001: val_loss improved from inf to 13.73851, saving model to model-128-256-128-19(all relu)-20210112-07-56-epoch=01-val_loss=13.738507.hdf5
Epoch 2/10000

Epoch 00002: val_loss improved from 13.73851 to 12.07465, saving model to model-128-256-128-19(all relu)-20210112-07-56-epoch=02-val_loss=12.074646.hdf5
Epoch 3/10000

Epoch 00003: val_loss improved from 12.07465 to 10.56869, saving model to model-128-256-128-19(all relu)-20210112-07-56-epoch=03-val_loss=10.568688.hdf5
Epoch 4/10000

Epoch 00004: val_loss improved from 10.56869 to 9.22481, saving model to model-128-256-128-19(all relu)-20210112-07-56-epoch=04-val_loss=9.224807.hdf5
Epoch 5/10000

Epoch 00005: val_loss improved from 9.22481 to 8.06118, saving model to model-128-256-128-19(all relu)-20210112-07-56-epoch=05-val_loss=8.061178.hdf5
Epoch 6/10000

Epoch 00006: val_loss improved from 8.06118 to 7.20316, saving model to model-128-256-128-19(all relu)-20210112-07-56-epoch=06-val_loss=7.203156.hdf5
Epoch


Epoch 00032: val_loss did not improve from 4.73126
Epoch 33/10000

Epoch 00033: val_loss did not improve from 4.73126
Epoch 34/10000

Epoch 00034: val_loss did not improve from 4.73126
Epoch 35/10000

Epoch 00035: val_loss did not improve from 4.73126
Epoch 36/10000

Epoch 00036: val_loss did not improve from 4.73126
Epoch 37/10000

Epoch 00037: val_loss did not improve from 4.73126
Epoch 38/10000

Epoch 00038: val_loss did not improve from 4.73126
Epoch 39/10000

Epoch 00039: val_loss did not improve from 4.73126
Epoch 40/10000

Epoch 00040: val_loss did not improve from 4.73126
Epoch 41/10000

Epoch 00041: val_loss did not improve from 4.73126
Epoch 42/10000

Epoch 00042: val_loss did not improve from 4.73126
Epoch 43/10000

Epoch 00043: val_loss did not improve from 4.73126
Epoch 44/10000

Epoch 00044: val_loss did not improve from 4.73126
Epoch 45/10000

Epoch 00045: val_loss did not improve from 4.73126
Epoch 46/10000

Epoch 00046: val_loss did not improve from 4.73126
Epoch 47/1

KeyboardInterrupt: 

In [63]:
type(model)

tensorflow.python.keras.engine.sequential.Sequential

In [64]:
model.load_weights('model-128-256-128-19(all relu)-20210112-07-56-epoch=23-val_loss=4.731262.hdf5')

In [65]:
#example on how to use our newly trained model on how to make predictions on unseen data (we will pretend our new data is saved in a dataframe called 'test_X').
test_y_predictions = model.predict(X_test)

In [69]:
print(test_y_predictions[-1])

[ 1.0374131e+00  2.0340316e+00  3.0188518e+00  4.0087247e+00
  4.9829860e+00  5.8615375e+00  6.4179277e+00  6.4231176e+00
  5.6362963e+00  4.2649417e+00  1.8814851e+00  1.1203094e+00
  4.1048028e-02  2.1488179e-02  8.9699412e-03  6.5667960e-03
  4.4540535e-03  3.1743569e-03 -2.8695455e-03]


In [54]:
print(y_test)

       place1  place2  place3  place4  place5  place6  place7  place8  place9  \
20603       1       2       3       4       5       6       7       8       9   
52177       1       2       3       4       5       6       7       8       9   
40573       1       2       3       4       5       6       7       8       9   
21921       1       2       3       4       5       6       7       8       0   
51071       1       2       3       4       5       6       7       8       9   
...       ...     ...     ...     ...     ...     ...     ...     ...     ...   
53953       1       2       3       4       5       6       7       8       9   
34890       1       2       3       4       5       6       7       8       0   
44814       1       2       3       4       5       6       7       8       9   
45994       1       2       3       4       5       6       7       0       0   
6764        1       2       3       4       5       6       7       0       0   

       place10  place11  pl

In [70]:
type(X_test)

pandas.core.frame.DataFrame