In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import random

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)
os.environ['PYTHONHASHSEED'] = str(42)

from keras import backend as K
config = tf.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1)
tf.set_random_seed(42)
sess = tf.Session(config=config)
K.set_session(sess)

import keras
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler



DATA_PATH = 'data/'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
league_data = pd.read_csv('data/league_data.csv')
league_data.drop(['Unnamed: 0', 'Date', 'HomeTeam', 'AwayTeam', 
                 'HTGS', 'ATGS', 'HTGC', 'ATGC', 'HM1', 'HM2', 'HM3',
                 'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5', 'HTFormPts',
                 'ATFormPts', 'MW', 'HTFormPtsStr', 'ATFormPtsStr'], 1, inplace=True)
league_data.columns

Index(['season', 'FTR', 'HTP', 'ATP', 'B365H', 'B365D', 'B365A', 'gameId',
       'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP'],
      dtype='object')

In [3]:
EMA_data = pd.read_csv('data/EMA_data.csv')
EMA_data.drop(['Unnamed: 0', 'f_DateHome', 'f_seasonHome', 'HomeTeam',
               'homeGame_x', 'f_yellowsAgainstAway', 'f_yellowsForAway',
               'f_halfTimeGoalsAgainstHome', 'f_halfTimeGoalsForHome','f_halfTimeGoalsAgainstAway', 'f_halfTimeGoalsForAway',
               'f_yellowsAgainstHome', 'f_yellowsForHome', 'f_DateAway', 'f_seasonAway', 
               'AwayTeam', 'homeGame_y'], 1, inplace=True)
EMA_data.columns

Index(['gameId', 'f_cornersAgainstHome', 'f_cornersForHome',
       'f_freesAgainstHome', 'f_freesForHome', 'f_goalsAgainstHome',
       'f_goalsForHome', 'f_redsAgainstHome', 'f_redsForHome',
       'f_shotsAgainstHome', 'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway'],
      dtype='object')

In [4]:
df = pd.merge(EMA_data, league_data, left_on='gameId', right_index=True)

In [5]:
# df['season'].apply(str)

In [6]:
df.head()

Unnamed: 0,gameId,gameId_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_redsAgainstHome,f_redsForHome,...,ATP,B365H,B365D,B365A,gameId_y,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,22,22,8.53,3.0,12.51,9.49,1.0,1.02,0.0,0.0,...,0.333333,1.44,4.0,7.5,23,0.666667,-1.666667,-0.333333,-0.333333,-13
1,23,23,3.62605,3.359819,13.893348,12.291828,0.320091,1.680443,0.0,0.0,...,1.333333,2.2,3.25,3.2,24,1.333333,-1.0,-0.333333,-0.333333,-1
2,24,24,6.49,8.06,14.92,12.02,2.47,1.0,0.0,0.0,...,1.333333,3.1,3.2,2.25,25,0.0,0.333333,0.0,0.0,6
3,25,25,5.96,1.49,7.02,11.47,1.53,0.98,0.0,0.0,...,2.0,2.62,3.1,3.1,26,0.666667,0.333333,-0.666667,-0.666667,1
4,26,26,5.45,10.55,14.53,13.51,1.51,0.0,0.0,0.0,...,0.666667,4.5,3.25,1.83,27,0.666667,-0.333333,1.333333,1.333333,13


# Prepare and split the data

Now that we have loaded our data into a dataframe we will clean the data for our Keras model. Because the model only takes numeric input we will change our labels from strings to integers and use categorical cross-entropy as our loss function. We will also scale our data using sklearn StandardScaler.

First we will seperate the labels from the rest of our data.

In [7]:
training_data = df.loc[df['season'] != 1920].reset_index(drop=True)
testing_data = df.loc[df['season'] == 1920].reset_index(drop=True)

X = training_data.drop(['gameId', 'gameId_x', 'gameId_y', 'FTR', 
           'season', 'gameId_y', ], 1)
Y = training_data['FTR']

X_test = testing_data.drop(['gameId', 'gameId_x', 'gameId_y', 'FTR', 
           'season', 'gameId_y', ], 1)
y_test = testing_data['FTR']

# Use 'season' columns to create training batches
seasons = training_data['season']

In [8]:
def transform_results(results):
    transformed = []
    for i in range(len(results)):
        if results[i] == 'H':
            transformed.append(0)
        elif results[i] == 'A':
            transformed.append(2)
        else:
            transformed.append(1)
    return np.array(transformed)
            
Y = transform_results(Y)
y_test = transform_results(y_test)

In [9]:
scaler = StandardScaler()

X = scaler.fit_transform(X)
X_test = scaler.fit_transform(X_test)

In [10]:
print('Number of matches in training data:', X.shape[0])
print('Number of matches in test data:', X_test.shape[0])
print('Number of features:', X.shape[1])

Number of matches in training data: 4504
Number of matches in test data: 118
Number of features: 34


In [11]:
# Split our data. We are using the most recent season for validation
X_train = X[:-380]
y_train = Y[:-380]
X_val = X[-380:]
y_val = Y[-380:]

# Build the model

Now we have cleaned the data we can now create our model and train it. 

In [12]:
input_dim = X_train.shape[1]

activation_func = 'relu'
kernel_init = 'glorot_normal'
dropout_rate = 0.4
learning_rate = 0.002
batch_size = 16

model = keras.Sequential([
    Dense(50, input_shape=(input_dim,), activation=activation_func),
    Dropout(0.3),
    Dense(16),
    Dropout(0.2),
    Dense(3, activation='softmax')
])

In [13]:
es = EarlyStopping(monitor='loss', patience=3, verbose=1)

In [14]:
from keras.optimizers import Adam, SGD

opt = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.995, epsilon=1e-08, decay=0.0)
# opt = SGD(momentum=0.0, learning_rate=0.01, nesterov=False)

In [15]:
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, 
              callbacks=[es], epochs=500, verbose=0)


Epoch 00021: early stopping


<keras.callbacks.callbacks.History at 0x7f56a6b27400>

In [16]:
_, train_acc = model.evaluate(X_train, y_train)
_, val_acc = model.evaluate(X_val, y_val)
print('Training accuracy:', train_acc)
print('Validation accuracy:', val_acc)

Training accuracy: 0.6355479955673218
Validation accuracy: 0.6526315808296204


We are getting around 63% on training and 64% on validation. This is good as is above the bookies accuracy. Because our validation data is so small we will also look at our training and validation across all the seasons to see if it is consistant. 

In [17]:
# Saving best model
# model.save('17Nov19.h5')

In [18]:
# idx = 0
# train_results = []
# val_results = []

# for season_len in seasons.value_counts():
#     X_train = np.concatenate((X[:idx], X[idx+season_len:]))
#     y_train = np.concatenate((Y[:idx], Y[idx+season_len:]))
#     X_val = X[idx:idx+season_len]
#     y_val = Y[idx:idx+season_len]
    
#     model = dnn_model
#     model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
#     model.fit(X_train, y_train, batch_size=batch_size, 
#                     callbacks=[es], epochs=500, verbose=0)
    
#     _, train_acc = model.evaluate(X_train, y_train)
#     _, val_acc = model.evaluate(X_val, y_val)
#     print('Training accuracy:', train_acc)
#     print('Validation accuracy:', val_acc)
    
#     train_results.append(train_acc)
#     val_results.append(val_acc)
#     idx +=season_len



In [19]:
# print(f'Average training accuracy :{sum(train_results)/len(train_results)}')
# print(f'Min accuracy :{min(train_results)}')
# print(f'Max accuracy :{max(train_results)}')
# print(f'Standard Deviation :{np.std(train_results)}')

In [20]:
# print(f'Average validation accuracy :{sum(val_results)/len(val_results)}')
# print(f'Min accuracy :{min(val_results)}')
# print(f'Max accuracy :{max(val_results)}')
# print(f'Standard Deviation :{np.std(val_results)}')

Let's now check our accuracy on the test set and see how accurate this is. We can then look at creating a betting strategy and see if the model would be profitable. 

In [21]:
_, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Test accuracy: 0.6525423526763916


In [22]:
y_preds = model.predict(X_test)

In [23]:
# Save predictions 
np.save('y_preds.npy', y_preds)

# Create a betting strategy

Now that we have an accurate model let's see if we can make it as profitable as possible.

In [35]:
funds = 100
wager = 10
favourites = 0
no_bets = 0
min_diff = 0.03

for i in range(len(X_test)):
    prediction = np.argmax(y_preds[i])
    print('\nPrediction', prediction)
    print('Actual', y_test[i])
    print('Favourite', np.argmin([testing_data['B365H'][i], testing_data['B365D'][i], 
                                  testing_data['B365A'][i]]))
    print('Prediction proba', y_preds[i])
    print('Home, Draw and Away odds', testing_data['B365H'][i],       
          testing_data['B365D'][i], testing_data['B365A'][i])
    
        

    if prediction == 0:
        odds_diff = y_preds[i][prediction] - (1/testing_data['B365H'][i])
        # If odds_diff positive place bet
        if odds_diff > min_diff:
            if  prediction == y_test[i]:
                funds += (wager * testing_data['B365H'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
    elif prediction == 1:     
        odds_diff = y_preds[i][prediction] - (1/testing_data['B365D'][i])
        if odds_diff > min_diff:
            if  prediction == y_test[i]:
                funds +=( wager * testing_data['B365D'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
    else:
        odds_diff = y_preds[i][prediction] - (1/testing_data['B365A'][i])
        if odds_diff >  min_diff:
            if  prediction == y_test[i]:
                funds += (wager * testing_data['B365A'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1

        
    if prediction == np.argmin([testing_data['B365H'][i], testing_data['B365D'][i], 
                                  testing_data['B365A'][i]]):
        favourites +=1
    
    print('Funds', funds)
    
print(f'Betted on favourite {favourites} times out of {len(X_test)} matches.')
print(f'No bet placed {no_bets} times')
         


Prediction 0
Actual 0
Favourite 0
Prediction proba [0.5430389  0.25684983 0.20011123]
Home, Draw and Away odds 1.14 10.0 19.0
Funds 100

Prediction 2
Actual 2
Favourite 2
Prediction proba [0.08087105 0.22293802 0.69619095]
Home, Draw and Away odds 12.0 6.5 1.22
Funds 100

Prediction 0
Actual 1
Favourite 0
Prediction proba [0.37600842 0.30000865 0.32398286]
Home, Draw and Away odds 1.95 3.6 3.6
Funds 100

Prediction 0
Actual 0
Favourite 0
Prediction proba [0.57025194 0.30299962 0.12674843]
Home, Draw and Away odds 2.62 3.2 2.75
Funds 116.2

Prediction 2
Actual 1
Favourite 2
Prediction proba [0.3167887  0.26941282 0.4137985 ]
Home, Draw and Away odds 3.0 3.25 2.37
Funds 116.2

Prediction 0
Actual 2
Favourite 0
Prediction proba [0.37060124 0.29953223 0.3298665 ]
Home, Draw and Away odds 1.9 3.4 4.0
Funds 116.2

Prediction 0
Actual 0
Favourite 0
Prediction proba [0.35839924 0.32007053 0.3215302 ]
Home, Draw and Away odds 1.3 5.25 10.0
Funds 116.2

Prediction 0
Actual 1
Favourite 0
Predict