In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import random

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)
os.environ['PYTHONHASHSEED'] = str(42)

from keras import backend as K
config = tf.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1)
tf.set_random_seed(42)
sess = tf.Session(config=config)
K.set_session(sess)

import keras
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler



DATA_PATH = 'data/'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


# Prepare the data

We first need to load both our league-data and EMA data and combine these together. 

In [5]:
league_data = pd.read_csv('data/league_data.csv')
league_data.drop(['Unnamed: 0', 'Date', 'HomeTeam', 'AwayTeam', 
                 'HTGS', 'ATGS', 'HTGC', 'ATGC', 'HM1', 'HM2', 'HM3', 
                 'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5', 'HTFormPts',
                 'ATFormPts', 'MW', 'HTFormPtsStr', 'ATFormPtsStr'], 1, inplace=True)
league_data.columns

Index(['season', 'FTR', 'HTP', 'ATP', 'B365H', 'B365D', 'B365A', 'gameId',
       'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP'],
      dtype='object')

In [6]:
EMA_data = pd.read_csv('data/EMA_data.csv')
EMA_data.drop(['Unnamed: 0', 'f_DateHome', 'f_seasonHome', 'HomeTeam',
               'homeGame_x', 'f_yellowsAgainstAway', 'f_yellowsForAway',
               'f_yellowsAgainstHome', 'f_yellowsForHome', 'f_DateAway', 'f_seasonAway', 
               'AwayTeam', 'homeGame_y'], 1, inplace=True)
EMA_data.columns

Index(['gameId', 'f_cornersAgainstHome', 'f_cornersForHome',
       'f_freesAgainstHome', 'f_freesForHome', 'f_goalsAgainstHome',
       'f_goalsForHome', 'f_halfTimeGoalsAgainstHome',
       'f_halfTimeGoalsForHome', 'f_redsAgainstHome', 'f_redsForHome',
       'f_shotsAgainstHome', 'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_halfTimeGoalsAgainstAway',
       'f_halfTimeGoalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway'],
      dtype='object')

In [7]:
df = pd.merge(EMA_data, league_data, left_on='gameId', right_index=True)
df.head()

Unnamed: 0,gameId,gameId_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_halfTimeGoalsAgainstHome,f_halfTimeGoalsForHome,...,ATP,B365H,B365D,B365A,gameId_y,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,18,18,10.94,6.55,9.57,19.0,1.49,1.51,0.51,1.51,...,0.0,2.1,3.25,3.4,19,0.5,-0.5,0.0,0.0,0
1,20,20,9.06,6.0,9.41,14.96,1.49,1.49,1.0,0.49,...,0.0,1.83,3.3,4.33,21,0.666667,-1.0,0.666667,0.666667,0
2,21,21,11.49,3.51,12.08,15.49,1.02,0.51,0.0,0.51,...,2.0,2.5,3.25,2.7,22,0.0,0.333333,-1.666667,-1.666667,0
3,22,22,5.47,5.57,11.45,15.02,2.02,0.51,1.53,0.51,...,0.333333,1.44,4.0,7.5,23,0.666667,-1.666667,-0.333333,-0.333333,0
4,23,23,4.47,3.53,14.12,10.45,0.98,1.0,0.49,0.0,...,1.333333,2.2,3.25,3.2,24,1.333333,-1.0,-0.333333,-0.333333,0


# Cleaning and splitting the data

Because the machine -learning model only takes numeric input we will change our labels from strings to integers and use categorical cross-entropy as our loss function. We will also scale our data using sklearn StandardScaler.

First we will seperate the labels from the rest of our data and split into training and testing.

In [8]:
training_data = df.loc[df['season'] != 1920].reset_index(drop=True)
testing_data = df.loc[df['season'] == 1920].reset_index(drop=True)

X = training_data.drop(['gameId', 'gameId_x', 'gameId_y', 'FTR', 
           'season', 'gameId_y', ], 1)
Y = training_data['FTR']

X_test = testing_data.drop(['gameId', 'gameId_x', 'gameId_y', 'FTR', 
           'season', 'gameId_y', ], 1)
y_test = testing_data['FTR']

In [9]:
testing_data.tail()

Unnamed: 0,gameId,gameId_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_halfTimeGoalsAgainstHome,f_halfTimeGoalsForHome,...,ATP,B365H,B365D,B365A,gameId_y,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
123,4685,4685,5.978293,4.809925,11.702112,11.004062,1.420448,1.265252,0.532886,0.465308,...,0.538462,1.36,5.1,8.0,4686,-0.538462,-0.923077,0.538462,0.461538,0
124,4686,4686,4.565659,6.002476,10.557303,11.607718,1.295098,1.288115,0.446414,0.649212,...,1.153846,2.3,3.4,3.1,4687,-1.153846,-0.076923,-0.538462,0.0,0
125,4687,4687,5.55167,4.676105,8.990351,11.080792,1.731395,1.044866,0.835796,0.388996,...,2.0,1.44,5.0,6.5,4688,1.384615,1.0,-0.076923,-0.461538,0
126,4688,4688,2.394241,8.241461,8.321039,9.099733,0.796715,2.561447,0.430477,1.210825,...,1.230769,3.5,3.4,2.1,4689,0.307692,0.0,0.076923,0.153846,0
127,4689,4689,6.506138,6.033564,8.08464,10.255848,0.718892,1.103651,0.316085,0.468612,...,1.153846,3.5,3.4,2.1,4690,-0.076923,-0.461538,-0.307692,-0.307692,0


In [10]:
def transform_results(results):
    transformed = []
    for i in range(len(results)):
        if results[i] == 'H':
            transformed.append(0)
        elif results[i] == 'A':
            transformed.append(2)
        else:
            transformed.append(1)
    return np.array(transformed)
            
Y = transform_results(Y)
y_test = transform_results(y_test)

In [11]:
scaler = StandardScaler()

X = scaler.fit_transform(X)
X_test = scaler.fit_transform(X_test)

In [12]:
print('Number of matches in training data:', X.shape[0])
print('Number of matches in test data:', X_test.shape[0])
print('Number of features:', X.shape[1])

Number of matches in training data: 4503
Number of matches in test data: 128
Number of features: 38


In [13]:
# Split our data. We are using the most recent season for validation
X_train = X[:-380]
y_train = Y[:-380]
X_val = X[-380:]
y_val = Y[-380:]

# Build the model

Now we have cleaned the data we can now create our model and train it. 

In [14]:
# input dimension is number of features
input_dim = X_train.shape[1]

activation_func = 'relu'
kernel_init = 'glorot_normal'
learning_rate = 0.001
batch_size = 16

model = keras.Sequential([
    Dense(48, input_shape=(input_dim,), activation=activation_func),
    Dropout(0.3),
    Dense(16),
    Dropout(0.2),
    Dense(3, activation='softmax')
])

In [16]:
es = EarlyStopping(monitor='loss', patience=3, verbose=1)

In [17]:
from keras.optimizers import Adam, SGD

opt = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [18]:
model.fit(X_train, y_train, batch_size=batch_size, 
              callbacks=[es], epochs=500, verbose=0)


Epoch 00030: early stopping


<keras.callbacks.callbacks.History at 0x7ff12d48ad68>

In [23]:
train_loss, train_acc = model.evaluate(X_train, y_train)
val_loss, val_acc = model.evaluate(X_val, y_val)
print('Training loss:', train_loss)
print('Training accuracy:', train_acc)
print('Validation loss:', val_loss)
print('Validation accuracy:', val_acc)


Training loss: 0.7936528218680148
Training accuracy: 0.6480717658996582
Validation loss: 0.8073749272446883
Validation accuracy: 0.6184210777282715


We are getting around 65% on training and 62% on validation. This is good as is well above the bookies accuracy. Once we are happy with how the model is performing we can check the accuracy on the test set

In [24]:
# Saving best model
# model.save('25Nov19.h5')

In [25]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

Test loss: 0.7197954952716827
Test accuracy: 0.6640625


# Create a betting strategy

Now that we have an accurate model let's see if we can make it as profitable as possible.

Our confusion matrix looks good. I mainly wanted to check the performance for draws as these are difficult to predict, the results are ok.

We will now use the for-loop below to see how much we would have won had we bet. 

In [26]:
funds = 100
wager = 10
favourites = 0
no_bets = 0
min_diff = 0.03

y_preds = model.predict(X_test)

for i in range(len(X_test)):
    prediction = np.argmax(y_preds[i])
    print('\nPrediction', prediction)
    print('Actual', y_test[i])
    print('Favourite', np.argmin([testing_data['B365H'][i], testing_data['B365D'][i], 
                                  testing_data['B365A'][i]]))
    print('Prediction proba', y_preds[i])
    print('Home, Draw and Away odds', testing_data['B365H'][i],       
          testing_data['B365D'][i], testing_data['B365A'][i])
    
    if prediction == 0:
        odds_diff = y_preds[i][prediction] - (1/testing_data['B365H'][i])
        if odds_diff > min_diff:
            if prediction == np.argmin([testing_data['B365H'][i], testing_data['B365D'][i], 
                                  testing_data['B365A'][i]]):
                favourites +=1
                
            if  prediction == y_test[i]:
                funds += (wager * testing_data['B365H'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
            
    elif prediction == 1:
        odds_diff = y_preds[i][prediction] - (1/testing_data['B365D'][i])
        if odds_diff > min_diff:
            if prediction == np.argmin([testing_data['B365H'][i], testing_data['B365D'][i], 
                                  testing_data['B365A'][i]]):
                favourites +=1
            if  prediction == y_test[i]:
                funds +=( wager * testing_data['B365D'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
    else:
        odds_diff = y_preds[i][prediction] - (1/testing_data['B365A'][i])
        if odds_diff >  min_diff:
            if prediction == np.argmin([testing_data['B365H'][i], testing_data['B365D'][i], 
                                  testing_data['B365A'][i]]):
                favourites +=1
            if  prediction == y_test[i]:
                funds += (wager * testing_data['B365A'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
   
    print('Funds', funds)
    
print(f'Betted on favourite {favourites} times out of {len(X_test)} matches.')
print(f'No bet placed {no_bets} times')
         


Prediction 0
Actual 0
Favourite 0
Prediction proba [0.6406146  0.2976048  0.06178055]
Home, Draw and Away odds 1.14 10.0 19.0
Funds 100

Prediction 2
Actual 2
Favourite 2
Prediction proba [0.1498708  0.09711154 0.75301766]
Home, Draw and Away odds 12.0 6.5 1.22
Funds 100

Prediction 2
Actual 1
Favourite 0
Prediction proba [0.24385048 0.229821   0.5263285 ]
Home, Draw and Away odds 1.95 3.6 3.6
Funds 90

Prediction 2
Actual 1
Favourite 2
Prediction proba [0.23566064 0.30126706 0.46307233]
Home, Draw and Away odds 3.0 3.25 2.37
Funds 80

Prediction 2
Actual 2
Favourite 0
Prediction proba [0.26342666 0.29570556 0.44086778]
Home, Draw and Away odds 1.9 3.4 4.0
Funds 110.0

Prediction 0
Actual 0
Favourite 0
Prediction proba [0.36956793 0.30569366 0.32473844]
Home, Draw and Away odds 1.3 5.25 10.0
Funds 110.0

Prediction 2
Actual 1
Favourite 0
Prediction proba [0.21831408 0.30487978 0.4768061 ]
Home, Draw and Away odds 2.2 3.2 3.4
Funds 100.0

Prediction 2
Actual 2
Favourite 2
Prediction pr