In [179]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import random

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)
os.environ['PYTHONHASHSEED'] = str(42)

from keras import backend as K
config = tf.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1)
tf.set_random_seed(42)
sess = tf.Session(config=config)
K.set_session(sess)

import keras
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler



DATA_PATH = 'data/'

In [180]:
league_data = pd.read_csv('data/league_data.csv')
league_data.drop(['Unnamed: 0', 'Date', 'HomeTeam', 'AwayTeam', 
                 'HTGS', 'ATGS', 'HTGC', 'ATGC', 'HM1', 'HM2', 'HM3', 
                 'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5', 'HTFormPts',
                 'ATFormPts', 'MW', 'HTFormPtsStr', 'ATFormPtsStr'], 1, inplace=True)
league_data.columns

Index(['season', 'FTR', 'HTP', 'ATP', 'B365H', 'B365D', 'B365A', 'gameId',
       'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP'],
      dtype='object')

In [181]:
EMA_data = pd.read_csv('data/EMA_data.csv')
EMA_data.drop(['Unnamed: 0', 'f_DateHome', 'f_seasonHome', 'HomeTeam',
               'homeGame_x', 'f_yellowsAgainstAway', 'f_yellowsForAway',
               'f_halfTimeGoalsAgainstHome', 'f_halfTimeGoalsForHome','f_halfTimeGoalsAgainstAway', 'f_halfTimeGoalsForAway',
               'f_yellowsAgainstHome', 'f_yellowsForHome', 'f_DateAway', 'f_seasonAway', 
               'AwayTeam', 'homeGame_y'], 1, inplace=True)
EMA_data.columns

Index(['gameId', 'f_cornersAgainstHome', 'f_cornersForHome',
       'f_freesAgainstHome', 'f_freesForHome', 'f_goalsAgainstHome',
       'f_goalsForHome', 'f_redsAgainstHome', 'f_redsForHome',
       'f_shotsAgainstHome', 'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway'],
      dtype='object')

In [182]:
df = pd.merge(EMA_data, league_data, left_on='gameId', right_index=True)

In [183]:
df.head()

Unnamed: 0,gameId,gameId_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_redsAgainstHome,f_redsForHome,...,ATP,B365H,B365D,B365A,gameId_y,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,18,18,10.94,6.55,9.57,19.0,1.49,1.51,0.0,0.0,...,0.0,2.1,3.25,3.4,19,0.5,-0.5,0.0,0.0,0
1,20,20,9.06,6.0,9.41,14.96,1.49,1.49,0.51,0.51,...,0.0,1.83,3.3,4.33,21,0.666667,-1.0,0.666667,0.666667,0
2,21,21,11.49,3.51,12.08,15.49,1.02,0.51,0.0,1.0,...,2.0,2.5,3.25,2.7,22,0.0,0.333333,-1.666667,-1.666667,0
3,22,22,5.47,5.57,11.45,15.02,2.02,0.51,0.0,0.0,...,0.333333,1.44,4.0,7.5,23,0.666667,-1.666667,-0.333333,-0.333333,0
4,23,23,4.47,3.53,14.12,10.45,0.98,1.0,0.0,0.0,...,1.333333,2.2,3.25,3.2,24,1.333333,-1.0,-0.333333,-0.333333,0


# Prepare and split the data

Now that we have loaded our data into a dataframe we will clean the data for our Keras model. Because the model only takes numeric input we will change our labels from strings to integers and use categorical cross-entropy as our loss function. We will also scale our data using sklearn StandardScaler.

First we will seperate the labels from the rest of our data.

In [184]:
training_data = df.loc[df['season'] != 1920].reset_index(drop=True)
testing_data = df.loc[df['season'] == 1920].reset_index(drop=True)

X = training_data.drop(['gameId', 'gameId_x', 'gameId_y', 'FTR', 
           'season', 'gameId_y', ], 1)
Y = training_data['FTR']

X_test = testing_data.drop(['gameId', 'gameId_x', 'gameId_y', 'FTR', 
           'season', 'gameId_y', ], 1)
y_test = testing_data['FTR']

# Use 'season' columns to create training batches
seasons = training_data['season']

In [185]:
testing_data.tail()

Unnamed: 0,gameId,gameId_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_redsAgainstHome,f_redsForHome,...,ATP,B365H,B365D,B365A,gameId_y,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
123,4685,4685,5.978293,4.809925,11.702112,11.004062,1.420448,1.265252,0.118111,0.019778,...,0.538462,1.36,5.1,8.0,4686,-0.538462,-0.923077,0.538462,0.461538,0
124,4686,4686,4.565659,6.002476,10.557303,11.607718,1.295098,1.288115,0.098173,0.10845,...,1.153846,2.3,3.4,3.1,4687,-1.153846,-0.076923,-0.538462,0.0,0
125,4687,4687,5.55167,4.676105,8.990351,11.080792,1.731395,1.044866,0.0409,0.111778,...,2.0,1.44,5.0,6.5,4688,1.384615,1.0,-0.076923,-0.461538,0
126,4688,4688,2.394241,8.241461,8.321039,9.099733,0.796715,2.561447,0.030523,0.051923,...,1.230769,3.5,3.4,2.1,4689,0.307692,0.0,0.076923,0.153846,0
127,4689,4689,6.506138,6.033564,8.08464,10.255848,0.718892,1.103651,0.0,0.077736,...,1.153846,3.5,3.4,2.1,4690,-0.076923,-0.461538,-0.307692,-0.307692,0


In [186]:
def transform_results(results):
    transformed = []
    for i in range(len(results)):
        if results[i] == 'H':
            transformed.append(0)
        elif results[i] == 'A':
            transformed.append(2)
        else:
            transformed.append(1)
    return np.array(transformed)
            
Y = transform_results(Y)
y_test = transform_results(y_test)

In [187]:
scaler = StandardScaler()

X = scaler.fit_transform(X)
X_test = scaler.fit_transform(X_test)

In [188]:
print('Number of matches in training data:', X.shape[0])
print('Number of matches in test data:', X_test.shape[0])
print('Number of features:', X.shape[1])

Number of matches in training data: 4503
Number of matches in test data: 128
Number of features: 34


In [189]:
# Split our data. We are using the most recent season for validation
X_train = X[:-380]
y_train = Y[:-380]
X_val = X[-380:]
y_val = Y[-380:]

# Build the model

Now we have cleaned the data we can now create our model and train it. 

In [190]:

input_dim = X_train.shape[1]

activation_func = 'relu'
kernel_init = 'glorot_normal'
dropout_rate = 0.4
learning_rate = 0.001
batch_size = 16

model = keras.Sequential([
    Dense(50, input_shape=(input_dim,), activation=activation_func),
    Dropout(0.3),
    Dense(16),
    Dropout(0.2),
    Dense(3, activation='softmax')
])

In [191]:
es = EarlyStopping(monitor='loss', patience=3, verbose=1)

In [192]:
from keras.optimizers import Adam, SGD

opt = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
# opt = SGD(momentum=0.0, learning_rate=0.01, nesterov=False)

In [193]:
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, 
              callbacks=[es], epochs=500, verbose=0)

Epoch 00035: early stopping


<keras.callbacks.callbacks.History at 0x7f31e2c1b9b0>

In [194]:
_, train_acc = model.evaluate(X_train, y_train)
_, val_acc = model.evaluate(X_val, y_val)
print('Training accuracy:', train_acc)
print('Validation accuracy:', val_acc)

Training accuracy: 0.6439485549926758
Validation accuracy: 0.6105263233184814


We are getting around 63% on training and 64% on validation. This is good as is above the bookies accuracy. Because our validation data is so small we will also look at our training and validation across all the seasons to see if it is consistant. 

In [211]:
# Saving best model
# model.save('25Nov19.h5')

In [196]:
# idx = 0
# train_results = []
# val_results = []

# for season_len in seasons.value_counts():
#     X_train = np.concatenate((X[:idx], X[idx+season_len:]))
#     y_train = np.concatenate((Y[:idx], Y[idx+season_len:]))
#     X_val = X[idx:idx+season_len]
#     y_val = Y[idx:idx+season_len]
    
#     model = dnn_model
#     model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
#     model.fit(X_train, y_train, batch_size=batch_size, 
#                     callbacks=[es], epochs=500, verbose=0)
    
#     _, train_acc = model.evaluate(X_train, y_train)
#     _, val_acc = model.evaluate(X_val, y_val)
#     print('Training accuracy:', train_acc)
#     print('Validation accuracy:', val_acc)
    
#     train_results.append(train_acc)
#     val_results.append(val_acc)
#     idx +=season_len



In [197]:
# print(f'Average training accuracy :{sum(train_results)/len(train_results)}')
# print(f'Min accuracy :{min(train_results)}')m
# print(f'Max accuracy :{max(train_results)}')
# print(f'Standard Deviation :{np.std(train_results)}')

In [198]:
# print(f'Average validation accuracy :{sum(val_results)/len(val_results)}')
# print(f'Min accuracy :{min(val_results)}')
# print(f'Max accuracy :{max(val_results)}')
# print(f'Standard Deviation :{np.std(val_results)}')

Let's now check our accuracy on the test set and see how accurate this is. We can then look at creating a betting strategy and see if the model would be profitable. 

In [199]:
_, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Test accuracy: 0.6328125


In [200]:
y_preds = model.predict(X_test)

In [201]:
# Save predictions 
np.save('y_preds.npy', y_preds)
# Save X_test to check predictions data
np.save('X_test.npy', X_test)

In [202]:
X_test.shape

(128, 34)

# Create a betting strategy

Now that we have an accurate model let's see if we can make it as profitable as possible.

Our confusion matrix looks good. I mainly wanted to check the performance for draws as these are difficult to predict, the results are ok.

We will now use the for-loop below to see how much we would have won had we bet. 

In [203]:
funds = 100
wager = 10
favourites = 0
no_bets = 0
min_diff = 0.01


for i in range(len(X_test)):
    prediction = np.argmax(y_preds[i])
    print('\nPrediction', prediction)
    print('Actual', y_test[i])
    print('Favourite', np.argmin([testing_data['B365H'][i], testing_data['B365D'][i], 
                                  testing_data['B365A'][i]]))
    print('Prediction proba', y_preds[i])
    print('Home, Draw and Away odds', testing_data['B365H'][i],       
          testing_data['B365D'][i], testing_data['B365A'][i])
    
        

    if prediction == 0:
        odds_diff = y_preds[i][prediction] - (1/testing_data['B365H'][i])
        # If odds_diff positive place bet
        if odds_diff > min_diff:
            if prediction == np.argmin([testing_data['B365H'][i], testing_data['B365D'][i], 
                                  testing_data['B365A'][i]]):
                favourites +=1
                
            if  prediction == y_test[i]:
                funds += (wager * testing_data['B365H'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
    elif prediction == 1:
        odds_diff = y_preds[i][prediction] - (1/testing_data['B365D'][i])
        if odds_diff > min_diff:
            if prediction == np.argmin([testing_data['B365H'][i], testing_data['B365D'][i], 
                                  testing_data['B365A'][i]]):
                favourites +=1
            if  prediction == y_test[i]:
                funds +=( wager * testing_data['B365D'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
    else:
        odds_diff = y_preds[i][prediction] - (1/testing_data['B365A'][i])
        if odds_diff >  min_diff:
            if prediction == np.argmin([testing_data['B365H'][i], testing_data['B365D'][i], 
                                  testing_data['B365A'][i]]):
                favourites +=1
            if  prediction == y_test[i]:
                funds += (wager * testing_data['B365A'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
   
    print('Funds', funds)
    
print(f'Betted on favourite {favourites} times out of {len(X_test)} matches.')
print(f'No bet placed {no_bets} times')
         


Prediction 0
Actual 0
Favourite 0
Prediction proba [0.71094084 0.24186964 0.04718944]
Home, Draw and Away odds 1.14 10.0 19.0
Funds 100

Prediction 2
Actual 2
Favourite 2
Prediction proba [0.21802792 0.2043864  0.57758576]
Home, Draw and Away odds 12.0 6.5 1.22
Funds 100

Prediction 2
Actual 1
Favourite 0
Prediction proba [0.3453845  0.2382031  0.41641232]
Home, Draw and Away odds 1.95 3.6 3.6
Funds 90

Prediction 2
Actual 1
Favourite 2
Prediction proba [0.2856585  0.34162766 0.37271386]
Home, Draw and Away odds 3.0 3.25 2.37
Funds 90

Prediction 2
Actual 2
Favourite 0
Prediction proba [0.2407786  0.2396996  0.51952183]
Home, Draw and Away odds 1.9 3.4 4.0
Funds 120.0

Prediction 0
Actual 0
Favourite 0
Prediction proba [0.44471833 0.25987858 0.29540303]
Home, Draw and Away odds 1.3 5.25 10.0
Funds 120.0

Prediction 2
Actual 1
Favourite 0
Prediction proba [0.22450626 0.31742042 0.45807338]
Home, Draw and Away odds 2.2 3.2 3.4
Funds 110.0

Prediction 2
Actual 2
Favourite 2
Prediction pr

In [204]:
74 / 91

0.8131868131868132

In [205]:
y_preds_argmax = [np.argmax(i) for i in y_preds]

In [206]:
y_preds_argmax

[0,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 2,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 0,
 2,
 2,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 2,
 2,
 2,
 1,
 2,
 0,
 0,
 0]

In [207]:
118 - 78

40

In [208]:
17 / 40

0.425

In [209]:
679 / 40

16.975

In [210]:
len(X_test)

128