In [115]:
import numpy as np
import pandas as pd
import calendar
pd.options.display.max_columns = 99
pd.options.display.max_rows = 2000

# Data-processing

In [116]:
# load dataset
E1415 = pd.read_csv('data/E1415.csv')
E1516 = pd.read_csv('data/E1516.csv')
E1617 = pd.read_csv('data/E1617.csv')
E1718 = pd.read_csv('data/E1718test.csv')

# concat dataset
frames = [E1415,E1516,E1617,E1718]
data = pd.concat(frames)[['Date','HomeTeam','AwayTeam','FTR','FTHG','FTAG','Referee','B365H','B365D','B365A']]

# remove nan
data.dropna(how='any',axis=0,inplace=True)

# get month cat
data['month'] = pd.DatetimeIndex(data['Date']).month.astype(int)
data['month']= data['month'].apply(lambda x: calendar.month_abbr[x])

In [117]:
data.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG,Referee,B365H,B365D,B365A,month
0,16/08/14,Arsenal,Crystal Palace,H,2.0,1.0,J Moss,1.25,6.5,15.0,Aug
1,16/08/14,Leicester,Everton,D,2.0,2.0,M Jones,3.2,3.4,2.4,Aug
2,16/08/14,Man United,Swansea,A,1.0,2.0,M Dean,1.36,5.0,11.0,Aug
3,16/08/14,QPR,Hull,A,0.0,1.0,C Pawson,2.5,3.3,3.1,Aug
4,16/08/14,Stoke,Aston Villa,A,0.0,1.0,A Taylor,1.95,3.5,4.5,Aug


In [118]:
def data_processing(data):
    df_with_dummies = pd.get_dummies(data, columns = ['HomeTeam','AwayTeam','FTR','Referee','month'])
    df_with_dummies.drop(['Date','FTHG','FTAG'], axis=1, inplace=True)
    return(df_with_dummies)

In [119]:
processed_data=data_processing(data)

In [120]:
processed_data.head()

Unnamed: 0,B365H,B365D,B365A,HomeTeam_Arsenal,HomeTeam_Aston Villa,HomeTeam_Bournemouth,HomeTeam_Brighton,HomeTeam_Burnley,HomeTeam_Chelsea,HomeTeam_Crystal Palace,HomeTeam_Everton,HomeTeam_Huddersfield,HomeTeam_Hull,HomeTeam_Leicester,HomeTeam_Liverpool,HomeTeam_Man City,HomeTeam_Man United,HomeTeam_Middlesbrough,HomeTeam_Newcastle,HomeTeam_QPR,HomeTeam_Southampton,HomeTeam_Stoke,HomeTeam_Sunderland,HomeTeam_Swansea,HomeTeam_Tottenham,HomeTeam_Watford,HomeTeam_West Brom,HomeTeam_West Ham,AwayTeam_Arsenal,AwayTeam_Aston Villa,AwayTeam_Bournemouth,AwayTeam_Brighton,AwayTeam_Burnley,AwayTeam_Chelsea,AwayTeam_Crystal Palace,AwayTeam_Everton,AwayTeam_Huddersfield,AwayTeam_Hull,AwayTeam_Leicester,AwayTeam_Liverpool,AwayTeam_Man City,AwayTeam_Man United,AwayTeam_Middlesbrough,AwayTeam_Newcastle,AwayTeam_QPR,AwayTeam_Southampton,AwayTeam_Stoke,AwayTeam_Sunderland,AwayTeam_Swansea,AwayTeam_Tottenham,AwayTeam_Watford,AwayTeam_West Brom,AwayTeam_West Ham,FTR_A,FTR_D,FTR_H,Referee_A Marriner,Referee_A Taylor,Referee_C Foy,Referee_C Kavanagh,Referee_C Pawson,Referee_G Scott,Referee_J Moss,Referee_K Friend,Referee_K Stroud,Referee_L Mason,Referee_L Probert,Referee_M Atkinson,Referee_M Clattenburg,Referee_M Dean,Referee_M Jones,Referee_M Oliver,Referee_N Swarbrick,Referee_P Dowd,Referee_P Tierney,Referee_R East,Referee_R Madley,Referee_S Attwell,Referee_l Mason,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
0,1.25,6.5,15.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,3.2,3.4,2.4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,1.36,5.0,11.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,2.5,3.3,3.1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,1.95,3.5,4.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [121]:
processed_data.shape

(1220, 91)

In [122]:
E1718.shape

(80, 65)

# Modelisation

## Data selection

In [123]:
data_model = processed_data.head(processed_data.shape[0]-E1718.shape[0])

In [124]:
from sklearn.model_selection import train_test_split
train_set, validation_set = train_test_split(data_model, test_size = 0.3)

y_train = train_set[['FTR_A','FTR_D','FTR_H']]
x_train  = train_set.drop(['FTR_A','FTR_D','FTR_H'],axis=1)
y_validation = validation_set[['FTR_A','FTR_D','FTR_H']]
x_validation  = validation_set.drop(['FTR_A','FTR_D','FTR_H'],axis=1)


## Neural Network (with Keras)

In [125]:
from keras.models import Sequential
from keras.layers import Dense,Dropout

In [126]:
model = Sequential()
model.add(Dense(200, input_dim=x_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(500, activation='relu'))
model.add(Dense(1000, activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(500, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(500, activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(200, activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(3,activation='softmax'))

In [127]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [128]:
model.fit(x_train.values,y_train.values, epochs=20, batch_size=10)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1272ac908>

In [129]:
scores = model.evaluate(x_validation.values,y_validation.values)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


acc: 60.23%


## Final test

In [130]:
data_end = processed_data.tail(80)

In [131]:
data_end.head()

Unnamed: 0,B365H,B365D,B365A,HomeTeam_Arsenal,HomeTeam_Aston Villa,HomeTeam_Bournemouth,HomeTeam_Brighton,HomeTeam_Burnley,HomeTeam_Chelsea,HomeTeam_Crystal Palace,HomeTeam_Everton,HomeTeam_Huddersfield,HomeTeam_Hull,HomeTeam_Leicester,HomeTeam_Liverpool,HomeTeam_Man City,HomeTeam_Man United,HomeTeam_Middlesbrough,HomeTeam_Newcastle,HomeTeam_QPR,HomeTeam_Southampton,HomeTeam_Stoke,HomeTeam_Sunderland,HomeTeam_Swansea,HomeTeam_Tottenham,HomeTeam_Watford,HomeTeam_West Brom,HomeTeam_West Ham,AwayTeam_Arsenal,AwayTeam_Aston Villa,AwayTeam_Bournemouth,AwayTeam_Brighton,AwayTeam_Burnley,AwayTeam_Chelsea,AwayTeam_Crystal Palace,AwayTeam_Everton,AwayTeam_Huddersfield,AwayTeam_Hull,AwayTeam_Leicester,AwayTeam_Liverpool,AwayTeam_Man City,AwayTeam_Man United,AwayTeam_Middlesbrough,AwayTeam_Newcastle,AwayTeam_QPR,AwayTeam_Southampton,AwayTeam_Stoke,AwayTeam_Sunderland,AwayTeam_Swansea,AwayTeam_Tottenham,AwayTeam_Watford,AwayTeam_West Brom,AwayTeam_West Ham,FTR_A,FTR_D,FTR_H,Referee_A Marriner,Referee_A Taylor,Referee_C Foy,Referee_C Kavanagh,Referee_C Pawson,Referee_G Scott,Referee_J Moss,Referee_K Friend,Referee_K Stroud,Referee_L Mason,Referee_L Probert,Referee_M Atkinson,Referee_M Clattenburg,Referee_M Dean,Referee_M Jones,Referee_M Oliver,Referee_N Swarbrick,Referee_P Dowd,Referee_P Tierney,Referee_R East,Referee_R Madley,Referee_S Attwell,Referee_l Mason,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
0,1.53,4.5,6.5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,11.0,5.5,1.33,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,1.25,6.5,15.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,1.83,3.6,5.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,1.7,3.8,5.75,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [132]:
y_test = data_end[['FTR_A','FTR_D','FTR_H']]
x_test  = data_end.drop(['FTR_A','FTR_D','FTR_H'],axis=1)

In [133]:
predictions = pd.DataFrame(model.predict(x_test.values))

In [134]:
scores = model.evaluate(x_test.values,y_test.values)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

acc: 47.50%


In [135]:
predictions.columns=['FTR_A','FTR_D','FTR_H']
predictions.round(2).head(10)

Unnamed: 0,FTR_A,FTR_D,FTR_H
0,0.09,0.21,0.7
1,0.91,0.08,0.01
2,0.0,0.02,0.98
3,0.29,0.33,0.38
4,0.12,0.25,0.63
5,0.01,0.07,0.92
6,0.81,0.16,0.03
7,0.0,0.03,0.97
8,0.15,0.27,0.58
9,0.96,0.04,0.0


In [136]:
y_test.head(10)

Unnamed: 0,FTR_A,FTR_D,FTR_H
0,0,0,1
1,1,0,0
2,1,0,0
3,1,0,0
4,0,0,1
5,0,1,0
6,0,1,0
7,0,0,1
8,0,0,1
9,1,0,0
