## Import dataset

In [1]:
import pandas as pd

data = pd.read_csv('../data/combined/combined_data.csv')
print(data.shape[0], 'matches')
data.head(5)

5681 matches


Unnamed: 0,Date,HomeTeam,AwayTeam,HTHG,HTAG,HTR,FTHG,FTAG,FTR,HS,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,14/08/10,Aston Villa,West Ham,2.0,0.0,H,3.0,0.0,H,23.0,...,11.0,2.0,16.0,7.0,15.0,15.0,1.0,2.0,0.0,0.0
1,14/08/10,Blackburn,Everton,1.0,0.0,H,1.0,0.0,H,7.0,...,2.0,12.0,1.0,3.0,19.0,14.0,2.0,1.0,0.0,0.0
2,14/08/10,Bolton,Fulham,0.0,0.0,D,0.0,0.0,D,13.0,...,9.0,7.0,4.0,8.0,12.0,13.0,1.0,3.0,0.0,0.0
3,14/08/10,Chelsea,West Brom,2.0,0.0,H,6.0,0.0,H,18.0,...,13.0,4.0,3.0,1.0,10.0,10.0,1.0,0.0,0.0,0.0
4,14/08/10,Sunderland,Birmingham,1.0,0.0,H,2.0,2.0,D,6.0,...,2.0,7.0,3.0,6.0,13.0,10.0,3.0,3.0,1.0,0.0


In [2]:
curr_df = pd.read_csv('../data/season-2023-2024.csv')

home_teams = curr_df['HomeTeam']

team_list = []
for t in home_teams:
    if t not in team_list:
        team_list.append(t)
        
print(len(team_list), 'Teams in Season')
i=1
for team in team_list:
    print(i, team)
    i+=1

20 Teams in Season
1 Burnley
2 Arsenal
3 Bournemouth
4 Brighton
5 Everton
6 Sheffield United
7 Newcastle
8 Brentford
9 Chelsea
10 Man United
11 Nott'm Forest
12 Fulham
13 Liverpool
14 Wolves
15 Tottenham
16 Man City
17 Aston Villa
18 West Ham
19 Crystal Palace
20 Luton


## Preprocess data

### Keep records of only the teams in the current season

In [3]:
filtered = data[(data['HomeTeam'].isin(team_list))]
data = filtered[(filtered['AwayTeam'].isin(team_list))]

print(data.shape, 'records')

(2823, 21) records


In [4]:
print(data.columns.tolist())


['Date', 'HomeTeam', 'AwayTeam', 'HTHG', 'HTAG', 'HTR', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']


### Prepare features and label

In [5]:
X = data.drop(['FTR'], axis=1)
Z = X.drop(['Date', 'HTR'], axis=1)
print(Z.shape[1], 'Features')
for c in Z.columns:
    print(c)
y = data['FTR']

18 Features
HomeTeam
AwayTeam
HTHG
HTAG
FTHG
FTAG
HS
AS
HST
AST
HC
AC
HF
AF
HY
AY
HR
AR


### Scale and standardise the feature data
* Center to the mean and component wise scale to unit variance

In [6]:
from sklearn.preprocessing import scale

cols = [['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 
        'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']]
for col in cols:
    X[col] = scale(X[col])
    Z[col] = scale(Z[col])
    
print(Z.shape)
Z.tail()

(2823, 18)


Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,FTHG,FTAG,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
5670,West Ham,Man United,-0.825818,-0.728039,-1.198603,-1.05848,-0.901365,0.922715,-1.089266,0.654864,-1.526842,0.461671,0.417196,0.32055,-0.528097,-0.674923,-0.240276,-0.271751
5671,Burnley,Bournemouth,0.35041,-0.728039,-0.440446,0.619279,-0.379191,0.922715,-0.425979,0.266649,0.355237,1.18653,0.996224,-0.518368,-1.310678,-1.430098,-0.240276,-0.271751
5674,Liverpool,Brighton,1.526639,-0.728039,1.834026,-1.05848,1.361389,-1.907299,1.895525,-1.286213,0.355237,-0.625618,-2.188432,-1.357286,-1.310678,-1.430098,-0.240276,-0.271751
5676,Newcastle,Chelsea,0.35041,-0.728039,1.075869,-1.05848,0.317041,-1.098723,0.237308,-0.897998,-0.585803,-0.988048,0.127682,-0.238729,-1.310678,-0.674923,-0.240276,-0.271751
5680,West Ham,Everton,0.35041,-0.728039,1.075869,-0.2196,0.142983,0.518428,-0.425979,1.04308,0.041557,0.461671,-0.161833,0.60019,-1.310678,-0.674923,-0.240276,-0.271751


### Handle categorical values
* Input data needs to be continous variables that are integers
* Convert to dummy variables

In [7]:
def preprocess(Z):
    df = pd.DataFrame(index=Z.index)
    for col, data in Z.items():  # use .items() instead of .iteritems()
        if data.dtype == object:
            data = pd.get_dummies(data, prefix=col)
        df = df.join(data)
    return df

Z = preprocess(Z)

### Feature information

In [18]:
print('\nFeature values:')
Z.tail()


Feature values:


Unnamed: 0,HomeTeam_Arsenal,HomeTeam_Aston Villa,HomeTeam_Bournemouth,HomeTeam_Brentford,HomeTeam_Brighton,HomeTeam_Burnley,HomeTeam_Chelsea,HomeTeam_Crystal Palace,HomeTeam_Everton,HomeTeam_Fulham,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
5670,False,False,False,False,False,False,False,False,False,False,...,-1.089266,0.654864,-1.526842,0.461671,0.417196,0.32055,-0.528097,-0.674923,-0.240276,-0.271751
5671,False,False,False,False,False,True,False,False,False,False,...,-0.425979,0.266649,0.355237,1.18653,0.996224,-0.518368,-1.310678,-1.430098,-0.240276,-0.271751
5674,False,False,False,False,False,False,False,False,False,False,...,1.895525,-1.286213,0.355237,-0.625618,-2.188432,-1.357286,-1.310678,-1.430098,-0.240276,-0.271751
5676,False,False,False,False,False,False,False,False,False,False,...,0.237308,-0.897998,-0.585803,-0.988048,0.127682,-0.238729,-1.310678,-0.674923,-0.240276,-0.271751
5680,False,False,False,False,False,False,False,False,False,False,...,-0.425979,1.04308,0.041557,0.461671,-0.161833,0.60019,-1.310678,-0.674923,-0.240276,-0.271751


### Split data into training and test sets

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(Z, y, test_size = 50,
                                                    random_state = 2,
                                                    stratify = y)
print('Training data:',len(X_train))
print('Test data:',len(X_test))

Training data: 2773
Test data: 50


## Create models
Classifiers:
* Logistic Regression
* Support Vector Classifier
* K-Nearest Neighbors

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

clf_lr = LogisticRegression(random_state=42)
clf_svc = SVC(kernel='poly', random_state=42)
clf_knn = KNeighborsClassifier(n_neighbors=10)

## Train and Evaluate the models

* Train the model
* Test based on the F1 score and Accuarcy
  * F1 score considers both the precision and the recall of the test to compute the score
  * The F1 score can be interpreted as a weighted average of the precision and recall
  * F1 score reaches its best value at 1 and worst at 0.
  * Accuracy is the ratio of correct predictions to the total predictions

In [11]:
from time import time
from sklearn.metrics import f1_score

def train(clf, X_train, y_train):
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    print('Model trained in {:.4f} secs'.format(end-start))

def test(clf, features, labels):
    start = time()
    y_pred = clf.predict(features)
    end = time()
    print('Test predictions made in {:.4f} secs'.format(end-start))
    f1 = f1_score(labels, y_pred, average='macro')
    acc = sum(labels==y_pred)/float(len(y_pred))
    return f1, acc

def train_test(clf, X_train, y_train, X_test, y_test):
    print('Training {}...'.format(clf.__class__.__name__))
    train(clf, X_train, y_train)
    f1, acc = test(clf, X_train, y_train)
    print('For Training set: F1 score= {:.4f}, Accuracy= {:.4f}'.format(f1, acc))
    f1, acc = test(clf, X_test, y_test)
    print('For Test set: F1 score= {:.4f}, Accuracy= {:.4f}'.format(f1, acc))
    
train_test(clf_lr, X_train, y_train, X_test, y_test)
print('------------------------------------------------')
train_test(clf_svc, X_train, y_train, X_test, y_test)
print('------------------------------------------------')
train_test(clf_knn, X_train, y_train, X_test, y_test)
print('------------------------------------------------')

Training LogisticRegression...
Model trained in 0.0229 secs
Test predictions made in 0.0020 secs
For Training set: F1 score= 1.0000, Accuracy= 1.0000
Test predictions made in 0.0009 secs
For Test set: F1 score= 1.0000, Accuracy= 1.0000
------------------------------------------------
Training SVC...
Model trained in 0.1427 secs
Test predictions made in 0.0668 secs
For Training set: F1 score= 0.9583, Accuracy= 0.9657
Test predictions made in 0.0018 secs
For Test set: F1 score= 0.9101, Accuracy= 0.9200
------------------------------------------------
Training KNeighborsClassifier...
Model trained in 0.0029 secs
Test predictions made in 0.0795 secs
For Training set: F1 score= 0.8116, Accuracy= 0.8417
Test predictions made in 0.0035 secs
For Test set: F1 score= 0.7834, Accuracy= 0.8200
------------------------------------------------


## Use the best model for making predictions
* Set the model
* Train the model with training dataset
* Make predictions
* Predict the probability of results (Away team win, draw, Home team win)

In [12]:
model = SVC(kernel='poly', random_state=42)
model.fit(X_train, y_train)

In [13]:
pred = model.predict(X_test)
pred_df = pd.DataFrame(pred, columns=['Prediction'])

if(model.probability):
    pred_prob = model.predict_proba(X_test) 
    pred_prob_df = pd.DataFrame(pred_prob, columns=['Away Win %', 'Draw %', 'Home Win %']) 
    pred_prob_df = pred_prob_df.round(6)*100
    prediction_df = pd.concat([pred_df, pred_prob_df], axis=1) 
else:
    prediction_df = pred_df
    
prediction_df.head()

Unnamed: 0,Prediction
0,H
1,A
2,H
3,A
4,D


## Incorporate the result probabilities into the fixture

In [14]:
fixtures = pd.read_csv('../data/fixture/epl-2023-GMT.csv')

print(fixtures.shape[0], 'matches')
fixtures.head()

380 matches


Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result
0,1,1,11/08/2023 20:00,Turf Moor,Burnley,Man City,0 - 3
1,2,1,12/08/2023 13:00,Emirates Stadium,Arsenal,Nottingham Forest,2 - 1
2,3,1,12/08/2023 15:00,Vitality Stadium,Bournemouth,West Ham,1 - 1
3,4,1,12/08/2023 15:00,Amex Stadium,Brighton,Luton,4 - 1
4,5,1,12/08/2023 15:00,Goodison Park,Everton,Fulham,0 - 1


In [15]:
fixtures = fixtures.drop(['Match Number','Round Number', 'Date', 'Location', 'Result'], axis=1)

fixtures.columns = ['HomeTeam', 'AwayTeam']
fixtures['HTHG'] = 0
fixtures['HTAG'] = 0
fixtures['FTHG'] = 0
fixtures['FTAG'] = 0
fixtures['HS'] = 0
fixtures['AS'] = 0
fixtures['HST'] = 0
fixtures['AST'] = 0
fixtures['HC'] = 0
fixtures['AC'] = 0
fixtures['HF'] = 0
fixtures['AF'] = 0
fixtures['HY'] = 0
fixtures['AY'] = 0
fixtures['HR'] = 0
fixtures['AR'] = 0

print(fixtures.shape, 'features')
fixtures.head()

(380, 18) features


Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,FTHG,FTAG,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,Burnley,Man City,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Arsenal,Nottingham Forest,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Bournemouth,West Ham,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Brighton,Luton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Everton,Fulham,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
pp_fixtures = preprocess(fixtures)

In [17]:
model.predict(pp_fixtures)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- AwayTeam_Man Utd
- AwayTeam_Nottingham Forest
- AwayTeam_Sheffield Utd
- AwayTeam_Spurs
- HomeTeam_Man Utd
- ...
Feature names seen at fit time, yet now missing:
- AwayTeam_Man United
- AwayTeam_Nott'm Forest
- AwayTeam_Sheffield United
- AwayTeam_Tottenham
- HomeTeam_Man United
- ...


In [None]:
fixtures['Prediction'] = model.predict(pp_fixtures)

In [None]:
Result = fixtures.drop(['FTHG','FTAG','HTHG','HTAG','HS','AS',
                        'HST','AST','HF','AF','HC','AC',
                        'HY','AY','HR','AR'],1)


Result.tail(10)

In [None]:
if(model.probability):
    fixture_pred_prob = model.predict_proba(pp_fixtures) *100
    fixture_pred_prob = pd.DataFrame(fixture_pred_prob, columns=['Away win %','Draw %','Home win %'])
    display(fixture_pred_prob)