## Import dataset

In [1]:
import pandas as pd

data = pd.read_csv('../data/combined/combined_data.csv')
print(data.shape[0], 'matches')
data.head(5)

5681 matches


Unnamed: 0,Date,HomeTeam,AwayTeam,HTHG,HTAG,HTR,FTHG,FTAG,FTR,HS,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,14/08/10,Aston Villa,West Ham,2.0,0.0,H,3.0,0.0,H,23.0,...,11.0,2.0,16.0,7.0,15.0,15.0,1.0,2.0,0.0,0.0
1,14/08/10,Blackburn,Everton,1.0,0.0,H,1.0,0.0,H,7.0,...,2.0,12.0,1.0,3.0,19.0,14.0,2.0,1.0,0.0,0.0
2,14/08/10,Bolton,Fulham,0.0,0.0,D,0.0,0.0,D,13.0,...,9.0,7.0,4.0,8.0,12.0,13.0,1.0,3.0,0.0,0.0
3,14/08/10,Chelsea,West Brom,2.0,0.0,H,6.0,0.0,H,18.0,...,13.0,4.0,3.0,1.0,10.0,10.0,1.0,0.0,0.0,0.0
4,14/08/10,Sunderland,Birmingham,1.0,0.0,H,2.0,2.0,D,6.0,...,2.0,7.0,3.0,6.0,13.0,10.0,3.0,3.0,1.0,0.0


In [2]:
curr_df = pd.read_csv('../data/season-2023-2024.csv')

home_teams = curr_df['HomeTeam']

team_list = []
for t in home_teams:
    if t not in team_list:
        team_list.append(t)
        
print(len(team_list), 'Teams in Season')
i=1
for team in team_list:
    print(i, team)
    i+=1

20 Teams in Season
1 Burnley
2 Arsenal
3 Bournemouth
4 Brighton
5 Everton
6 Sheffield United
7 Newcastle
8 Brentford
9 Chelsea
10 Man United
11 Nott'm Forest
12 Fulham
13 Liverpool
14 Wolves
15 Tottenham
16 Man City
17 Aston Villa
18 West Ham
19 Crystal Palace
20 Luton


## Preprocess data

### Keep records of only the teams in the current season

In [3]:
filtered = data[(data['HomeTeam'].isin(team_list))]
data = filtered[(filtered['AwayTeam'].isin(team_list))]

print(data.shape, 'records')

(2823, 21) records


In [7]:
print(data.columns.tolist())


['Date', 'HomeTeam', 'AwayTeam', 'HTHG', 'HTAG', 'HTR', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']


### Prepare features and label

In [5]:
X = data.drop(['FTR'], axis=1)
Z = X.drop(['Date', 'HTR'], axis=1)
print(Z.shape[1], 'Features')
for c in Z.columns:
    print(c)
y = data['FTR']

18 Features
HomeTeam
AwayTeam
HTHG
HTAG
FTHG
FTAG
HS
AS
HST
AST
HC
AC
HF
AF
HY
AY
HR
AR


### Scale and standardise the feature data
* Center to the mean and component wise scale to unit variance

In [8]:
from sklearn.preprocessing import scale

cols = [['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 
        'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']]
for col in cols:
    X[col] = scale(X[col])
    Z[col] = scale(Z[col])
    
print(Z.shape)
Z.tail()

(2823, 18)


Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,FTHG,FTAG,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
5675,Bournemouth,Aston Villa,-0.825818,0.575229,-1.198603,-0.2196,-0.727307,-1.098723,-0.425979,-0.509782,1.296277,-1.712907,2.443795,0.32055,1.819647,0.835427,-0.240276,3.398838
5676,Newcastle,Chelsea,0.35041,-0.728039,0.317712,-1.05848,0.142983,-0.290148,0.237308,-0.509782,-1.213162,1.18653,-0.451347,0.60019,1.819647,0.080252,-0.240276,3.398838
5677,Man United,West Ham,-0.825818,0.575229,-1.198603,0.619279,1.013273,-0.492292,-0.094336,-0.121567,0.668917,0.099241,-2.188432,0.60019,-1.310678,-0.674923,-0.240276,-0.271751
5679,Tottenham,Crystal Palace,-0.825818,0.575229,-1.198603,0.619279,-1.075423,2.337723,-1.420909,2.207726,-0.585803,1.18653,0.417196,-0.238729,-0.528097,-0.674923,-0.240276,-0.271751
5680,Liverpool,Arsenal,1.526639,-0.728039,0.317712,0.619279,-0.031075,0.720572,-0.094336,1.04308,-0.899483,-1.350477,0.70671,-0.238729,-0.528097,-0.674923,-0.240276,3.398838


### Handle categorical values
* Input data needs to be continous variables that are integers
* Convert to dummy variables

In [39]:
def preprocess(Z):
    df = pd.DataFrame(index=Z.index)
    for col in Z.columns:
        data = Z[col]
        if data.dtype == object:
            dummies = pd.get_dummies(data, prefix=col).astype(int)
            df = df.join(dummies)
        elif data.dtype == bool:
            df[col] = data.astype(int)
        else:
            df[col] = data  # Numeric columns as-is
    return df


### Feature information

In [40]:
print('\nFeature values:')
Z.tail()


Feature values:


Unnamed: 0,HomeTeam_Arsenal,HomeTeam_Aston Villa,HomeTeam_Bournemouth,HomeTeam_Brentford,HomeTeam_Brighton,HomeTeam_Burnley,HomeTeam_Chelsea,HomeTeam_Crystal Palace,HomeTeam_Everton,HomeTeam_Fulham,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
5675,0,0,1,0,0,0,0,0,0,0,...,-0.425979,-0.509782,1.296277,-1.712907,2.443795,0.32055,1.819647,0.835427,-0.240276,3.398838
5676,0,0,0,0,0,0,0,0,0,0,...,0.237308,-0.509782,-1.213162,1.18653,-0.451347,0.60019,1.819647,0.080252,-0.240276,3.398838
5677,0,0,0,0,0,0,0,0,0,0,...,-0.094336,-0.121567,0.668917,0.099241,-2.188432,0.60019,-1.310678,-0.674923,-0.240276,-0.271751
5679,0,0,0,0,0,0,0,0,0,0,...,-1.420909,2.207726,-0.585803,1.18653,0.417196,-0.238729,-0.528097,-0.674923,-0.240276,-0.271751
5680,0,0,0,0,0,0,0,0,0,0,...,-0.094336,1.04308,-0.899483,-1.350477,0.70671,-0.238729,-0.528097,-0.674923,-0.240276,3.398838


### Split data into training and test sets

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(Z, y, test_size = 50,
                                                    random_state = 2,
                                                    stratify = y)
print('Training data:',len(X_train))
print('Test data:',len(X_test))

Training data: 2773
Test data: 50


## Create models
Classifiers:
* Logistic Regression
* Support Vector Classifier
* K-Nearest Neighbors

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

clf_lr = LogisticRegression(random_state=42)
clf_svc = SVC(kernel='poly', random_state=42)
clf_knn = KNeighborsClassifier(n_neighbors=10)

## Train and Evaluate the models

* Train the model
* Test based on the F1 score and Accuarcy
  * F1 score considers both the precision and the recall of the test to compute the score
  * The F1 score can be interpreted as a weighted average of the precision and recall
  * F1 score reaches its best value at 1 and worst at 0.
  * Accuracy is the ratio of correct predictions to the total predictions

In [15]:
from time import time
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import numpy as np

def train(clf, X_train, y_train):
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    print('Model trained in {:.4f} secs'.format(end-start))

def test(clf, features, labels):
    start = time()
    y_pred = clf.predict(features)
    end = time()
    print('Test predictions made in {:.4f} secs'.format(end-start))
    f1 = f1_score(labels, y_pred, average='macro')
    acc = sum(labels == y_pred) / float(len(y_pred))
    return f1, acc

def cross_validate(clf, X, y, cv=5):
    print(f"Running {cv}-Fold Cross Validation on {clf.__class__.__name__}...")
    scores = cross_val_score(clf, X, y, cv=cv, scoring='f1_macro')
    print(f"{cv}-Fold F1 scores: {scores}")
    print(f"Average F1 score: {np.mean(scores):.4f}\n")

def train_test(clf, X_train, y_train, X_test, y_test):
    print('Training {}...'.format(clf.__class__.__name__))
    train(clf, X_train, y_train)
    f1, acc = test(clf, X_train, y_train)
    print('For Training set: F1 score= {:.4f}, Accuracy= {:.4f}'.format(f1, acc))
    f1, acc = test(clf, X_test, y_test)
    print('For Test set: F1 score= {:.4f}, Accuracy= {:.4f}'.format(f1, acc))
    cross_validate(clf, X_train, y_train, cv=5)  # You can adjust cv to any number like 10

# Example classifiers (make sure these are defined beforehand)
# clf_lr = LogisticRegression()
# clf_svc = SVC()
# clf_knn = KNeighborsClassifier()

train_test(clf_lr, X_train, y_train, X_test, y_test)
print('------------------------------------------------')
train_test(clf_svc, X_train, y_train, X_test, y_test)
print('------------------------------------------------')
train_test(clf_knn, X_train, y_train, X_test, y_test)
print('------------------------------------------------')


Training LogisticRegression...
Model trained in 0.2302 secs
Test predictions made in 0.0090 secs
For Training set: F1 score= 1.0000, Accuracy= 1.0000
Test predictions made in 0.0071 secs
For Test set: F1 score= 1.0000, Accuracy= 1.0000
Running 5-Fold Cross Validation on LogisticRegression...
5-Fold F1 scores: [1. 1. 1. 1. 1.]
Average F1 score: 1.0000

------------------------------------------------
Training SVC...
Model trained in 0.9627 secs
Test predictions made in 0.4083 secs
For Training set: F1 score= 0.9595, Accuracy= 0.9668
Test predictions made in 0.0060 secs
For Test set: F1 score= 0.8609, Accuracy= 0.8800
Running 5-Fold Cross Validation on SVC...
5-Fold F1 scores: [0.86907893 0.86229786 0.86579368 0.87527546 0.86298614]
Average F1 score: 0.8671

------------------------------------------------
Training KNeighborsClassifier...
Model trained in 0.0144 secs


[WinError 2] The system cannot find the file specified
  File "C:\Users\dendu\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\dendu\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dendu\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\dendu\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Test predictions made in 0.9098 secs
For Training set: F1 score= 0.8154, Accuracy= 0.8449
Test predictions made in 0.0390 secs
For Test set: F1 score= 0.7101, Accuracy= 0.7400
Running 5-Fold Cross Validation on KNeighborsClassifier...
5-Fold F1 scores: [0.74531213 0.72097688 0.75363464 0.73241342 0.71432017]
Average F1 score: 0.7333

------------------------------------------------


## Use the best model for making predictions
* Set the model
* Train the model with training dataset
* Make predictions
* Predict the probability of results (Away team win, draw, Home team win)

In [16]:
model = SVC(kernel='poly',probability=True, random_state=42)
model.fit(X_train, y_train)

In [18]:
X_test

Unnamed: 0,HomeTeam_Arsenal,HomeTeam_Aston Villa,HomeTeam_Bournemouth,HomeTeam_Brentford,HomeTeam_Brighton,HomeTeam_Burnley,HomeTeam_Chelsea,HomeTeam_Crystal Palace,HomeTeam_Everton,HomeTeam_Fulham,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
3775,1,0,0,0,0,0,0,0,0,0,...,-1.089266,1.431295,-1.213162,2.998679,0.996224,-0.238729,1.037066,-0.674923,-0.240276,-0.271751
1731,0,0,0,0,0,1,0,0,0,0,...,-0.425979,-0.121567,-0.272123,0.824101,-0.161833,2.557665,-0.528097,0.835427,-0.240276,-0.271751
4634,0,0,0,0,0,0,0,0,0,0,...,-0.094336,-0.121567,-1.526842,0.461671,-0.740861,-1.357286,0.254484,0.080252,-0.240276,-0.271751
4533,0,1,0,0,0,0,0,0,0,0,...,-0.425979,0.654864,-1.213162,-0.625618,-0.161833,0.32055,-1.310678,-1.430098,-0.240276,-0.271751
3844,0,0,0,0,0,0,0,1,0,0,...,-1.420909,-0.509782,-1.213162,0.099241,0.70671,0.32055,1.037066,0.080252,-0.240276,3.398838
2139,0,0,0,0,0,0,0,0,1,0,...,0.568951,-0.509782,-0.899483,-0.625618,-0.740861,1.159468,-0.528097,0.080252,-0.240276,3.398838
1281,0,0,0,0,0,0,0,0,0,0,...,1.232238,-1.286213,0.668917,-0.263188,-0.451347,0.32055,-1.310678,0.835427,-0.240276,3.398838
4812,0,0,1,0,0,0,0,0,0,0,...,-1.089266,0.654864,-0.899483,1.18653,-1.030375,-0.518368,0.254484,-0.674923,-0.240276,-0.271751
5539,0,0,0,0,0,0,1,0,0,0,...,0.568951,-0.121567,-0.899483,0.461671,-0.740861,0.879829,1.819647,0.080252,-0.240276,-0.271751
2979,0,0,0,0,0,0,0,0,0,0,...,0.237308,-0.121567,0.668917,-0.263188,1.864767,-0.518368,3.38481,0.835427,-0.240276,-0.271751


In [17]:
pred = model.predict(X_test)
pred_df = pd.DataFrame(pred, columns=['Prediction'])

# Safer check for probability support
if hasattr(model, "predict_proba"):
    pred_prob = model.predict_proba(X_test)
    pred_prob_df = pd.DataFrame(pred_prob, columns=['Away Win %', 'Draw %', 'Home Win %'])
    pred_prob_df = pred_prob_df.round(6) * 100
    prediction_df = pd.concat([pred_df, pred_prob_df], axis=1)
else:
    prediction_df = pred_df

prediction_df.head()


Unnamed: 0,Prediction,Away Win %,Draw %,Home Win %
0,H,0.0,0.0,100.0
1,D,49.4591,50.4756,0.0653
2,H,0.0003,0.0004,99.9993
3,A,92.8084,7.1088,0.0828
4,D,10.8407,87.7911,1.3681


## Incorporate the result probabilities into the fixture

In [20]:
fixtures = pd.read_csv('../data/fixture/epl-2024-GMT.csv')

print(fixtures.shape[0], 'matches')
fixtures.head()

380 matches


Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result
0,1,1,16/08/2024 20:00,Old Trafford,Man Utd,Fulham,1 - 0
1,2,1,17/08/2024 12:30,Portman Road,Ipswich,Liverpool,0 - 2
2,3,1,17/08/2024 15:00,Emirates Stadium,Arsenal,Wolves,2 - 0
3,4,1,17/08/2024 15:00,Goodison Park,Everton,Brighton,0 - 3
4,5,1,17/08/2024 15:00,St. James' Park,Newcastle,Southampton,1 - 0


In [21]:
fixtures = fixtures.drop(['Match Number','Round Number', 'Date', 'Location', 'Result'], axis=1)

fixtures.columns = ['HomeTeam', 'AwayTeam']
fixtures['HTHG'] = 0
fixtures['HTAG'] = 0
fixtures['FTHG'] = 0
fixtures['FTAG'] = 0
fixtures['HS'] = 0
fixtures['AS'] = 0
fixtures['HST'] = 0
fixtures['AST'] = 0
fixtures['HC'] = 0
fixtures['AC'] = 0
fixtures['HF'] = 0
fixtures['AF'] = 0
fixtures['HY'] = 0
fixtures['AY'] = 0
fixtures['HR'] = 0
fixtures['AR'] = 0

print(fixtures.shape, 'features')
fixtures.head()

(380, 18) features


Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,FTHG,FTAG,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,Man Utd,Fulham,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Ipswich,Liverpool,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Arsenal,Wolves,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Everton,Brighton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Newcastle,Southampton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [41]:
pp_fixtures = preprocess(fixtures)

In [42]:
pp_fixtures

Unnamed: 0,HomeTeam_Arsenal,HomeTeam_Aston Villa,HomeTeam_Bournemouth,HomeTeam_Brentford,HomeTeam_Brighton,HomeTeam_Chelsea,HomeTeam_Crystal Palace,HomeTeam_Everton,HomeTeam_Fulham,HomeTeam_Ipswich,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
376,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
377,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
378,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
pp_fixtures = pp_fixtures.reindex(columns=X_train.columns, fill_value=0)

In [44]:
model.predict(pp_fixtures)

array(['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D

In [45]:
fixtures['Prediction'] = model.predict(pp_fixtures)

In [53]:
Result = fixtures.drop(['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS',
                        'HST', 'AST', 'HF', 'AF', 'HC', 'AC',
                        'HY', 'AY', 'HR', 'AR'], axis=1)


In [54]:
Result.head()

Unnamed: 0,HomeTeam,AwayTeam,Prediction
0,Man Utd,Fulham,D
1,Ipswich,Liverpool,D
2,Arsenal,Wolves,D
3,Everton,Brighton,D
4,Newcastle,Southampton,D


In [48]:
if(model.probability):
    fixture_pred_prob = model.predict_proba(pp_fixtures) *100
    fixture_pred_prob = pd.DataFrame(fixture_pred_prob, columns=['Away win %','Draw %','Home win %'])
    display(fixture_pred_prob)

Unnamed: 0,Away win %,Draw %,Home win %
0,3.366449,68.000587,28.632965
1,3.356874,68.502790,28.140335
2,3.357459,68.274468,28.368073
3,3.366936,67.985293,28.647771
4,3.365883,68.185425,28.448692
...,...,...,...
375,3.384394,67.710773,28.904833
376,3.394986,68.363396,28.241618
377,3.386727,68.253966,28.359308
378,3.360221,68.268822,28.370957


In [55]:
final = pd.concat([Result, fixture_pred_prob], axis = 1)

In [56]:
final

Unnamed: 0,HomeTeam,AwayTeam,Prediction,Away win %,Draw %,Home win %
0,Man Utd,Fulham,D,3.366449,68.000587,28.632965
1,Ipswich,Liverpool,D,3.356874,68.502790,28.140335
2,Arsenal,Wolves,D,3.357459,68.274468,28.368073
3,Everton,Brighton,D,3.366936,67.985293,28.647771
4,Newcastle,Southampton,D,3.365883,68.185425,28.448692
...,...,...,...,...,...,...
375,Newcastle,Everton,D,3.384394,67.710773,28.904833
376,Nott'm Forest,Chelsea,D,3.394986,68.363396,28.241618
377,Southampton,Arsenal,D,3.386727,68.253966,28.359308
378,Spurs,Brighton,D,3.360221,68.268822,28.370957


In [57]:
readFixtures = pd.read_csv('../data/fixtures/epl-2024-GMT.csv')
exportToFixtures = final.drop(['HomeTeam','AwayTeam'],1)

PredictedResultWithFixtureData = pd.concat([readFixtures,exportToFixtures], axis = 1)
PredictedResultWithFixtureData.to_csv('../data/predictions/predictions.csv')

display(PredictedResultWithFixtureData.tail(25))

FileNotFoundError: [Errno 2] No such file or directory: '../data/fixtures/epl-2024-GMT.csv'