# Football match prediction

Part of the code used in this notebook is adapted from this [kaggle kernel](https://www.kaggle.com/airback/match-outcome-prediction-in-football).

In [1]:
import os,sys
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score,
                             recall_score, mean_squared_error, mean_absolute_error, r2_score, classification_report)
from sklearn.calibration import CalibratedClassifierCV
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from time import time
from sklearn.decomposition import PCA, FastICA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from libs.loaders import load_football
from libs.football import get_fifa_data, create_feables
import pkg_resources

# Record the interpreter and boosting-library versions used for this run.
print("System version: {}".format(sys.version))
for display_name, dist_name in (("XGBoost", "xgboost"), ("LightGBM", "lightgbm")):
    dist_version = pkg_resources.get_distribution(dist_name).version
    print("{} version: {}".format(display_name, dist_version))

System version: 3.6.0 |Anaconda 4.3.1 (64-bit)| (default, Dec 23 2016, 12:22:00) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]




In [2]:
%%time
countries, matches, leagues, teams, players = load_football()
print(countries.shape)
print(matches.shape)
print(leagues.shape)
print(teams.shape)
print(players.shape)

MOUNT_POINT not found in environment. Defaulting to /fileshare
(11, 2)
(25979, 115)
(11, 3)
(299, 5)
(183978, 42)
CPU times: user 3.91 s, sys: 412 ms, total: 4.32 s
Wall time: 4.34 s


In [3]:
# Display the leagues table loaded above.
leagues

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A
5,13274,13274,Netherlands Eredivisie
6,15722,15722,Poland Ekstraklasa
7,17642,17642,Portugal Liga ZON Sagres
8,19694,19694,Scotland Premier League
9,21518,21518,Spain LIGA BBVA


In [4]:
# Preview the first rows of the raw matches table.
matches.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,...,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,...,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,...,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,...,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,...,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


In [68]:
# Preview the last rows of the raw matches table.
matches.tail()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
25974,25975,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992091,10190,10191,1,...,,,,,,,,,,
25975,25976,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992092,9824,10199,1,...,,,,,,,,,,
25976,25977,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992093,9956,10179,2,...,,,,,,,,,,
25977,25978,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992094,7896,10243,0,...,,,,,,,,,,
25978,25979,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992095,10192,9931,4,...,,,,,,,,,,


In [6]:
# Keep only matches whose core metadata and all 22 line-up slots are present.
rows = ["country_id", "league_id", "season", "stage", "date", "match_api_id",
        "home_team_api_id", "away_team_api_id", "home_team_goal", "away_team_goal"]
# home_player_1..11 followed by away_player_1..11
rows += ["{}_player_{}".format(side, slot)
         for side in ("home", "away")
         for slot in range(1, 12)]
match_data = matches.dropna(subset=rows)
print(match_data.shape)
# Optional: reduce match data to fulfill run time requirements
#match_data = match_data.tail(1500)

(21374, 115)


In [7]:
%%time
# Build the per-match FIFA data from the filtered matches and the players table.
# NOTE(review): the recorded run of this cell took ~33 minutes; consider caching the result.
fifa_data = get_fifa_data(match_data, players)
print(fifa_data.shape)
fifa_data.head()

(21374, 23)
CPU times: user 33min 8s, sys: 4.06 s, total: 33min 12s
Wall time: 32min 42s


In [9]:
%%time
# Bookmaker column prefixes; `bk_cols` is the full list kept for reference,
# only `bk_cols_selected` is passed to `create_feables`.
bk_cols = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']
bk_cols_selected = ['B365', 'BW']      
# Build the modelling table (features + label) from match and FIFA data.
feables = create_feables(match_data, fifa_data, bk_cols_selected, get_overall = True)
print(feables.shape)


Generating match features...
Match features generated in 13.0 minutes
Generating match labels...
Match labels generated in 1.2 minutes
Generating bookkeeper data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Bookkeeper data generated in 0.0 minutes
(19673, 47)
CPU times: user 14min 27s, sys: 5.35 s, total: 14min 32s
Wall time: 14min 14s


In [10]:
# Preview the first rows of the modelling table.
feables.head()

Unnamed: 0,match_api_id,home_team_goals_difference,away_team_goals_difference,games_won_home_team,games_won_away_team,games_against_won,games_against_lost,League_1.0,League_1729.0,League_4769.0,...,away_player_9_overall_rating,away_player_10_overall_rating,away_player_11_overall_rating,B365_Win,B365_Draw,B365_Defeat,BW_Win,BW_Draw,BW_Defeat,label
0,493017.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,...,70.0,68.0,63.0,0.313804,0.276886,0.40931,0.307825,0.27941,0.412765,Win
1,493025.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,...,67.0,73.0,68.0,0.327179,0.286281,0.38654,0.290493,0.300176,0.409331,Defeat
2,493027.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,...,55.0,58.0,64.0,0.672897,0.209346,0.117757,0.672269,0.226891,0.10084,Win
3,493034.0,1.0,2.0,1.0,1.0,0.0,0.0,1,0,0,...,74.0,70.0,69.0,0.207407,0.259259,0.533333,0.192717,0.274476,0.532807,Win
4,493040.0,-2.0,0.0,0.0,0.0,0.0,0.0,1,0,0,...,60.0,63.0,65.0,0.535211,0.267606,0.197183,0.565759,0.25499,0.17925,Draw


In [38]:
# Everything except the match identifier and the target is a model input.
non_feature_cols = ['match_api_id', 'label']
feature_cols = feables.columns.difference(non_feature_cols)
features = feables[feature_cols]
labs = feables['label']
for part in (features, labs):
    print(part.shape)

(19673, 45)
(19673,)


In [43]:
# Stratified hold-out: 20% test first, then 20% of the remainder as validation.
X_rest, X_test, y_rest, y_test = train_test_split(
    features, labs, test_size=0.2, random_state=42, stratify=labs)
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.2, random_state=42, stratify=y_rest)

splits = (X_train, X_val, X_test)
for split in splits:
    print(split.shape)
# Sanity check: the three parts add up to the full data set.
print(sum(split.shape[0] for split in splits))

(12590, 45)
(3148, 45)
(3935, 45)
19673


In [59]:
# XGBoost hyper-parameters for the random-split experiment.
# NOTE(review): scale_pos_weight is usually a binary-imbalance knob; confirm it matters for 3 classes.
xgb_params = {
    'max_depth': 8,
    'learning_rate': 0.1,
    'scale_pos_weight': 2,
    'min_child_weight': 5,
    'n_estimators': 100,
    'subsample': 1,
}
clf = XGBClassifier(**xgb_params)

In [60]:
%%time
# Fit on the training split and report the 'merror' metric on the validation split each round.
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='merror')

[0]	validation_0-merror:0.492058
[1]	validation_0-merror:0.487611
[2]	validation_0-merror:0.484435
[3]	validation_0-merror:0.484435
[4]	validation_0-merror:0.481576
[5]	validation_0-merror:0.478399
[6]	validation_0-merror:0.478399
[7]	validation_0-merror:0.48094
[8]	validation_0-merror:0.481576
[9]	validation_0-merror:0.484117
[10]	validation_0-merror:0.484752
[11]	validation_0-merror:0.483164
[12]	validation_0-merror:0.482529
[13]	validation_0-merror:0.482529
[14]	validation_0-merror:0.483164
[15]	validation_0-merror:0.483799
[16]	validation_0-merror:0.482529
[17]	validation_0-merror:0.481576
[18]	validation_0-merror:0.482211
[19]	validation_0-merror:0.480623
[20]	validation_0-merror:0.483482
[21]	validation_0-merror:0.484435
[22]	validation_0-merror:0.485388
[23]	validation_0-merror:0.483799
[24]	validation_0-merror:0.483799
[25]	validation_0-merror:0.483482
[26]	validation_0-merror:0.481258
[27]	validation_0-merror:0.481258
[28]	validation_0-merror:0.481576
[29]	validation_0-merror:

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=5, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=1)

In [61]:
# Accuracy on the data the model was fitted on versus the held-out test split.
y_pred = clf.predict(X_test)
model_name = clf.__class__.__name__
train_acc = accuracy_score(y_train, clf.predict(X_train))
test_acc = accuracy_score(y_test, y_pred)
print("Score of {} for training set: {:.4f}.".format(model_name, train_acc))
print("Score of {} for test set: {:.4f}.".format(model_name, test_acc))

Score of XGBClassifier for training set: 0.7590.
Score of XGBClassifier for test set: 0.5225.


In [62]:
#https://github.com/miguelgfierro/codebase/blob/master/python/machine_learning/metrics.py
def classification_metrics_multilabel(y_true, y_pred, labels):
    """Compute summary classification metrics for a multi-class problem.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        labels: Class labels to include; also fixes the row/column order of
            the confusion matrix.

    Returns:
        dict: ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1'``
        (precision/recall/F1 are support-weighted averages) and
        ``'Confusion Matrix'``.
    """
    m_acc = accuracy_score(y_true, y_pred)
    # `labels` must be passed by keyword: recent scikit-learn releases accept
    # only y_true and y_pred positionally for these metrics.
    m_f1 = f1_score(y_true, y_pred, labels=labels, average='weighted')
    m_precision = precision_score(y_true, y_pred, labels=labels, average='weighted')
    m_recall = recall_score(y_true, y_pred, labels=labels, average='weighted')
    m_conf = confusion_matrix(y_true, y_pred, labels=labels)
    report = {'Accuracy':m_acc, 'Precision':m_precision, 'Recall':m_recall, 'F1':m_f1, 'Confusion Matrix':m_conf}
    return report

In [63]:
# Label order passed to the metrics helper; it fixes the row/column order of the confusion matrix.
labels = ["Win", "Draw", "Defeat"]
report = classification_metrics_multilabel(y_test, y_pred, labels)
report

{'Accuracy': 0.52249047013977123,
 'Confusion Matrix': array([[1452,   88,  267],
        [ 650,   65,  278],
        [ 532,   64,  539]]),
 'F1': 0.46751804585279649,
 'Precision': 0.472151244629816,
 'Recall': 0.52249047013977123}

In [64]:
# LightGBM hyper-parameters for the random-split experiment.
lgbm_params = {
    'num_leaves': 255,
    'learning_rate': 0.1,
    'scale_pos_weight': 2,
    'min_child_weight': 5,
    'n_estimators': 100,
    'subsample': 1,
}
clf = LGBMClassifier(**lgbm_params)

In [65]:
%%time
# Fit on the training split and report 'multi_error' on the validation split each round.
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='multi_error')

[1]	valid_0's multi_error: 0.520013
[2]	valid_0's multi_error: 0.524778
[3]	valid_0's multi_error: 0.513342
[4]	valid_0's multi_error: 0.511118
[5]	valid_0's multi_error: 0.5054
[6]	valid_0's multi_error: 0.50413
[7]	valid_0's multi_error: 0.499682
[8]	valid_0's multi_error: 0.506353
[9]	valid_0's multi_error: 0.505083
[10]	valid_0's multi_error: 0.506671
[11]	valid_0's multi_error: 0.506036
[12]	valid_0's multi_error: 0.499365
[13]	valid_0's multi_error: 0.499682
[14]	valid_0's multi_error: 0.501271
[15]	valid_0's multi_error: 0.496506
[16]	valid_0's multi_error: 0.499047
[17]	valid_0's multi_error: 0.498729
[18]	valid_0's multi_error: 0.499682
[19]	valid_0's multi_error: 0.499682
[20]	valid_0's multi_error: 0.498094
[21]	valid_0's multi_error: 0.498094
[22]	valid_0's multi_error: 0.499682
[23]	valid_0's multi_error: 0.497141
[24]	valid_0's multi_error: 0.498412
[25]	valid_0's multi_error: 0.499365
[26]	valid_0's multi_error: 0.503494
[27]	valid_0's multi_error: 0.502541
[28]	valid_0'

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=2, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [66]:
# Accuracy on the fitted data versus the held-out test split for the LightGBM model.
y_pred = clf.predict(X_test)
score_line = "Score of {} for {} set: {:.4f}."
estimator_name = clf.__class__.__name__
print(score_line.format(estimator_name, "training", accuracy_score(y_train, clf.predict(X_train))))
print(score_line.format(estimator_name, "test", accuracy_score(y_test, y_pred)))

Score of LGBMClassifier for training set: 0.9998.
Score of LGBMClassifier for test set: 0.5116.


In [67]:
# Same metrics report for the latest predictions, reusing the `labels` list defined earlier.
report = classification_metrics_multilabel(y_test, y_pred, labels)
report

{'Accuracy': 0.51156289707750957,
 'Confusion Matrix': array([[1387,  147,  273],
        [ 623,  109,  261],
        [ 503,  115,  517]]),
 'F1': 0.47163912122565715,
 'Precision': 0.4694795495800122,
 'Recall': 0.51156289707750957}

# 2) Concept drift

In [None]:
# Restrict the study to a single league (id 21518, "Spain LIGA BBVA" in the leagues table).
la_liga_id = 21518
is_la_liga = matches['league_id'] == la_liga_id
matches_target = matches[is_la_liga]
print(matches_target.shape)
for preview in (matches_target.head(5), matches_target.tail(5)):
    print(preview)

In [None]:
# Columns that must be non-null: core match metadata plus all 22 line-up slots.
cols = ["country_id", "league_id", "season", "stage", "date", "match_api_id",
        "home_team_api_id", "away_team_api_id", "home_team_goal", "away_team_goal"]
for side in ("home", "away"):
    cols.extend("{}_player_{}".format(side, slot) for slot in range(1, 12))
match_data = matches_target.dropna(subset=cols)
print(match_data.shape)

In [None]:
%%time
# Rebuild the per-match FIFA data for the single-league subset (overwrites `fifa_data`).
fifa_data = get_fifa_data(match_data, players)


In [None]:
# Display setting only; it does not change any data.
pd.set_option('display.max_columns', None) #show all columns in pandas

In [None]:
# Check the size and first rows of the single-league FIFA data.
print(fifa_data.shape)
fifa_data.head()

In [None]:
%%time
# Bookmaker column prefixes; only `bk_cols_selected` is passed to `create_feables`.
bk_cols = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']
bk_cols_selected = ['B365', 'BW']      
# Rebuild the modelling table for the single-league subset (overwrites `feables`).
feables = create_feables(match_data, fifa_data, bk_cols_selected, get_overall = True)
print(feables.shape)
feables.head()

In [None]:
# The modelling table shown earlier in this notebook has no `season` column
# (its columns are match_api_id, form features, league dummies, player ratings,
# bookmaker probabilities and label), so indexing feables['season'] would raise
# KeyError. Attach the season from `match_data` via `match_api_id` when missing;
# the guard keeps the cell idempotent and is a no-op if `create_feables`
# already provides the column.
if 'season' not in feables.columns:
    feables = feables.merge(match_data[['match_api_id', 'season']],
                            on='match_api_id', how='left')

# Hold out the two most recent seasons; everything earlier is training data.
feables_2015_2016 = feables[feables['season'] == '2015/2016']
print(feables_2015_2016.shape)
feables_2014_2015 = feables[feables['season'] == '2014/2015']
print(feables_2014_2015.shape)
feables_rest = feables[(feables['season'] != '2014/2015') & (feables['season'] != '2015/2016')]
print(feables_rest.shape)

In [None]:
# One shared column selection so the training and both test matrices line up.
drift_feature_cols = feables_rest.columns.difference(['match_api_id', 'label', 'season'])

X_train, y_train = feables_rest[drift_feature_cols], feables_rest['label']
X_test1, y_test1 = feables_2014_2015[drift_feature_cols], feables_2014_2015['label']
X_test2, y_test2 = feables_2015_2016[drift_feature_cols], feables_2015_2016['label']

for design_matrix in (X_train, X_test1, X_test2):
    print(design_matrix.shape)


In [None]:
# Second training set: every season except the final one (2015/2016).
not_last_season = feables['season'] != '2015/2016'
feables_up_to_2014_2015 = feables[not_last_season]
print(feables_up_to_2014_2015.shape)
extended_feature_cols = feables_up_to_2014_2015.columns.difference(['match_api_id', 'label', 'season'])
X_train2 = feables_up_to_2014_2015[extended_feature_cols]
y_train2 = feables_up_to_2014_2015['label']

In [None]:
# Same XGBoost configuration as in the random-split experiment.
xgb_drift_params = dict(
    max_depth=8,
    learning_rate=0.1,
    scale_pos_weight=2,
    min_child_weight=5,
    n_estimators=100,
    subsample=1,
)
clf = XGBClassifier(**xgb_drift_params)

In [None]:
%%time
# Fit on seasons before 2014/2015 only; no evaluation set is passed here.
clf.fit(X_train, y_train, verbose=True, eval_metric='merror')

In [None]:
# Evaluate on the 2014/2015 season.
y_pred1 = clf.predict(X_test1)
fitted_name = clf.__class__.__name__
acc_train = accuracy_score(y_train, clf.predict(X_train))
acc_test1 = accuracy_score(y_test1, y_pred1)
print("Score of {} for training set: {:.4f}.".format(fitted_name, acc_train))
print("Score of {} for test set: {:.4f}.".format(fitted_name, acc_test1))

In [None]:
# Evaluate the same model on the 2015/2016 season.
y_pred2 = clf.predict(X_test2)
fitted_name = clf.__class__.__name__
acc_train = accuracy_score(y_train, clf.predict(X_train))
acc_test2 = accuracy_score(y_test2, y_pred2)
print("Score of {} for training set: {:.4f}.".format(fitted_name, acc_train))
print("Score of {} for test set: {:.4f}.".format(fitted_name, acc_test2))

In [None]:
%%time
# Refit the same estimator on the extended training set (all seasons except 2015/2016).
clf.fit(X_train2, y_train2, verbose=True, eval_metric='merror')

In [None]:
# Evaluate the refitted model on 2015/2016 against its own (extended) training set.
y_pred2 = clf.predict(X_test2)
fitted_name = clf.__class__.__name__
acc_train2 = accuracy_score(y_train2, clf.predict(X_train2))
acc_test2 = accuracy_score(y_test2, y_pred2)
print("Score of {} for training set: {:.4f}.".format(fitted_name, acc_train2))
print("Score of {} for test set: {:.4f}.".format(fitted_name, acc_test2))

In [None]:
# Same LightGBM configuration as in the random-split experiment.
lgbm_drift_params = dict(
    num_leaves=255,
    learning_rate=0.1,
    scale_pos_weight=2,
    min_child_weight=5,
    n_estimators=100,
    subsample=1,
)
clf = LGBMClassifier(**lgbm_drift_params)

In [None]:
%%time
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='multi_error')

In [None]:
# Evaluate the LightGBM model on the 2014/2015 season.
y_pred1 = clf.predict(X_test1)
score_template = "Score of {} for {} set: {:.4f}."
learner = clf.__class__.__name__
print(score_template.format(learner, "training", accuracy_score(y_train, clf.predict(X_train))))
print(score_template.format(learner, "test", accuracy_score(y_test1, y_pred1)))

In [None]:
# Evaluate the same LightGBM model on the 2015/2016 season.
y_pred2 = clf.predict(X_test2)
score_template = "Score of {} for {} set: {:.4f}."
learner = clf.__class__.__name__
print(score_template.format(learner, "training", accuracy_score(y_train, clf.predict(X_train))))
print(score_template.format(learner, "test", accuracy_score(y_test2, y_pred2)))

In [None]:
%%time
clf.fit(X_train2, y_train2, verbose=True, eval_metric='merror')

In [None]:
# Evaluate the refitted LightGBM model on 2015/2016 against its extended training set.
y_pred2 = clf.predict(X_test2)
score_template = "Score of {} for {} set: {:.4f}."
learner = clf.__class__.__name__
print(score_template.format(learner, "training", accuracy_score(y_train2, clf.predict(X_train2))))
print(score_template.format(learner, "test", accuracy_score(y_test2, y_pred2)))