# 003.1 Season Cross Validation

League games match Structure

Match Days

Prediction Sets

Time series Cross Validation

In [1]:
import pathlib
import sys

from typing import List, Tuple, Union, Callable
from types import ModuleType

from statsmodels.graphics.mosaicplot import mosaic
import scipy.stats as stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
import seaborn as sns

%matplotlib inline

sys.path.append('..')
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload
# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

from src.data.utils import make_interim_data, get_clean_season, get_interim_data_fps
from src.model_selection.resampling import league_split, calc_pred_set_ids

### Load Test Data

In [2]:
# Collect the file paths found in the interim data directory
fl = get_interim_data_fps()
# Extract the 2012-2013 season from the first interim file
season = get_clean_season(fl[0], '2012-2013')
# sort_values returns a new frame rather than sorting in place, so the result
# must be assigned for the ordering to take effect. mergesort is a stable
# sort, so fixtures played on the same date keep their original relative order.
season = season.sort_values(by='date', kind='mergesort')
season.head()

Unnamed: 0,date,season,h,a,h_goals,a_goals,h_shots,a_shots,h_sot,a_sot,hwin_odds,draw_odds,awin_odds,result,h_phwinodds,a_pdrawodds,a_pawinodds
0,2012-08-18,2012-2013,Arsenal,Sunderland,0,0,14,3,4,2,1.44,4.72,8.71,draw,0.680082,0.207482,0.112436
1,2012-08-18,2012-2013,Fulham,Norwich,5,0,11,4,9,2,1.84,3.75,4.75,hwin,0.532471,0.261266,0.206263
2,2012-08-18,2012-2013,Newcastle,Tottenham,2,1,6,12,4,6,2.83,3.35,2.72,hwin,0.346594,0.292795,0.360611
3,2012-08-18,2012-2013,QPR,Swansea,0,5,20,12,11,8,2.0,3.53,4.15,awin,0.488162,0.276579,0.235259
4,2012-08-18,2012-2013,Reading,Stoke,1,1,9,6,3,3,2.47,3.3,3.22,draw,0.397525,0.297541,0.304934


In [3]:
# Dimensions of the loaded season frame as (rows, columns)
season.shape

(380, 17)

## Time Series Cross Validation Notes

There are 20 teams in the Premier League. Each team gets visited by 19 other teams, meaning that a team plays 19 home games per season. So 20 teams x 19 games x 2 (home and away) = 760 individual team performances, but there are 2 teams per game, so this means 380 games are played in the season.

So for example, Man U. will play every other team at home and every other team away. I think the fixture schedulers do their best to try and alternate home and away games.

So, if the games are denoted by home team vs away team, then every game is unique. I don't think this is true of the US Major League Soccer. I think there may be repeat games within an MLS season.

When we are considering a cross validation scheme for a league with fixtures structured like the EPL, certain key points are relevant:
+ If we are using a model based on past game results, then we would need to see at least 1, and more likely 2, home and away games played by each team before we start predicting. However, if we are using previous-season data and odds data, then this does not apply
+ There are natural `prediction sets`. These are where each team has played x home games, and y away games. When we back test we can identify these prediction sets with an integer, and use this as a forward moving cross validation scheme
+ It is probably better to code something up that we can plug into a scikit-learn flow
+ [Rob J Hyndman Cross-validation for time series](https://robjhyndman.com/hyndsight/tscv/) - Good explanation, and the following graphic says it far more clearly than text. Just consider each dot to be a prediction set - i.e. the next game that each of the 20 league teams plays

![Time Series Cross-validation](images/cv1-1.png "Time Series Cross-validation")


In [4]:
# Take an independent copy of the season (DataFrame.copy is deep by default)
# so later steps operate on their own frame
season_df = season.copy()
# Run the prediction-set calculation; the displayed result carries a
# `pred_set` column alongside the original fixture columns
season_df_mod = calc_pred_set_ids(season_df)
# Preview the first 20 rows, ordered by match date
season_df_mod.iloc[:20].sort_values('date')

Unnamed: 0,date,season,h,a,h_goals,a_goals,h_shots,a_shots,h_sot,a_sot,hwin_odds,draw_odds,awin_odds,result,h_phwinodds,a_pdrawodds,a_pawinodds,pred_set
0,2012-08-18,2012-2013,Arsenal,Sunderland,0,0,14,3,4,2,1.44,4.72,8.71,draw,0.680082,0.207482,0.112436,0
1,2012-08-18,2012-2013,Fulham,Norwich,5,0,11,4,9,2,1.84,3.75,4.75,hwin,0.532471,0.261266,0.206263,0
2,2012-08-18,2012-2013,Newcastle,Tottenham,2,1,6,12,4,6,2.83,3.35,2.72,hwin,0.346594,0.292795,0.360611,0
3,2012-08-18,2012-2013,QPR,Swansea,0,5,20,12,11,8,2.0,3.53,4.15,awin,0.488162,0.276579,0.235259,0
4,2012-08-18,2012-2013,Reading,Stoke,1,1,9,6,3,3,2.47,3.3,3.22,draw,0.397525,0.297541,0.304934,0
5,2012-08-18,2012-2013,West Brom,Liverpool,3,0,15,14,10,7,4.76,3.74,1.84,hwin,0.205775,0.261895,0.53233,0
6,2012-08-18,2012-2013,West Ham,Aston Villa,1,0,8,10,4,6,2.14,3.5,3.71,hwin,0.456987,0.279415,0.263599,0
7,2012-08-19,2012-2013,Man City,Southampton,3,2,20,9,15,6,1.13,9.38,26.18,hwin,0.859378,0.103529,0.037093,0
8,2012-08-19,2012-2013,Wigan,Chelsea,0,2,12,5,4,3,4.92,3.77,1.81,awin,0.199073,0.259799,0.541128,0
9,2012-08-20,2012-2013,Everton,Man United,1,0,16,12,7,7,3.91,3.59,2.06,hwin,0.250803,0.273159,0.476039,0


In [5]:
# Select every fixture assigned to the first prediction set (id 0)
# NOTE(review): presumably this is each team's opening fixture of the season -
# confirm against calc_pred_set_ids
pset = 0
pred_set = season_df_mod.loc[season_df_mod['pred_set'].eq(pset)]
pred_set

Unnamed: 0,date,season,h,a,h_goals,a_goals,h_shots,a_shots,h_sot,a_sot,hwin_odds,draw_odds,awin_odds,result,h_phwinodds,a_pdrawodds,a_pawinodds,pred_set
0,2012-08-18,2012-2013,Arsenal,Sunderland,0,0,14,3,4,2,1.44,4.72,8.71,draw,0.680082,0.207482,0.112436,0
1,2012-08-18,2012-2013,Fulham,Norwich,5,0,11,4,9,2,1.84,3.75,4.75,hwin,0.532471,0.261266,0.206263,0
2,2012-08-18,2012-2013,Newcastle,Tottenham,2,1,6,12,4,6,2.83,3.35,2.72,hwin,0.346594,0.292795,0.360611,0
3,2012-08-18,2012-2013,QPR,Swansea,0,5,20,12,11,8,2.0,3.53,4.15,awin,0.488162,0.276579,0.235259,0
4,2012-08-18,2012-2013,Reading,Stoke,1,1,9,6,3,3,2.47,3.3,3.22,draw,0.397525,0.297541,0.304934,0
5,2012-08-18,2012-2013,West Brom,Liverpool,3,0,15,14,10,7,4.76,3.74,1.84,hwin,0.205775,0.261895,0.53233,0
6,2012-08-18,2012-2013,West Ham,Aston Villa,1,0,8,10,4,6,2.14,3.5,3.71,hwin,0.456987,0.279415,0.263599,0
7,2012-08-19,2012-2013,Man City,Southampton,3,2,20,9,15,6,1.13,9.38,26.18,hwin,0.859378,0.103529,0.037093,0
8,2012-08-19,2012-2013,Wigan,Chelsea,0,2,12,5,4,3,4.92,3.77,1.81,awin,0.199073,0.259799,0.541128,0
9,2012-08-20,2012-2013,Everton,Man United,1,0,16,12,7,7,3.91,3.59,2.06,hwin,0.250803,0.273159,0.476039,0


In [6]:
# Select every fixture assigned to prediction set 20 (a mid-season set)
pset = 20
pred_set = season_df_mod.loc[season_df_mod['pred_set'].eq(pset)]
pred_set

Unnamed: 0,date,season,h,a,h_goals,a_goals,h_shots,a_shots,h_sot,a_sot,hwin_odds,draw_odds,awin_odds,result,h_phwinodds,a_pdrawodds,a_pawinodds,pred_set
198,2013-01-01,2012-2013,Wigan,Man United,0,4,7,14,3,9,4.88,3.9,1.79,awin,0.200902,0.251386,0.547712,20
200,2013-01-01,2012-2013,West Brom,Fulham,1,2,17,10,10,5,2.0,3.62,4.05,awin,0.488684,0.269991,0.241325,20
201,2013-01-01,2012-2013,Tottenham,Reading,3,1,30,7,20,4,1.34,5.58,10.47,hwin,0.730926,0.175527,0.093547,20
203,2013-01-01,2012-2013,Man City,Stoke,3,0,17,4,9,3,1.33,5.17,12.82,hwin,0.734755,0.189018,0.076227,20
204,2013-01-01,2012-2013,Swansea,Aston Villa,2,2,23,8,13,6,1.62,4.09,5.99,draw,0.600046,0.237671,0.162283,20
205,2013-01-02,2012-2013,Newcastle,Everton,1,2,14,7,11,6,3.82,3.46,2.13,awin,0.256576,0.283272,0.460151,20
207,2013-01-02,2012-2013,Liverpool,Sunderland,3,0,27,6,16,5,1.4,5.09,9.17,hwin,0.700417,0.192649,0.106934,20
209,2013-01-12,2012-2013,Stoke,Chelsea,0,4,11,11,5,9,5.16,3.57,1.83,awin,0.189932,0.274523,0.535545,20
210,2013-01-12,2012-2013,Sunderland,West Ham,3,0,18,9,14,4,2.68,3.28,2.92,hwin,0.365647,0.29876,0.335593,20
214,2013-01-12,2012-2013,Aston Villa,Southampton,0,1,16,11,10,4,2.66,3.39,2.84,awin,0.367474,0.288342,0.344183,20


In [7]:
# Select every fixture assigned to prediction set 37
pset = 37
pred_set = season_df_mod.loc[season_df_mod['pred_set'].eq(pset)]
pred_set

Unnamed: 0,date,season,h,a,h_goals,a_goals,h_shots,a_shots,h_sot,a_sot,hwin_odds,draw_odds,awin_odds,result,h_phwinodds,a_pdrawodds,a_pawinodds,pred_set
370,2013-05-19,2012-2013,Chelsea,Everton,2,1,20,16,10,9,1.78,3.9,4.95,hwin,0.550659,0.251326,0.198015,37
371,2013-05-19,2012-2013,Wigan,Aston Villa,2,2,12,5,6,2,2.1,3.56,3.77,draw,0.465784,0.27476,0.259455,37
372,2013-05-19,2012-2013,West Ham,Reading,4,2,21,17,12,7,1.59,4.36,6.13,hwin,0.615741,0.224548,0.159711,37
373,2013-05-19,2012-2013,West Brom,Man United,5,5,15,12,8,8,3.65,3.42,2.17,draw,0.266718,0.284655,0.448627,37
374,2013-05-19,2012-2013,Tottenham,Sunderland,1,0,23,6,19,4,1.25,6.6,15.0,hwin,0.785714,0.14881,0.065476,37
375,2013-05-19,2012-2013,Swansea,Fulham,0,3,19,8,11,6,1.66,4.09,5.76,awin,0.590297,0.239583,0.17012,37
376,2013-05-19,2012-2013,Newcastle,Arsenal,0,1,7,9,2,3,6.26,4.63,1.55,awin,0.156476,0.211563,0.631961,37
377,2013-05-19,2012-2013,Man City,Norwich,2,3,12,12,5,7,1.29,5.99,12.91,awin,0.760294,0.163736,0.07597,37
378,2013-05-19,2012-2013,Liverpool,QPR,1,0,27,10,20,3,1.25,6.8,13.98,hwin,0.7854,0.144375,0.070225,37
379,2013-05-19,2012-2013,Southampton,Stoke,1,1,11,7,7,3,1.7,3.74,5.96,draw,0.574785,0.261266,0.163949,37


In [8]:
# Walk through the splits produced by league_split. Each iteration yields a
# (train, val) pair, and both are printed together with their lengths so the
# size of each fold can be inspected.
for train, val in league_split(season_df):
    print(f'{train} len: {len(train)}\n{val} len: {len(val)}\n')

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 24 20 21 22
 23 25 26 27 28 36 37 29 30 31 32 33 34 35 38 39 40] len: 41
[41 42 43 44 45 46 47 48 50 51] len: 10

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 24 20 21 22
 23 25 26 27 28 36 37 29 30 31 32 33 34 35 38 39 40 41 42 43 44 45 46 47
 48 50 51] len: 51
[49 52 53 54 55 56 57 58 61 62] len: 10

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 24 20 21 22
 23 25 26 27 28 36 37 29 30 31 32 33 34 35 38 39 40 41 42 43 44 45 46 47
 48 50 51 49 52 53 54 55 56 57 58 61 62] len: 61
[59 60 63 64 65 66 67 68 73 77] len: 10

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 24 20 21 22
 23 25 26 27 28 36 37 29 30 31 32 33 34 35 38 39 40 41 42 43 44 45 46 47
 48 50 51 49 52 53 54 55 56 57 58 61 62 59 60 63 64 65 66 67 68 73 77] len: 71
[69 70 71 72 74 75 76 78 79 80] len: 10

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 24 20 21 22
 23 25 26 27 28 36 37 29 30 31 32 33 34 35 3