# 003.2 - Formatting Into Historical Data (& Custom Transformers)

In [1]:
import pathlib
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

sys.path.append('..')
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload
# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

from src.data.utils import make_interim_data, get_clean_season, get_interim_data_fps
from src.data.historical_records import form_historical_records
from src.features.custom_transformers import LastNGames, Drop0Columns, IdentityTransformer

## Notes

The input DataFrame is in time sequence: index 0 is the earliest game, and the maximum index is the latest game.

There are 2 teams per game, `h` and `a`, where `h` is the team that played at home and `a` is the team that played away.

Each team has generated a feature by the end of the game - `h_feat`, and `a_feat`

There can be multiple features, meaning one `h_`/`a_` pair of columns for each feature

Each team has a home game record and an away game record

So, there are 4 records to get for each feature
+ home team home record `h_h`
+ home team away record `h_a`
+ away team home record `a_h`
+ away team away record `a_a`

Results are presented as 0, -1, -2 ... -n, where 0 refers to the current game. The `_0` columns can be used as predictor variables for classification, used as a classification response, or dropped so that only the previous games' features are used for prediction.


### Interpretation Note

`h_h_feat_-1` means:
+ (`h`) home team 
+ (`h`) home game records 
+ (`feat`) feature value in 
+ (`-1`) last game

### Test Data

In [2]:
# Toy fixture: nine games in time sequence. Each row is one game with a home
# team (`h`), an away team (`a`) and two features recorded for each side.
home_teams = list('ACBEABLMB')
away_teams = list('BDAFJKBAS')
n_games = len(home_teams)

# Feature values are consecutive integers (0-8, 9-17, 18-26, 27-35) so that
# every cell is unique and can be traced through the historical records.
feature_names = ['h_firstfeat', 'a_firstfeat', 'h_secondfeat', 'a_secondfeat']
test_columns = {'h': home_teams, 'a': away_teams}
test_columns.update({
    feature_name: list(range(position * n_games, (position + 1) * n_games))
    for position, feature_name in enumerate(feature_names)
})

df = pd.DataFrame(test_columns)
df

Unnamed: 0,h,a,h_firstfeat,a_firstfeat,h_secondfeat,a_secondfeat
0,A,B,0,9,18,27
1,C,D,1,10,19,28
2,B,A,2,11,20,29
3,E,F,3,12,21,30
4,A,J,4,13,22,31
5,B,K,5,14,23,32
6,L,B,6,15,24,33
7,M,A,7,16,25,34
8,B,S,8,17,26,35


In [3]:
# `DataFrame.copy()` deep-copies by default; the name `df` is rebound to the copy.
df = df.copy()
print(df)
# Build the historical-record columns (`h_h_*`, `h_a_*`, `a_h_*`, `a_a_*`) for every game.
records_df = form_historical_records(df)
# NOTE(review): in the output below, row 4 (A at home) has `h_a_firstfeat_-1` == 9.0,
# which is `a_firstfeat` from A's previous home game (row 0), not a value from
# A's previous away game (row 2). Confirm this matches the intended meaning of
# "home team away record" in the notes.
print(records_df.columns)
records_df



   h  a  h_firstfeat  a_firstfeat  h_secondfeat  a_secondfeat
0  A  B            0            9            18            27
1  C  D            1           10            19            28
2  B  A            2           11            20            29
3  E  F            3           12            21            30
4  A  J            4           13            22            31
5  B  K            5           14            23            32
6  L  B            6           15            24            33
7  M  A            7           16            25            34
8  B  S            8           17            26            35
Index(['h', 'h_h_firstfeat_-1', 'h_h_firstfeat_-2', 'h_h_firstfeat_0',
       'h_h_secondfeat_-1', 'h_h_secondfeat_-2', 'h_h_secondfeat_0',
       'h_a_firstfeat_-1', 'h_a_firstfeat_-2', 'h_a_firstfeat_0',
       'h_a_secondfeat_-1', 'h_a_secondfeat_-2', 'h_a_secondfeat_0', 'a',
       'a_h_firstfeat_-1', 'a_h_firstfeat_0', 'a_h_secondfeat_-1',
       'a_h_secondfeat_0', 'a_a_f

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  result = concat(values, axis=self.axis)


Unnamed: 0,h,h_h_firstfeat_-1,h_h_firstfeat_-2,h_h_firstfeat_0,h_h_secondfeat_-1,h_h_secondfeat_-2,h_h_secondfeat_0,h_a_firstfeat_-1,h_a_firstfeat_-2,h_a_firstfeat_0,...,h_a_secondfeat_0,a,a_h_firstfeat_-1,a_h_firstfeat_0,a_h_secondfeat_-1,a_h_secondfeat_0,a_a_firstfeat_-1,a_a_firstfeat_0,a_a_secondfeat_-1,a_a_secondfeat_0
0,A,,,0,,,18,,,9,...,27,B,,0,,18,,9,,27
1,C,,,1,,,19,,,10,...,28,D,,1,,19,,10,,28
2,B,,,2,,,20,,,11,...,29,A,,2,,20,,11,,29
3,E,,,3,,,21,,,12,...,30,F,,3,,21,,12,,30
4,A,0.0,,4,18.0,,22,9.0,,13,...,31,J,,4,,22,,13,,31
5,B,2.0,,5,20.0,,23,11.0,,14,...,32,K,,5,,23,,14,,32
6,L,,,6,,,24,,,15,...,33,B,0.0,6,18.0,24,9.0,15,27.0,33
7,M,,,7,,,25,,,16,...,34,A,2.0,7,20.0,25,11.0,16,29.0,34
8,B,5.0,2.0,8,23.0,20.0,26,14.0,11.0,17,...,35,S,,8,,26,,17,,35


## League Data

In [4]:
# The file name contains the season range 19XX-20YY.csv
# Extract a single season from the interim data
from src.data.utils import get_clean_season, get_interim_data_fps

file_path_list = get_interim_data_fps()
season = get_clean_season(file_path_list[0], '2012-2013')
# `sort_values` returns a new frame instead of sorting in place, so the result
# must be assigned back or the sort is silently lost.
# A stable sort (mergesort) keeps same-day fixtures in their original relative
# order, and the index is rebuilt so that index 0 is the earliest game.
season = season.sort_values(by='date', kind='mergesort').reset_index(drop=True)
season.head()

Unnamed: 0,date,season,h,a,h_goals,a_goals,h_shots,a_shots,h_sot,a_sot,hwin_odds,draw_odds,awin_odds,result,h_phwinodds,a_pdrawodds,a_pawinodds
0,2012-08-18,2012-2013,Arsenal,Sunderland,0,0,14,3,4,2,1.44,4.72,8.71,draw,0.680082,0.207482,0.112436
1,2012-08-18,2012-2013,Fulham,Norwich,5,0,11,4,9,2,1.84,3.75,4.75,hwin,0.532471,0.261266,0.206263
2,2012-08-18,2012-2013,Newcastle,Tottenham,2,1,6,12,4,6,2.83,3.35,2.72,hwin,0.346594,0.292795,0.360611
3,2012-08-18,2012-2013,QPR,Swansea,0,5,20,12,11,8,2.0,3.53,4.15,awin,0.488162,0.276579,0.235259
4,2012-08-18,2012-2013,Reading,Stoke,1,1,9,6,3,3,2.47,3.3,3.22,draw,0.397525,0.297541,0.304934


In [5]:
# Keep the two team columns plus the home/away pair of every match statistic;
# the date, season, odds and result columns are left out.
match_stats = ['goals', 'shots', 'sot']
keeper_cols = ['h', 'a'] + [side + '_' + stat for stat in match_stats for side in ('h', 'a')]
season = season[keeper_cols]
season.head()

Unnamed: 0,h,a,h_goals,a_goals,h_shots,a_shots,h_sot,a_sot
0,Arsenal,Sunderland,0,0,14,3,4,2
1,Fulham,Norwich,5,0,11,4,9,2
2,Newcastle,Tottenham,2,1,6,12,4,6
3,QPR,Swansea,0,5,20,12,11,8
4,Reading,Stoke,1,1,9,6,3,3


In [6]:
# Build the historical records for the whole season and list every generated column name.
records_df = form_historical_records(season)
print(records_df.columns.tolist())

['h', 'h_h_goals_0', 'h_h_goals_-1', 'h_h_goals_-2', 'h_h_goals_-3', 'h_h_goals_-4', 'h_h_goals_-5', 'h_h_goals_-6', 'h_h_goals_-7', 'h_h_goals_-8', 'h_h_goals_-9', 'h_h_goals_-10', 'h_h_goals_-11', 'h_h_goals_-12', 'h_h_goals_-13', 'h_h_goals_-14', 'h_h_goals_-15', 'h_h_goals_-16', 'h_h_goals_-17', 'h_h_goals_-18', 'h_h_shots_0', 'h_h_shots_-1', 'h_h_shots_-2', 'h_h_shots_-3', 'h_h_shots_-4', 'h_h_shots_-5', 'h_h_shots_-6', 'h_h_shots_-7', 'h_h_shots_-8', 'h_h_shots_-9', 'h_h_shots_-10', 'h_h_shots_-11', 'h_h_shots_-12', 'h_h_shots_-13', 'h_h_shots_-14', 'h_h_shots_-15', 'h_h_shots_-16', 'h_h_shots_-17', 'h_h_shots_-18', 'h_h_sot_0', 'h_h_sot_-1', 'h_h_sot_-2', 'h_h_sot_-3', 'h_h_sot_-4', 'h_h_sot_-5', 'h_h_sot_-6', 'h_h_sot_-7', 'h_h_sot_-8', 'h_h_sot_-9', 'h_h_sot_-10', 'h_h_sot_-11', 'h_h_sot_-12', 'h_h_sot_-13', 'h_h_sot_-14', 'h_h_sot_-15', 'h_h_sot_-16', 'h_h_sot_-17', 'h_h_sot_-18', 'h_a_goals_0', 'h_a_goals_-1', 'h_a_goals_-2', 'h_a_goals_-3', 'h_a_goals_-4', 'h_a_goals_-5', '

In [7]:
# Earliest games of the season: apart from the `_0` columns, the history is empty (NaN).
records_df.head(n=40)

Unnamed: 0,h,h_h_goals_0,h_h_goals_-1,h_h_goals_-2,h_h_goals_-3,h_h_goals_-4,h_h_goals_-5,h_h_goals_-6,h_h_goals_-7,h_h_goals_-8,...,a_a_sot_-9,a_a_sot_-10,a_a_sot_-11,a_a_sot_-12,a_a_sot_-13,a_a_sot_-14,a_a_sot_-15,a_a_sot_-16,a_a_sot_-17,a_a_sot_-18
0,Arsenal,0,,,,,,,,,...,,,,,,,,,,
1,Fulham,5,,,,,,,,,...,,,,,,,,,,
2,Newcastle,2,,,,,,,,,...,,,,,,,,,,
3,QPR,0,,,,,,,,,...,,,,,,,,,,
4,Reading,1,,,,,,,,,...,,,,,,,,,,
5,West Brom,3,,,,,,,,,...,,,,,,,,,,
6,West Ham,1,,,,,,,,,...,,,,,,,,,,
7,Man City,3,,,,,,,,,...,,,,,,,,,,
8,Wigan,0,,,,,,,,,...,,,,,,,,,,
9,Everton,1,,,,,,,,,...,,,,,,,,,,


In [8]:
# Latest games of the season: most of the history columns are populated by now.
records_df.tail(n=40)

Unnamed: 0,h,h_h_goals_0,h_h_goals_-1,h_h_goals_-2,h_h_goals_-3,h_h_goals_-4,h_h_goals_-5,h_h_goals_-6,h_h_goals_-7,h_h_goals_-8,...,a_a_sot_-9,a_a_sot_-10,a_a_sot_-11,a_a_sot_-12,a_a_sot_-13,a_a_sot_-14,a_a_sot_-15,a_a_sot_-16,a_a_sot_-17,a_a_sot_-18
340,Man City,2,1.0,4.0,2.0,2.0,2.0,3.0,1.0,2.0,...,3.0,2.0,3.0,6.0,5.0,11.0,10.0,4.0,6.0,
341,Everton,1,2.0,1.0,2.0,3.0,3.0,2.0,0.0,1.0,...,3.0,7.0,5.0,5.0,8.0,5.0,9.0,13.0,11.0,
342,Newcastle,0,0.0,1.0,2.0,4.0,3.0,1.0,1.0,1.0,...,6.0,8.0,9.0,14.0,5.0,5.0,6.0,9.0,7.0,
343,Arsenal,1,0.0,3.0,4.0,2.0,1.0,2.0,5.0,0.0,...,3.0,5.0,9.0,7.0,7.0,9.0,4.0,9.0,7.0,
344,Chelsea,2,2.0,2.0,1.0,4.0,2.0,2.0,0.0,8.0,...,0.0,10.0,9.0,3.0,5.0,0.0,5.0,8.0,,
345,Reading,0,0.0,0.0,1.0,0.0,2.0,2.0,3.0,1.0,...,4.0,9.0,6.0,8.0,3.0,6.0,6.0,5.0,4.0,
346,Aston Villa,6,1.0,1.0,3.0,0.0,2.0,1.0,0.0,0.0,...,10.0,6.0,9.0,4.0,3.0,6.0,2.0,3.0,2.0,
347,West Ham,0,2.0,2.0,3.0,2.0,1.0,1.0,2.0,1.0,...,11.0,13.0,6.0,3.0,3.0,7.0,3.0,7.0,5.0,
348,West Brom,2,1.0,1.0,2.0,2.0,0.0,2.0,1.0,2.0,...,6.0,6.0,4.0,3.0,4.0,7.0,3.0,4.0,8.0,
349,Swansea,0,0.0,1.0,0.0,1.0,4.0,3.0,2.0,1.0,...,12.0,10.0,6.0,7.0,8.0,13.0,12.0,11.0,5.0,


## Transformers

### Get last n Games

In [9]:
# Fit and apply the LastNGames transformer with n = 3.
# In the output below only the `_0`, `_-1` and `_-2` columns remain,
# i.e. the current game and the two games before it.
last_n_games = LastNGames(3)
last_n_games.fit(records_df)
cut_df = last_n_games.transform(records_df)
cut_df.tail(n=20)

Unnamed: 0,h,a,h_h_goals_0,h_h_goals_-1,h_h_goals_-2,h_h_shots_0,h_h_shots_-1,h_h_shots_-2,h_h_sot_0,h_h_sot_-1,...,a_h_sot_-2,a_a_goals_0,a_a_goals_-1,a_a_goals_-2,a_a_shots_0,a_a_shots_-1,a_a_shots_-2,a_a_sot_0,a_a_sot_-1,a_a_sot_-2
360,Aston Villa,Chelsea,1,6.0,1.0,12,18.0,15.0,7,15.0,...,12.0,2,1.0,2.0,11,15.0,9.0,8,6.0,6.0
361,Sunderland,Southampton,1,1.0,1.0,5,14.0,8.0,5,7.0,...,7.0,1,0.0,0.0,15,10.0,9.0,10,6.0,5.0
362,QPR,Newcastle,1,0.0,0.0,8,14.0,25.0,3,6.0,...,4.0,2,0.0,1.0,10,11.0,10.0,7,6.0,6.0
363,Norwich,West Brom,4,1.0,2.0,11,9.0,19.0,8,4.0,...,4.0,0,0.0,3.0,8,20.0,11.0,5,11.0,9.0
364,Stoke,Tottenham,1,1.0,0.0,6,12.0,13.0,3,4.0,...,4.0,2,2.0,2.0,23,13.0,14.0,15,6.0,7.0
365,Fulham,Liverpool,1,2.0,0.0,10,18.0,7.0,8,13.0,...,5.0,3,6.0,0.0,20,11.0,26.0,14,10.0,15.0
366,Everton,West Ham,2,1.0,2.0,23,17.0,14.0,13,10.0,...,11.0,0,1.0,1.0,7,9.0,6.0,2,6.0,4.0
367,Man United,Swansea,2,0.0,3.0,16,10.0,10.0,9,4.0,...,14.0,1,3.0,0.0,5,10.0,11.0,2,6.0,8.0
368,Arsenal,Wigan,4,1.0,0.0,19,19.0,10.0,12,13.0,...,11.0,1,3.0,0.0,10,13.0,18.0,5,8.0,10.0
369,Reading,Man City,0,0.0,0.0,11,13.0,7.0,7,7.0,...,8.0,2,0.0,1.0,27,12.0,11.0,20,8.0,7.0


In [10]:
# Full list of the columns kept by LastNGames
cut_df.columns.tolist()

['h',
 'a',
 'h_h_goals_0',
 'h_h_goals_-1',
 'h_h_goals_-2',
 'h_h_shots_0',
 'h_h_shots_-1',
 'h_h_shots_-2',
 'h_h_sot_0',
 'h_h_sot_-1',
 'h_h_sot_-2',
 'h_a_goals_0',
 'h_a_goals_-1',
 'h_a_goals_-2',
 'h_a_shots_0',
 'h_a_shots_-1',
 'h_a_shots_-2',
 'h_a_sot_0',
 'h_a_sot_-1',
 'h_a_sot_-2',
 'a_h_goals_0',
 'a_h_goals_-1',
 'a_h_goals_-2',
 'a_h_shots_0',
 'a_h_shots_-1',
 'a_h_shots_-2',
 'a_h_sot_0',
 'a_h_sot_-1',
 'a_h_sot_-2',
 'a_a_goals_0',
 'a_a_goals_-1',
 'a_a_goals_-2',
 'a_a_shots_0',
 'a_a_shots_-1',
 'a_a_shots_-2',
 'a_a_sot_0',
 'a_a_sot_-1',
 'a_a_sot_-2']

### Drop 0 columns

In [11]:
# Fit and apply the Drop0Columns transformer; the output below no longer
# contains the current-game (`_0`) columns.
# NOTE(review): the output below lists `a_a_sot_-9` followed directly by
# `a_a_sot_-11`, so `a_a_sot_-10` appears to have been dropped as well —
# verify how Drop0Columns matches column suffixes.
drop_0_cols = Drop0Columns()
drop_0_cols.fit(records_df)
cut_df = drop_0_cols.transform(records_df)
cut_df.head(n=20)

Unnamed: 0,h,h_h_goals_-1,h_h_goals_-2,h_h_goals_-3,h_h_goals_-4,h_h_goals_-5,h_h_goals_-6,h_h_goals_-7,h_h_goals_-8,h_h_goals_-9,...,a_a_sot_-8,a_a_sot_-9,a_a_sot_-11,a_a_sot_-12,a_a_sot_-13,a_a_sot_-14,a_a_sot_-15,a_a_sot_-16,a_a_sot_-17,a_a_sot_-18
0,Arsenal,,,,,,,,,,...,,,,,,,,,,
1,Fulham,,,,,,,,,,...,,,,,,,,,,
2,Newcastle,,,,,,,,,,...,,,,,,,,,,
3,QPR,,,,,,,,,,...,,,,,,,,,,
4,Reading,,,,,,,,,,...,,,,,,,,,,
5,West Brom,,,,,,,,,,...,,,,,,,,,,
6,West Ham,,,,,,,,,,...,,,,,,,,,,
7,Man City,,,,,,,,,,...,,,,,,,,,,
8,Wigan,,,,,,,,,,...,,,,,,,,,,
9,Everton,,,,,,,,,,...,,,,,,,,,,


In [3]:
# Scratch check that the string form of 0 counts as a digit string.
# NOTE(review): presumably left over from debugging column-suffix detection —
# confirm whether this cell is still needed.
t = '0'
t.isdigit()

True