# 003.01c Features - Shift Data to Historical Records

In [1]:
import pathlib
import sys

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.mlab as mlab

%matplotlib inline

# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
#add the 'src' directory to path to import modules
PROJECT_DIR = pathlib.Path.cwd().resolve().parent
sys.path.append(str(PROJECT_DIR))

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

DATA_DIR = PROJECT_DIR / 'data'
SCOPED_DATA_DIR = DATA_DIR / '03-scoped'
PROCESSED_DATA_DIR = DATA_DIR / '04-processed'

In [2]:
league = 'english-premier-league'
load_fp = SCOPED_DATA_DIR / league / str(league + '-scoped-1.csv')
seasons = ['2009-2010']


In [3]:
df_orig = pd.read_csv(load_fp, parse_dates = ['date'], index_col=None)
df_orig = df_orig[df_orig['season'].isin(seasons)]
# Calculate Results column
conditions = [df_orig['h_ftgoals'] > df_orig['a_ftgoals'],
              df_orig['h_ftgoals'] == df_orig['a_ftgoals'],
              df_orig['h_ftgoals'] < df_orig['a_ftgoals']]
choices = ['hwin', 'draw', 'awin']
df_orig['result'] = np.select(conditions, choices, default='not-played')
df_orig.rename(columns={'odds_hwin_bbmean': 'odds_hwin',
                      'odds_draw_bbmean': 'odds_draw',
                      'odds_awin_bbmean': 'odds_awin'}, inplace=True)
keeper_cols = ['season', 'date', 'h', 'a','h_shots', 'a_shots', 'result']#,
#               'odds_hwin', 'odds_draw', 'odds_awin']
df_orig = df_orig[keeper_cols]
df_orig.head()

Unnamed: 0,season,date,h,a,h_shots,a_shots,result
0,2009-2010,2009-08-15,aston-villa,wigan-athletic,11.0,14.0,awin
1,2009-2010,2009-08-15,blackburn-rovers,manchester-city,17.0,8.0,awin
2,2009-2010,2009-08-15,bolton-wanderers,sunderland,11.0,20.0,awin
3,2009-2010,2009-08-15,chelsea,hull-city,26.0,7.0,hwin
4,2009-2010,2009-08-15,everton,arsenal,8.0,15.0,awin


In [4]:
df_bc = df_orig.copy(deep=True)
df_bc.head()

Unnamed: 0,season,date,h,a,h_shots,a_shots,result
0,2009-2010,2009-08-15,aston-villa,wigan-athletic,11.0,14.0,awin
1,2009-2010,2009-08-15,blackburn-rovers,manchester-city,17.0,8.0,awin
2,2009-2010,2009-08-15,bolton-wanderers,sunderland,11.0,20.0,awin
3,2009-2010,2009-08-15,chelsea,hull-city,26.0,7.0,hwin
4,2009-2010,2009-08-15,everton,arsenal,8.0,15.0,awin


In [5]:
df_bc.tail()

Unnamed: 0,season,date,h,a,h_shots,a_shots,result
375,2009-2010,2010-05-09,west-ham-united,manchester-city,12.0,17.0,draw
376,2009-2010,2010-05-09,manchester-united,stoke-city,18.0,4.0,hwin
377,2009-2010,2010-05-09,wolverhampton-wanderers,sunderland,14.0,11.0,hwin
378,2009-2010,2010-05-09,hull-city,liverpool,11.0,19.0,draw
379,2009-2010,2010-05-09,everton,portsmouth,21.0,10.0,hwin


In [6]:
# Home Team Home Record
home_team_home_games = []
teams = df_bc['h'].unique()
for team in teams:
    team_home_game = df_bc[df_bc['h'] == team]
    home_team_home_games.append(team_home_game)

home_team_home_games[0]

Unnamed: 0,season,date,h,a,h_shots,a_shots,result
0,2009-2010,2009-08-15,aston-villa,wigan-athletic,11.0,14.0,awin
33,2009-2010,2009-08-30,aston-villa,fulham,9.0,6.0,hwin
47,2009-2010,2009-09-19,aston-villa,portsmouth,7.0,17.0,hwin
75,2009-2010,2009-10-05,aston-villa,manchester-city,12.0,9.0,draw
78,2009-2010,2009-10-17,aston-villa,chelsea,7.0,21.0,hwin
107,2009-2010,2009-11-07,aston-villa,bolton-wanderers,19.0,13.0,hwin
129,2009-2010,2009-11-28,aston-villa,tottenham-hotspur,12.0,23.0,draw
139,2009-2010,2009-12-05,aston-villa,hull-city,13.0,5.0,hwin
168,2009-2010,2009-12-19,aston-villa,stoke-city,17.0,13.0,hwin
192,2009-2010,2009-12-29,aston-villa,liverpool,11.0,12.0,awin


In [7]:
# Get Home Team Home Record
lagged_home_team_home_games = []
lags = range(1,len(home_team_home_games[0]))
feature_cols = ['h_shots']
for home_team_home_game in home_team_home_games:
    lagged_df = home_team_home_game.assign(**{f'h_{col}_-{n}': home_team_home_game[col].shift(n) \
                                              for n in lags \
                                              for col in feature_cols})
    lagged_home_team_home_games.append(lagged_df)
    
#df2 = chelsea_home_df.assign(**{f'h_{col}_-{n}': chelsea_home_df[col].shift(n) for n in lags for col in ['h_shots']})
# chelsea_home_df_1 = pd.concat([chelsea_home_df, chelsea_home_df['h_shots'].shift().add_prefix('prev_')], 0)
# chelsea_home_df_1
lagged_home_team_home_games[0]

Unnamed: 0,season,date,h,a,h_shots,a_shots,result,h_h_shots_-1,h_h_shots_-2,h_h_shots_-3,h_h_shots_-4,h_h_shots_-5,h_h_shots_-6,h_h_shots_-7,h_h_shots_-8,h_h_shots_-9,h_h_shots_-10,h_h_shots_-11,h_h_shots_-12,h_h_shots_-13,h_h_shots_-14,h_h_shots_-15,h_h_shots_-16,h_h_shots_-17,h_h_shots_-18
0,2009-2010,2009-08-15,aston-villa,wigan-athletic,11.0,14.0,awin,,,,,,,,,,,,,,,,,,
33,2009-2010,2009-08-30,aston-villa,fulham,9.0,6.0,hwin,11.0,,,,,,,,,,,,,,,,,
47,2009-2010,2009-09-19,aston-villa,portsmouth,7.0,17.0,hwin,9.0,11.0,,,,,,,,,,,,,,,,
75,2009-2010,2009-10-05,aston-villa,manchester-city,12.0,9.0,draw,7.0,9.0,11.0,,,,,,,,,,,,,,,
78,2009-2010,2009-10-17,aston-villa,chelsea,7.0,21.0,hwin,12.0,7.0,9.0,11.0,,,,,,,,,,,,,,
107,2009-2010,2009-11-07,aston-villa,bolton-wanderers,19.0,13.0,hwin,7.0,12.0,7.0,9.0,11.0,,,,,,,,,,,,,
129,2009-2010,2009-11-28,aston-villa,tottenham-hotspur,12.0,23.0,draw,19.0,7.0,12.0,7.0,9.0,11.0,,,,,,,,,,,,
139,2009-2010,2009-12-05,aston-villa,hull-city,13.0,5.0,hwin,12.0,19.0,7.0,12.0,7.0,9.0,11.0,,,,,,,,,,,
168,2009-2010,2009-12-19,aston-villa,stoke-city,17.0,13.0,hwin,13.0,12.0,19.0,7.0,12.0,7.0,9.0,11.0,,,,,,,,,,
192,2009-2010,2009-12-29,aston-villa,liverpool,11.0,12.0,awin,17.0,13.0,12.0,19.0,7.0,12.0,7.0,9.0,11.0,,,,,,,,,


In [8]:
# Away Team Away Record
away_team_away_games = []
teams = df_bc['h'].unique()
for team in teams:
    team_away_game = df_bc[df_bc['a'] == team]
    away_team_away_games.append(team_away_game)

away_team_away_games[0]

Unnamed: 0,season,date,h,a,h_shots,a_shots,result
25,2009-2010,2009-08-24,liverpool,aston-villa,21.0,7.0,awin
44,2009-2010,2009-09-13,birmingham-city,aston-villa,11.0,12.0,awin
57,2009-2010,2009-09-26,blackburn-rovers,aston-villa,15.0,10.0,hwin
90,2009-2010,2009-10-24,wolverhampton-wanderers,aston-villa,8.0,12.0,draw
99,2009-2010,2009-10-31,everton,aston-villa,9.0,13.0,draw
106,2009-2010,2009-11-04,west-ham-united,aston-villa,14.0,13.0,hwin
123,2009-2010,2009-11-21,burnley,aston-villa,11.0,10.0,draw
154,2009-2010,2009-12-12,manchester-united,aston-villa,16.0,8.0,awin
162,2009-2010,2009-12-15,sunderland,aston-villa,15.0,11.0,awin
184,2009-2010,2009-12-27,arsenal,aston-villa,19.0,7.0,hwin


In [9]:
# Get Away Team away Record
lagged_away_team_away_games = []
lags = range(1,len(away_team_away_games[0]))
feature_cols = ['a_shots']
for away_team_away_game in away_team_away_games:
    lagged_df = away_team_away_game.assign(**{f'a_{col}_-{n}': away_team_away_game[col].shift(n) \
                                              for n in lags \
                                              for col in feature_cols})
    lagged_away_team_away_games.append(lagged_df)
    
#df2 = chelsea_home_df.assign(**{f'h_{col}_-{n}': chelsea_home_df[col].shift(n) for n in lags for col in ['h_shots']})
# chelsea_home_df_1 = pd.concat([chelsea_home_df, chelsea_home_df['h_shots'].shift().add_prefix('prev_')], 0)
# chelsea_home_df_1
lagged_away_team_away_games[0]

Unnamed: 0,season,date,h,a,h_shots,a_shots,result,a_a_shots_-1,a_a_shots_-2,a_a_shots_-3,a_a_shots_-4,a_a_shots_-5,a_a_shots_-6,a_a_shots_-7,a_a_shots_-8,a_a_shots_-9,a_a_shots_-10,a_a_shots_-11,a_a_shots_-12,a_a_shots_-13,a_a_shots_-14,a_a_shots_-15,a_a_shots_-16,a_a_shots_-17,a_a_shots_-18
25,2009-2010,2009-08-24,liverpool,aston-villa,21.0,7.0,awin,,,,,,,,,,,,,,,,,,
44,2009-2010,2009-09-13,birmingham-city,aston-villa,11.0,12.0,awin,7.0,,,,,,,,,,,,,,,,,
57,2009-2010,2009-09-26,blackburn-rovers,aston-villa,15.0,10.0,hwin,12.0,7.0,,,,,,,,,,,,,,,,
90,2009-2010,2009-10-24,wolverhampton-wanderers,aston-villa,8.0,12.0,draw,10.0,12.0,7.0,,,,,,,,,,,,,,,
99,2009-2010,2009-10-31,everton,aston-villa,9.0,13.0,draw,12.0,10.0,12.0,7.0,,,,,,,,,,,,,,
106,2009-2010,2009-11-04,west-ham-united,aston-villa,14.0,13.0,hwin,13.0,12.0,10.0,12.0,7.0,,,,,,,,,,,,,
123,2009-2010,2009-11-21,burnley,aston-villa,11.0,10.0,draw,13.0,13.0,12.0,10.0,12.0,7.0,,,,,,,,,,,,
154,2009-2010,2009-12-12,manchester-united,aston-villa,16.0,8.0,awin,10.0,13.0,13.0,12.0,10.0,12.0,7.0,,,,,,,,,,,
162,2009-2010,2009-12-15,sunderland,aston-villa,15.0,11.0,awin,8.0,10.0,13.0,13.0,12.0,10.0,12.0,7.0,,,,,,,,,,
184,2009-2010,2009-12-27,arsenal,aston-villa,19.0,7.0,hwin,11.0,8.0,10.0,13.0,13.0,12.0,10.0,12.0,7.0,,,,,,,,,


In [10]:
hh_games = pd.concat([*lagged_home_team_home_games],
                     axis=0, join='inner', sort=True)
hh_games.sort_index(inplace=True)
hh_games.head(10)

Unnamed: 0,a,a_shots,date,h,h_h_shots_-1,h_h_shots_-10,h_h_shots_-11,h_h_shots_-12,h_h_shots_-13,h_h_shots_-14,h_h_shots_-15,h_h_shots_-16,h_h_shots_-17,h_h_shots_-18,h_h_shots_-2,h_h_shots_-3,h_h_shots_-4,h_h_shots_-5,h_h_shots_-6,h_h_shots_-7,h_h_shots_-8,h_h_shots_-9,h_shots,result,season
0,wigan-athletic,14.0,2009-08-15,aston-villa,,,,,,,,,,,,,,,,,,,11.0,awin,2009-2010
1,manchester-city,8.0,2009-08-15,blackburn-rovers,,,,,,,,,,,,,,,,,,,17.0,awin,2009-2010
2,sunderland,20.0,2009-08-15,bolton-wanderers,,,,,,,,,,,,,,,,,,,11.0,awin,2009-2010
3,hull-city,7.0,2009-08-15,chelsea,,,,,,,,,,,,,,,,,,,26.0,hwin,2009-2010
4,arsenal,15.0,2009-08-15,everton,,,,,,,,,,,,,,,,,,,8.0,awin,2009-2010
5,fulham,9.0,2009-08-15,portsmouth,,,,,,,,,,,,,,,,,,,16.0,awin,2009-2010
6,burnley,9.0,2009-08-15,stoke-city,,,,,,,,,,,,,,,,,,,12.0,hwin,2009-2010
7,west-ham-united,16.0,2009-08-15,wolverhampton-wanderers,,,,,,,,,,,,,,,,,,,19.0,awin,2009-2010
8,birmingham-city,6.0,2009-08-16,manchester-united,,,,,,,,,,,,,,,,,,,26.0,hwin,2009-2010
9,liverpool,6.0,2009-08-16,tottenham-hotspur,,,,,,,,,,,,,,,,,,,17.0,hwin,2009-2010


In [11]:
aa_games = pd.concat([*lagged_away_team_away_games],
                     axis=0, join='inner', sort=True)
aa_games.sort_index(inplace=True)
aa_games.tail(10)

Unnamed: 0,a,a_a_shots_-1,a_a_shots_-10,a_a_shots_-11,a_a_shots_-12,a_a_shots_-13,a_a_shots_-14,a_a_shots_-15,a_a_shots_-16,a_a_shots_-17,a_a_shots_-18,a_a_shots_-2,a_a_shots_-3,a_a_shots_-4,a_a_shots_-5,a_a_shots_-6,a_a_shots_-7,a_a_shots_-8,a_a_shots_-9,a_shots,date,h,h_shots,result,season
370,tottenham-hotspur,10.0,7.0,19.0,23.0,7.0,9.0,17.0,9.0,17.0,18.0,9.0,12.0,16.0,13.0,10.0,21.0,7.0,6.0,16.0,2010-05-09,burnley,16.0,hwin,2009-2010
371,fulham,8.0,3.0,12.0,12.0,7.0,11.0,10.0,3.0,6.0,9.0,2.0,12.0,8.0,3.0,7.0,9.0,14.0,14.0,5.0,2010-05-09,arsenal,16.0,hwin,2009-2010
372,wigan-athletic,12.0,8.0,11.0,11.0,8.0,13.0,13.0,8.0,12.0,14.0,12.0,9.0,13.0,11.0,14.0,13.0,20.0,11.0,4.0,2010-05-09,chelsea,17.0,hwin,2009-2010
373,birmingham-city,16.0,2.0,10.0,13.0,5.0,3.0,10.0,14.0,11.0,6.0,5.0,8.0,21.0,10.0,9.0,9.0,5.0,9.0,10.0,2010-05-09,bolton-wanderers,14.0,hwin,2009-2010
374,blackburn-rovers,12.0,8.0,14.0,7.0,15.0,2.0,5.0,7.0,7.0,18.0,16.0,12.0,10.0,12.0,9.0,8.0,8.0,8.0,9.0,2010-05-09,aston-villa,16.0,awin,2009-2010
375,manchester-city,3.0,9.0,14.0,9.0,14.0,5.0,9.0,10.0,12.0,8.0,20.0,14.0,13.0,8.0,9.0,17.0,8.0,12.0,17.0,2010-05-09,west-ham-united,12.0,draw,2009-2010
376,stoke-city,5.0,13.0,5.0,7.0,7.0,8.0,8.0,10.0,13.0,6.0,3.0,9.0,12.0,8.0,14.0,10.0,4.0,7.0,4.0,2010-05-09,manchester-united,18.0,hwin,2009-2010
377,sunderland,10.0,7.0,8.0,9.0,14.0,14.0,4.0,8.0,9.0,20.0,9.0,7.0,10.0,6.0,7.0,5.0,7.0,13.0,11.0,2010-05-09,wolverhampton-wanderers,14.0,hwin,2009-2010
378,liverpool,12.0,7.0,15.0,9.0,6.0,15.0,12.0,19.0,26.0,6.0,16.0,4.0,11.0,6.0,9.0,6.0,4.0,12.0,19.0,2010-05-09,hull-city,11.0,draw,2009-2010
379,portsmouth,13.0,7.0,9.0,14.0,9.0,8.0,12.0,17.0,9.0,9.0,6.0,8.0,7.0,13.0,7.0,12.0,10.0,11.0,10.0,2010-05-09,everton,21.0,hwin,2009-2010


In [12]:
len(aa_games)

380

In [13]:
hhaa_games = pd.concat([hh_games, aa_games], axis=1, join='inner', sort=True)
hhaa_games.sort_index(inplace=True)
#Drop duplicate columns such as h, a etc
hhaa_games = hhaa_games.loc[:,~hhaa_games.columns.duplicated()]
hhaa_games.head(10)

Unnamed: 0,a,a_shots,date,h,h_h_shots_-1,h_h_shots_-10,h_h_shots_-11,h_h_shots_-12,h_h_shots_-13,h_h_shots_-14,h_h_shots_-15,h_h_shots_-16,h_h_shots_-17,h_h_shots_-18,h_h_shots_-2,h_h_shots_-3,h_h_shots_-4,h_h_shots_-5,h_h_shots_-6,h_h_shots_-7,h_h_shots_-8,h_h_shots_-9,h_shots,result,season,a_a_shots_-1,a_a_shots_-10,a_a_shots_-11,a_a_shots_-12,a_a_shots_-13,a_a_shots_-14,a_a_shots_-15,a_a_shots_-16,a_a_shots_-17,a_a_shots_-18,a_a_shots_-2,a_a_shots_-3,a_a_shots_-4,a_a_shots_-5,a_a_shots_-6,a_a_shots_-7,a_a_shots_-8,a_a_shots_-9
0,wigan-athletic,14.0,2009-08-15,aston-villa,,,,,,,,,,,,,,,,,,,11.0,awin,2009-2010,,,,,,,,,,,,,,,,,,
1,manchester-city,8.0,2009-08-15,blackburn-rovers,,,,,,,,,,,,,,,,,,,17.0,awin,2009-2010,,,,,,,,,,,,,,,,,,
2,sunderland,20.0,2009-08-15,bolton-wanderers,,,,,,,,,,,,,,,,,,,11.0,awin,2009-2010,,,,,,,,,,,,,,,,,,
3,hull-city,7.0,2009-08-15,chelsea,,,,,,,,,,,,,,,,,,,26.0,hwin,2009-2010,,,,,,,,,,,,,,,,,,
4,arsenal,15.0,2009-08-15,everton,,,,,,,,,,,,,,,,,,,8.0,awin,2009-2010,,,,,,,,,,,,,,,,,,
5,fulham,9.0,2009-08-15,portsmouth,,,,,,,,,,,,,,,,,,,16.0,awin,2009-2010,,,,,,,,,,,,,,,,,,
6,burnley,9.0,2009-08-15,stoke-city,,,,,,,,,,,,,,,,,,,12.0,hwin,2009-2010,,,,,,,,,,,,,,,,,,
7,west-ham-united,16.0,2009-08-15,wolverhampton-wanderers,,,,,,,,,,,,,,,,,,,19.0,awin,2009-2010,,,,,,,,,,,,,,,,,,
8,birmingham-city,6.0,2009-08-16,manchester-united,,,,,,,,,,,,,,,,,,,26.0,hwin,2009-2010,,,,,,,,,,,,,,,,,,
9,liverpool,6.0,2009-08-16,tottenham-hotspur,,,,,,,,,,,,,,,,,,,17.0,hwin,2009-2010,,,,,,,,,,,,,,,,,,


In [14]:
hhaa_games.tail(25)

Unnamed: 0,a,a_shots,date,h,h_h_shots_-1,h_h_shots_-10,h_h_shots_-11,h_h_shots_-12,h_h_shots_-13,h_h_shots_-14,h_h_shots_-15,h_h_shots_-16,h_h_shots_-17,h_h_shots_-18,h_h_shots_-2,h_h_shots_-3,h_h_shots_-4,h_h_shots_-5,h_h_shots_-6,h_h_shots_-7,h_h_shots_-8,h_h_shots_-9,h_shots,result,season,a_a_shots_-1,a_a_shots_-10,a_a_shots_-11,a_a_shots_-12,a_a_shots_-13,a_a_shots_-14,a_a_shots_-15,a_a_shots_-16,a_a_shots_-17,a_a_shots_-18,a_a_shots_-2,a_a_shots_-3,a_a_shots_-4,a_a_shots_-5,a_a_shots_-6,a_a_shots_-7,a_a_shots_-8,a_a_shots_-9
355,liverpool,12.0,2010-04-25,burnley,14.0,10.0,11.0,11.0,12.0,15.0,10.0,8.0,8.0,,7.0,22.0,10.0,14.0,8.0,5.0,15.0,8.0,12.0,awin,2009-2010,16.0,15.0,9.0,6.0,15.0,12.0,19.0,26.0,6.0,,4.0,11.0,6.0,9.0,6.0,4.0,12.0,7.0
356,stoke-city,3.0,2010-04-25,chelsea,18.0,25.0,23.0,12.0,26.0,12.0,25.0,27.0,26.0,,13.0,22.0,23.0,9.0,25.0,23.0,18.0,21.0,29.0,hwin,2009-2010,9.0,7.0,7.0,8.0,8.0,10.0,13.0,6.0,,,12.0,8.0,14.0,10.0,4.0,7.0,13.0,5.0
357,fulham,8.0,2010-04-25,everton,10.0,15.0,12.0,9.0,13.0,24.0,18.0,26.0,8.0,,18.0,18.0,11.0,7.0,11.0,13.0,24.0,19.0,15.0,hwin,2009-2010,2.0,12.0,12.0,7.0,11.0,10.0,3.0,6.0,9.0,,12.0,8.0,3.0,7.0,9.0,14.0,14.0,3.0
358,aston-villa,10.0,2010-05-01,manchester-city,8.0,11.0,13.0,17.0,13.0,20.0,20.0,10.0,16.0,,17.0,10.0,12.0,4.0,8.0,13.0,12.0,12.0,17.0,hwin,2009-2010,11.0,11.0,8.0,10.0,13.0,13.0,12.0,10.0,12.0,7.0,12.0,13.0,7.0,18.0,10.0,10.0,9.0,7.0
359,everton,11.0,2010-05-01,stoke-city,11.0,12.0,14.0,9.0,10.0,11.0,2.0,4.0,13.0,12.0,9.0,8.0,13.0,4.0,8.0,15.0,6.0,11.0,12.0,draw,2009-2010,11.0,12.0,10.0,5.0,11.0,9.0,17.0,12.0,6.0,17.0,13.0,22.0,10.0,13.0,14.0,6.0,10.0,9.0
360,burnley,12.0,2010-05-01,birmingham-city,10.0,13.0,16.0,3.0,7.0,19.0,12.0,11.0,8.0,9.0,8.0,7.0,7.0,10.0,15.0,12.0,6.0,11.0,13.0,hwin,2009-2010,9.0,10.0,14.0,16.0,6.0,9.0,10.0,6.0,3.0,9.0,9.0,13.0,8.0,13.0,10.0,10.0,11.0,12.0
361,bolton-wanderers,11.0,2010-05-01,tottenham-hotspur,20.0,19.0,18.0,28.0,10.0,22.0,18.0,11.0,20.0,17.0,13.0,16.0,14.0,13.0,28.0,13.0,25.0,21.0,25.0,hwin,2009-2010,13.0,7.0,11.0,17.0,4.0,13.0,15.0,10.0,14.0,20.0,7.0,6.0,7.0,15.0,10.0,8.0,11.0,4.0
362,wolverhampton-wanderers,18.0,2010-05-01,portsmouth,16.0,14.0,9.0,20.0,21.0,23.0,16.0,21.0,11.0,16.0,6.0,10.0,18.0,11.0,15.0,12.0,14.0,10.0,12.0,hwin,2009-2010,5.0,4.0,3.0,4.0,11.0,12.0,15.0,9.0,10.0,9.0,3.0,5.0,7.0,7.0,6.0,12.0,13.0,6.0
363,west-ham-united,7.0,2010-05-02,fulham,11.0,16.0,12.0,8.0,10.0,17.0,18.0,14.0,4.0,,7.0,10.0,11.0,16.0,14.0,15.0,15.0,10.0,5.0,hwin,2009-2010,6.0,9.0,14.0,16.0,15.0,12.0,12.0,12.0,11.0,16.0,10.0,8.0,4.0,10.0,20.0,12.0,8.0,3.0
364,chelsea,15.0,2010-05-02,liverpool,17.0,12.0,6.0,10.0,27.0,11.0,22.0,27.0,21.0,18.0,18.0,25.0,23.0,12.0,9.0,13.0,13.0,14.0,7.0,awin,2009-2010,9.0,20.0,10.0,7.0,27.0,21.0,14.0,18.0,12.0,20.0,11.0,20.0,14.0,7.0,14.0,21.0,16.0,27.0


In [15]:
len(hhaa_games)

380

In [16]:
hhaa_games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 380 entries, 0 to 379
Data columns (total 43 columns):
a                380 non-null object
a_shots          380 non-null float64
date             380 non-null datetime64[ns]
h                380 non-null object
h_h_shots_-1     360 non-null float64
h_h_shots_-10    180 non-null float64
h_h_shots_-11    160 non-null float64
h_h_shots_-12    140 non-null float64
h_h_shots_-13    120 non-null float64
h_h_shots_-14    100 non-null float64
h_h_shots_-15    80 non-null float64
h_h_shots_-16    60 non-null float64
h_h_shots_-17    40 non-null float64
h_h_shots_-18    20 non-null float64
h_h_shots_-2     340 non-null float64
h_h_shots_-3     320 non-null float64
h_h_shots_-4     300 non-null float64
h_h_shots_-5     280 non-null float64
h_h_shots_-6     260 non-null float64
h_h_shots_-7     240 non-null float64
h_h_shots_-8     220 non-null float64
h_h_shots_-9     200 non-null float64
h_shots          380 non-null float64
result           380

### Putting Code Together

In [17]:
# Form hhaa Records DataFrame
teams = df_bc['h'].unique()

# Get Dataframe by team game home and away
home_team_home_games = []
away_team_away_games = []
for team in teams:
    team_home_game = df_bc[df_bc['h'] == team]
    home_team_home_games.append(team_home_game)
    
    team_away_game = df_bc[df_bc['a'] == team]
    away_team_away_games.append(team_away_game)

# Form historical record dataframes for each team
lags = range(1,len(home_team_home_games[0]))

lagged_home_team_home_games = []
lagged_away_team_away_games = []
feature_cols = ['h_shots']

for home_team_home_game in home_team_home_games:
    lagged_df = home_team_home_game.assign(**{f'h_{col}_-{n}': home_team_home_game[col].shift(n) \
                                              for n in lags \
                                              for col in feature_cols})
    lagged_home_team_home_games.append(lagged_df)
    
for away_team_away_game in away_team_away_games:
    lagged_df = away_team_away_game.assign(**{f'a_{col}_-{n}': away_team_away_game[col].shift(n) \
                                              for n in lags \
                                              for col in feature_cols})
    lagged_away_team_away_games.append(lagged_df)
    
# Join DatafRames together
hh_games = pd.concat([*lagged_home_team_home_games], axis=0, join='inner', sort=True)
hh_games.sort_index(inplace=True)

aa_games = pd.concat([*lagged_away_team_away_games], axis=0, join='inner', sort=True)
aa_games.sort_index(inplace=True)

hhaa_games = pd.concat([hh_games, aa_games], axis=1, join='inner', sort=True)
hhaa_games.sort_index(inplace=True)
#Drop duplicate columns such as h, a etc
hhaa_games = hhaa_games.loc[:,~hhaa_games.columns.duplicated()]

# Re-arrange column sequnce to more intuitive
for col in reversed(['season', 'date', 'h', 'a', 'h_shots', 'a_shots']):
    ser = hhaa_games[col]
    hhaa_games.drop(labels= col, axis=1,inplace = True)
    hhaa_games.insert(0, col, ser)

hhaa_games.head(10)

Unnamed: 0,season,date,h,a,h_shots,a_shots,h_h_shots_-1,h_h_shots_-10,h_h_shots_-11,h_h_shots_-12,h_h_shots_-13,h_h_shots_-14,h_h_shots_-15,h_h_shots_-16,h_h_shots_-17,h_h_shots_-18,h_h_shots_-2,h_h_shots_-3,h_h_shots_-4,h_h_shots_-5,h_h_shots_-6,h_h_shots_-7,h_h_shots_-8,h_h_shots_-9,result,a_h_shots_-1,a_h_shots_-10,a_h_shots_-11,a_h_shots_-12,a_h_shots_-13,a_h_shots_-14,a_h_shots_-15,a_h_shots_-16,a_h_shots_-17,a_h_shots_-18,a_h_shots_-2,a_h_shots_-3,a_h_shots_-4,a_h_shots_-5,a_h_shots_-6,a_h_shots_-7,a_h_shots_-8,a_h_shots_-9
0,2009-2010,2009-08-15,aston-villa,wigan-athletic,11.0,14.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
1,2009-2010,2009-08-15,blackburn-rovers,manchester-city,17.0,8.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
2,2009-2010,2009-08-15,bolton-wanderers,sunderland,11.0,20.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
3,2009-2010,2009-08-15,chelsea,hull-city,26.0,7.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,
4,2009-2010,2009-08-15,everton,arsenal,8.0,15.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
5,2009-2010,2009-08-15,portsmouth,fulham,16.0,9.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
6,2009-2010,2009-08-15,stoke-city,burnley,12.0,9.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,
7,2009-2010,2009-08-15,wolverhampton-wanderers,west-ham-united,19.0,16.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
8,2009-2010,2009-08-16,manchester-united,birmingham-city,26.0,6.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,
9,2009-2010,2009-08-16,tottenham-hotspur,liverpool,17.0,6.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,


In [18]:
hhaa_games.tail(25)

Unnamed: 0,season,date,h,a,h_shots,a_shots,h_h_shots_-1,h_h_shots_-10,h_h_shots_-11,h_h_shots_-12,h_h_shots_-13,h_h_shots_-14,h_h_shots_-15,h_h_shots_-16,h_h_shots_-17,h_h_shots_-18,h_h_shots_-2,h_h_shots_-3,h_h_shots_-4,h_h_shots_-5,h_h_shots_-6,h_h_shots_-7,h_h_shots_-8,h_h_shots_-9,result,a_h_shots_-1,a_h_shots_-10,a_h_shots_-11,a_h_shots_-12,a_h_shots_-13,a_h_shots_-14,a_h_shots_-15,a_h_shots_-16,a_h_shots_-17,a_h_shots_-18,a_h_shots_-2,a_h_shots_-3,a_h_shots_-4,a_h_shots_-5,a_h_shots_-6,a_h_shots_-7,a_h_shots_-8,a_h_shots_-9
355,2009-2010,2010-04-25,burnley,liverpool,12.0,12.0,14.0,10.0,11.0,11.0,12.0,15.0,10.0,8.0,8.0,,7.0,22.0,10.0,14.0,8.0,5.0,15.0,8.0,awin,8.0,9.0,12.0,10.0,13.0,12.0,8.0,8.0,17.0,,10.0,12.0,4.0,11.0,6.0,6.0,11.0,14.0
356,2009-2010,2010-04-25,chelsea,stoke-city,29.0,3.0,18.0,25.0,23.0,12.0,26.0,12.0,25.0,27.0,26.0,,13.0,22.0,23.0,9.0,25.0,23.0,18.0,21.0,hwin,10.0,11.0,16.0,22.0,24.0,11.0,8.0,18.0,,,17.0,10.0,15.0,16.0,6.0,12.0,17.0,12.0
357,2009-2010,2010-04-25,everton,fulham,15.0,8.0,10.0,15.0,12.0,9.0,13.0,24.0,18.0,26.0,8.0,,18.0,18.0,11.0,7.0,11.0,13.0,24.0,19.0,hwin,18.0,10.0,3.0,19.0,20.0,20.0,12.0,9.0,16.0,,7.0,33.0,12.0,16.0,13.0,11.0,11.0,18.0
358,2009-2010,2010-05-01,manchester-city,aston-villa,17.0,10.0,8.0,11.0,13.0,17.0,13.0,20.0,20.0,10.0,16.0,,17.0,10.0,12.0,4.0,8.0,13.0,12.0,12.0,hwin,14.0,15.0,16.0,11.0,14.0,9.0,8.0,15.0,11.0,21.0,16.0,14.0,13.0,10.0,13.0,28.0,15.0,19.0
359,2009-2010,2010-05-01,stoke-city,everton,12.0,11.0,11.0,12.0,14.0,9.0,10.0,11.0,2.0,4.0,13.0,12.0,9.0,8.0,13.0,4.0,8.0,15.0,6.0,11.0,draw,15.0,10.0,25.0,8.0,18.0,16.0,12.0,16.0,14.0,8.0,10.0,9.0,12.0,7.0,13.0,9.0,10.0,6.0
360,2009-2010,2010-05-01,birmingham-city,burnley,13.0,12.0,10.0,13.0,16.0,3.0,7.0,19.0,12.0,11.0,8.0,9.0,8.0,7.0,7.0,10.0,15.0,12.0,6.0,11.0,hwin,16.0,12.0,9.0,9.0,13.0,12.0,18.0,27.0,27.0,12.0,11.0,21.0,20.0,15.0,16.0,14.0,24.0,24.0
361,2009-2010,2010-05-01,tottenham-hotspur,bolton-wanderers,25.0,11.0,20.0,19.0,18.0,28.0,10.0,22.0,18.0,11.0,20.0,17.0,13.0,16.0,14.0,13.0,28.0,13.0,25.0,21.0,hwin,11.0,20.0,15.0,12.0,12.0,19.0,13.0,12.0,21.0,12.0,18.0,18.0,12.0,18.0,12.0,10.0,8.0,13.0
362,2009-2010,2010-05-01,portsmouth,wolverhampton-wanderers,12.0,18.0,16.0,14.0,9.0,20.0,21.0,23.0,16.0,21.0,11.0,16.0,6.0,10.0,18.0,11.0,15.0,12.0,14.0,10.0,hwin,11.0,12.0,18.0,23.0,10.0,13.0,10.0,13.0,16.0,18.0,23.0,17.0,11.0,22.0,16.0,15.0,8.0,14.0
363,2009-2010,2010-05-02,fulham,west-ham-united,5.0,7.0,11.0,16.0,12.0,8.0,10.0,17.0,18.0,14.0,4.0,,7.0,10.0,11.0,16.0,14.0,15.0,15.0,10.0,hwin,17.0,23.0,16.0,12.0,17.0,11.0,20.0,12.0,19.0,19.0,10.0,9.0,22.0,12.0,8.0,14.0,17.0,21.0
364,2009-2010,2010-05-02,liverpool,chelsea,7.0,15.0,17.0,12.0,6.0,10.0,27.0,11.0,22.0,27.0,21.0,18.0,18.0,25.0,23.0,12.0,9.0,13.0,13.0,14.0,awin,20.0,6.0,13.0,5.0,15.0,7.0,15.0,4.0,4.0,4.0,9.0,10.0,8.0,11.0,7.0,7.0,5.0,11.0


In [19]:
hhaa_games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 380 entries, 0 to 379
Data columns (total 43 columns):
season           380 non-null object
date             380 non-null datetime64[ns]
h                380 non-null object
a                380 non-null object
h_shots          380 non-null float64
a_shots          380 non-null float64
h_h_shots_-1     360 non-null float64
h_h_shots_-10    180 non-null float64
h_h_shots_-11    160 non-null float64
h_h_shots_-12    140 non-null float64
h_h_shots_-13    120 non-null float64
h_h_shots_-14    100 non-null float64
h_h_shots_-15    80 non-null float64
h_h_shots_-16    60 non-null float64
h_h_shots_-17    40 non-null float64
h_h_shots_-18    20 non-null float64
h_h_shots_-2     340 non-null float64
h_h_shots_-3     320 non-null float64
h_h_shots_-4     300 non-null float64
h_h_shots_-5     280 non-null float64
h_h_shots_-6     260 non-null float64
h_h_shots_-7     240 non-null float64
h_h_shots_-8     220 non-null float64
h_h_shots_-9     200 

In [20]:
stop

NameError: name 'stop' is not defined

In [None]:
# Form hhaa Records DataFrame
teams = df_bc['h'].unique()

# Get Dataframe by team game home and away
home_team_away_games = []
away_team_home_games = []
for team in teams:
    team_home_game = df_bc[df_bc['h'] == team]
    home_team_home_games.append(team_home_game)
    
    team_away_game = df_bc[df_bc['a'] == team]
    away_team_away_games.append(team_away_game)

# Form historical record dataframes for each team
lags = range(1,len(home_team_home_games[0]))

lagged_home_team_home_games = []
lagged_away_team_away_games = []
feature_cols = ['h_shots']

for home_team_home_game in home_team_home_games:
    lagged_df = home_team_home_game.assign(**{f'h_{col}_-{n}': home_team_home_game[col].shift(n) \
                                              for n in lags \
                                              for col in feature_cols})
    lagged_home_team_home_games.append(lagged_df)
    
for away_team_away_game in away_team_away_games:
    lagged_df = away_team_away_game.assign(**{f'a_{col}_-{n}': away_team_away_game[col].shift(n) \
                                              for n in lags \
                                              for col in feature_cols})
    lagged_away_team_away_games.append(lagged_df)
    
# Join DatafRames together
hh_games = pd.concat([*lagged_home_team_home_games], axis=0, join='inner', sort=True)
hh_games.sort_index(inplace=True)

aa_games = pd.concat([*lagged_away_team_away_games], axis=0, join='inner', sort=True)
aa_games.sort_index(inplace=True)

hhaa_games = pd.concat([hh_games, aa_games], axis=1, join='inner', sort=True)
hhaa_games.sort_index(inplace=True)
#Drop duplicate columns such as h, a etc
hhaa_games = hhaa_games.loc[:,~hhaa_games.columns.duplicated()]
hhaa_games.head(10)

In [None]:
df2s=[]
for n, _ in enumerate(lagged_home_team_home_games):
    df2 = pd.concat([lagged_home_team_home_games[n],
                     lagged_away_team_away_games[n]],
                     axis=1, sort=True)
        # Drop any duplicate columns - h and a get duplicated as a multiple of number of features
    df2 = df2.loc[:,~df2.columns.duplicated()]
    df2.sort_index(inplace=True)
    df2s.append(df2)
#df2s[0].head()
# df2 = df2.sort_index()
# #df= df['']
# h = df2['h']
# df2.drop(labels=['h'], axis=1,inplace = True)
# df2.insert(0, 'h', h)
df2s[0].tail()#[100:120]

In [None]:
df2s[0].index

In [None]:
df2s[1].tail()

In [None]:
df2s[1].index

In [None]:
full_df = pd.concat(df2s, axis=0, join='inner', sort=True)
full_df.sort_index(inplace=True)
full_df.tail(15)

In [None]:
df.groupby('hostname')[['period', 'Teff']].first().reset_index()

In [None]:
full_df.index

In [None]:
len(full_df.index)

In [None]:
stop

In [None]:
teams = df_bc['h'].unique()
for team in teams:
    team_away_games = df_bc[df_bc['a'] == team]
    chelsea_away_df = team_away_games
    chelsea_away_df = chelsea_away_df.iloc[0:5]
    chelsea_away_df = chelsea_away_df[['date', 'h', 'a', 'h_shots', 'a_shots']]
    break

chelsea_away_df

In [None]:
# Get Away Team Away Record
lags = range(1,len(chelsea_away_df))
df3 = chelsea_away_df.assign(**{f'a_{col}_-{n}': chelsea_away_df[col].shift(n) for n in lags for col in ['a_shots']})
# chelsea_home_df_1 = pd.concat([chelsea_home_df, chelsea_home_df['h_shots'].shift().add_prefix('prev_')], 0)
# chelsea_home_df_1
df3

In [None]:
# Get Home Team away Record

In [None]:
stop

### Shape Season to historical data

In [None]:
def get_historical_records(group, home_or_away):
    features = [col for col in group.columns if '_' in col]
    for feat in features:
        new_feat_cols = [feat + '_' + str(n) for n, feat in zip(range (0,-len(group), -1), [feat]*len(group))]
        for shift_n, new_feat_col in enumerate(new_feat_cols):
            group[new_feat_col] = group[feat]
            group[new_feat_col] = group[new_feat_col].shift(shift_n)
        group.drop(columns=[feat], inplace=True)
    return group


def get_records(df, home_or_away, loc_record):
    feature_cols = [col for col in df.columns if loc_record in col]
    feature_cols.insert(0, home_or_away)
    cut_df = df[feature_cols]
    records = cut_df.groupby(by=home_or_away, sort=True).apply(get_historical_records, home_or_away)
    return records
    
def form_historical_records(df):
    h_teams = df['h'].values
    a_teams = df['a'].values
    # Add
    dates = df['date'].values
    hwin_odds = df['odds_hwin'].values ; draw_odds = df['odds_draw'].values ; awin_odds = df['odds_awin'].values
    result = df['result'].values
    record_dfs = []
    features = []
    for home_or_away in ['h', 'a']:
        for loc_record in ['h_', 'a_']:
            record_df = get_records(df, home_or_away, loc_record)
            new_cols = {col: home_or_away + '_' + col for col in record_df.columns if col not in home_or_away}
            record_df.rename(columns=new_cols, inplace=True)
            record_dfs.append(record_df)
            features.extend([col for col in record_df.columns if loc_record in col])

    full_records = pd.concat(record_dfs, axis=1, sort=True)
    # Drop any duplicate columns - h and a get duplicated as a multiple of number of features
    full_records = full_records.loc[:,~full_records.columns.duplicated()]
    full_records['date'] = dates
    full_records['odds_hwin'] = hwin_odds
    full_records['odds_draw'] = draw_odds
    full_records['odds_awin'] = awin_odds
    full_records['result'] = result

    return full_records

In [None]:
df = df_orig.copy(deep=True)
df_hist = form_historical_records(df)
df_hist.head()

In [None]:
df_bc.head(10)

In [None]:
df_hist_bc = df_hist[crit1 | crit2]
df_hist_bc.head(10)

In [None]:
stop

In [None]:
df.tail()

In [None]:
save_fp = PROCESSED_DATA_DIR / league / get_season[0] / str(league + '--' + get_season[0] + '.csv')
df.to_csv(save_fp, index=False)

In [None]:
df_load = pd.read_csv(save_fp, parse_dates = ['date'], index_col=None)
df_load.head()

In [None]:
df_load.tail()

In [None]:
df_load.info(verbose=True, null_counts=True)

In [None]:
df_load.describe()

In [None]:
df_load.iloc[44:50]

## Functions to Cut Data to Defined Shape prior to PreProcessing

In [None]:
import re

def get_n_past_games(df_orig, past_games=2, dropna_rows=True):
    """
    Restrict historical records to the last n games
    If n = 2, then will get
    home team
        last 2 games played at home ie. h_h_feature-1, h_h_feature-2
        last 2 games played away ie h_a_feature-1, h_a_feature-2
    away team
        last 2 games played at home ie a_h_feature-1, a_h_feature-2
        last 2 games played away ie a_a_feature-1, a_a_feature-2
    where -1 is the most recent game prior to the current game, and -2 is the game before that
    the current games result is in result
    """
    df = df_orig.copy(deep=True)
    keeper_cols = [col for col in df.columns if not col.split('_')[-1].isalpha() \
               and int(col.split('_')[-1]) > -(hist_games+1) \
               or col.isalpha() \
               or 'odds' in col]
    
    # Rename the target columns to start with prefix 'res_'
    res_cols = [col for col in df.columns if not col.split('_')[-1].isalpha() \
                and int(col.split('_')[-1]) == 0]
    res_cols.append('result')
    res_cols = [col.replace('0', 'res') for col in res_cols]
    print(res_cols)
    print(res_cols)
    df = df[keeper_cols]
    return df
    
df_cut = get_n_past_games(df_load)
# # Restrict Historical records to last 4 games per team
# df_cut = df_load.copy(deep=True)
# hist_games = 2
# keeper_cols = [col for col in df.columns if not col.split('_')[-1].isalpha() \
#                and int(col.split('_')[-1]) > -(hist_games+1) \
#                and int(col.split('_')[-1]) < 0 \
#                or col.isalpha() \
#                or 'odds' in col]
# df_cut = df_cut[keeper_cols]
df_cut.head()

In [None]:
import re

def get_n_past_games(df_orig, past_games=2, dropna_rows=True):
    """
    Restrict historical records to the last n games
    If n = 2, then will get
    home team
        last 2 games played at home ie. h_h_feature-1, h_h_feature-2
        last 2 games played away ie h_a_feature-1, h_a_feature-2
    away team
        last 2 games played at home ie a_h_feature-1, a_h_feature-2
        last 2 games played away ie a_a_feature-1, a_a_feature-2
    where -1 is the most recent game prior to the current game, and -2 is the game before that
    the current games result is in result
    """
    df = df_orig.copy(deep=True)
    keeper_cols = [col for col in df.columns if not col.split('_')[-1].isalpha() \
               and int(col.split('_')[-1]) > -(hist_games+1) \
               and int(col.split('_')[-1]) < 0 \
               or col.isalpha() \
               or 'odds' in col]
df_cut = df_cut[keeper_cols]
    

# Restrict Historical records to last 4 games per team
df_cut = df_load.copy(deep=True)
hist_games = 2
keeper_cols = [col for col in df.columns if not col.split('_')[-1].isalpha() \
               and int(col.split('_')[-1]) > -(hist_games+1) \
               and int(col.split('_')[-1]) < 0 \
               or col.isalpha() \
               or 'odds' in col]
df_cut = df_cut[keeper_cols]
df_cut.head(10)