# 003.01e Features - Shift Data to Historical Records

In [1]:
import pathlib
import sys

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.mlab as mlab

%matplotlib inline

# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
#add the 'src' directory to path to import modules
PROJECT_DIR = pathlib.Path.cwd().resolve().parent
sys.path.append(str(PROJECT_DIR))

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

DATA_DIR = PROJECT_DIR / 'data'
SCOPED_DATA_DIR = DATA_DIR / '03-scoped'
PROCESSED_DATA_DIR = DATA_DIR / '04-processed'

In [2]:
league = 'english-premier-league'
load_fp = SCOPED_DATA_DIR / league / str(league + '-scoped-1.csv')
seasons = ['2009-2010']


In [3]:
df_orig = pd.read_csv(load_fp, parse_dates = ['date'], index_col=None)
df_orig = df_orig[df_orig['season'].isin(seasons)]
# Calculate Results column
conditions = [df_orig['h_ftgoals'] > df_orig['a_ftgoals'],
              df_orig['h_ftgoals'] == df_orig['a_ftgoals'],
              df_orig['h_ftgoals'] < df_orig['a_ftgoals']]
choices = ['hwin', 'draw', 'awin']
df_orig['result'] = np.select(conditions, choices, default='not-played')
df_orig.rename(columns={'odds_hwin_bbmean': 'odds_hwin',
                      'odds_draw_bbmean': 'odds_draw',
                      'odds_awin_bbmean': 'odds_awin'}, inplace=True)
keeper_cols = ['season', 'date', 'h', 'a','h_shots', 'a_shots', 'result']#,
#               'odds_hwin', 'odds_draw', 'odds_awin']
df_orig = df_orig[keeper_cols]
df_orig.head()

Unnamed: 0,season,date,h,a,h_shots,a_shots,result
0,2009-2010,2009-08-15,aston-villa,wigan-athletic,11.0,14.0,awin
1,2009-2010,2009-08-15,blackburn-rovers,manchester-city,17.0,8.0,awin
2,2009-2010,2009-08-15,bolton-wanderers,sunderland,11.0,20.0,awin
3,2009-2010,2009-08-15,chelsea,hull-city,26.0,7.0,hwin
4,2009-2010,2009-08-15,everton,arsenal,8.0,15.0,awin


In [4]:
df_bc = df_orig.copy(deep=True)
df_bc.head()

Unnamed: 0,season,date,h,a,h_shots,a_shots,result
0,2009-2010,2009-08-15,aston-villa,wigan-athletic,11.0,14.0,awin
1,2009-2010,2009-08-15,blackburn-rovers,manchester-city,17.0,8.0,awin
2,2009-2010,2009-08-15,bolton-wanderers,sunderland,11.0,20.0,awin
3,2009-2010,2009-08-15,chelsea,hull-city,26.0,7.0,hwin
4,2009-2010,2009-08-15,everton,arsenal,8.0,15.0,awin


In [5]:
teams = df_bc['h'].unique()

### Putting Code Together

In [6]:
# Form hhaa Records DataFrame

def form_hhaa(df_bc):
# Get Dataframe by team game home and away
    home_team_home_games = []
    away_team_away_games = []
    for team in teams:
        team_home_game = df_bc[df_bc['h'] == team]
        home_team_home_games.append(team_home_game)

        team_away_game = df_bc[df_bc['a'] == team]
        away_team_away_games.append(team_away_game)

    # Form historical record dataframes for each team
    lags = range(1,len(home_team_home_games[0]))

    lagged_home_team_home_games = []
    lagged_away_team_away_games = []

    feature_cols = [col for col in df_bc.columns if col[0:2] == 'h_']
    for home_team_home_game in home_team_home_games:
        lagged_df = home_team_home_game.assign(**{f'h_{col}_-{n}': home_team_home_game[col].shift(n) \
                                                  for n in lags \
                                                  for col in feature_cols})
        lagged_home_team_home_games.append(lagged_df)

    feature_cols = [col for col in df_bc.columns if col[0:2] == 'a_']
    for away_team_away_game in away_team_away_games:
        lagged_df = away_team_away_game.assign(**{f'a_{col}_-{n}': away_team_away_game[col].shift(n) \
                                                  for n in lags \
                                                  for col in feature_cols})
        lagged_away_team_away_games.append(lagged_df)

    # Join DataFrames together
    hh_games = pd.concat([*lagged_home_team_home_games], axis=0, join='inner', sort=True)
    hh_games.sort_index(inplace=True)

    aa_games = pd.concat([*lagged_away_team_away_games], axis=0, join='inner', sort=True)
    aa_games.sort_index(inplace=True)

    hhaa_games = pd.concat([hh_games, aa_games], axis=1, join='inner', sort=True)
    hhaa_games.sort_index(inplace=True)
    #Drop duplicate columns such as h, a etc
    hhaa_games = hhaa_games.loc[:,~hhaa_games.columns.duplicated()]

    # Re-arrange column sequnce to more intuitive
    for col in reversed(['season', 'date', 'h', 'a', 'h_shots', 'a_shots']):
        ser = hhaa_games[col]
        hhaa_games.drop(labels= col, axis=1,inplace = True)
        hhaa_games.insert(0, col, ser)
    return hhaa_games

hhaa_games = form_hhaa(df_bc)
hhaa_games.head(10)

Unnamed: 0,season,date,h,a,h_shots,a_shots,h_h_shots_-1,h_h_shots_-10,h_h_shots_-11,h_h_shots_-12,h_h_shots_-13,h_h_shots_-14,h_h_shots_-15,h_h_shots_-16,h_h_shots_-17,h_h_shots_-18,h_h_shots_-2,h_h_shots_-3,h_h_shots_-4,h_h_shots_-5,h_h_shots_-6,h_h_shots_-7,h_h_shots_-8,h_h_shots_-9,result,a_a_shots_-1,a_a_shots_-10,a_a_shots_-11,a_a_shots_-12,a_a_shots_-13,a_a_shots_-14,a_a_shots_-15,a_a_shots_-16,a_a_shots_-17,a_a_shots_-18,a_a_shots_-2,a_a_shots_-3,a_a_shots_-4,a_a_shots_-5,a_a_shots_-6,a_a_shots_-7,a_a_shots_-8,a_a_shots_-9
0,2009-2010,2009-08-15,aston-villa,wigan-athletic,11.0,14.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
1,2009-2010,2009-08-15,blackburn-rovers,manchester-city,17.0,8.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
2,2009-2010,2009-08-15,bolton-wanderers,sunderland,11.0,20.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
3,2009-2010,2009-08-15,chelsea,hull-city,26.0,7.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,
4,2009-2010,2009-08-15,everton,arsenal,8.0,15.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
5,2009-2010,2009-08-15,portsmouth,fulham,16.0,9.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
6,2009-2010,2009-08-15,stoke-city,burnley,12.0,9.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,
7,2009-2010,2009-08-15,wolverhampton-wanderers,west-ham-united,19.0,16.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,
8,2009-2010,2009-08-16,manchester-united,birmingham-city,26.0,6.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,
9,2009-2010,2009-08-16,tottenham-hotspur,liverpool,17.0,6.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,


### After Getting hhaa DataFrame, get ha, and ah

In [7]:
# Form hhaa Records DataFrame
# teams = hhaa_games['h'].unique()

# Get Dataframe by team game home and away
hhaa_team_games_dict = {}
#away_team_away_games = []
for team in teams:
    crit1 = hhaa_games['h'] == team
    crit2 = hhaa_games['a'] == team
    team_game = hhaa_games[crit1 | crit2]
    hhaa_team_games_dict[team] = team_game
    
hhaa_team_games_dict1 = hhaa_team_games_dict.copy()
hhaa_team_games_dict2 = hhaa_team_games_dict.copy()
#hhaa_team_games_dict[0].head(15)

In [8]:
ha_teams = []
# Get the indices for the team in question
for team, hhaa_set in hhaa_team_games_dict1.items():
    h_indices = hhaa_set[hhaa_set['h'] == team].index
    a_indices = hhaa_set[hhaa_set['a'] == team].index

# # we want to get the home team away record - designated h_a
# # This means we need to get the record based on the previos time the team played away
# # The indexes we want to add columns to are h_indices
# # print(h_indices)
# # The indexes of the times they played away are in a_indices
# # We want to form h_a, so we want to shift the a_indices data down to match the h indices
# # We want tyhe closest lower indices
    shifted_indices = [a_indices[a_indices < number].max() for number in h_indices]
# # print(shifted_indices)
# # We now have
# # team_games[0].loc[h_indices] = team_games[0].loc[shifted_indices][data]
# # We will form the data
# # We want to get the away records so we will grab anything that has got an a_a at the start
    data_cols = [col for col in hhaa_set.columns if col[0:2] == 'a_']
# # print(data_cols)
# # the column labeled a_shots is actually the previous away game, so we need to increment all the columns by 1
    numbered_cols = [col.rsplit('_',1)[0] + '_' + str(int(col.split('_')[-1])-1) for col in data_cols if not col.split('_')[-1].isalpha()]
# # print(numbered_cols)
# # Now deal with the missing -1 value
    new_minus_one = 'a_' + data_cols[0] + '_-1'
# # print(new_minus_one)
# # print(data_cols[0])
    numbered_cols.insert(0, new_minus_one)
# # print(numbered_cols)
# # Now we are getting h_a data, so we need to relabel the first a as a h
    renamed_cols = ['h' + col[1:] for col in numbered_cols]
# # print(renamed_cols)
    # ha_cols = list(hhaa_set.columns) + renamed_cols
# #print(all_cols)
# So now we can have a go at joining the data
    #hhaaha_set = hhaa_set.copy(deep=True)
    #hhaa_set = hhaa_set.reindex(renamed_cols, axis=1)
    ha_set = pd.DataFrame(index=h_indices, columns=renamed_cols)
#     print(ha_set)
    #hhaaha_set = hhaa_set.reindex(all_cols, axis=1)
#team_games[0].head()
#     print(hhaa_set)
    ha_set.loc[h_indices, renamed_cols] = hhaa_set.loc[shifted_indices][data_cols].values
    ha_teams.append(ha_set)
    #hhaa_set.head(15)
    #break
ha_teams[0].head(15)
#team_games[0].loc[shifted_indices, data_cols]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-l

Unnamed: 0,h_a_shots_-1,h_a_shots_-2,h_a_shots_-11,h_a_shots_-12,h_a_shots_-13,h_a_shots_-14,h_a_shots_-15,h_a_shots_-16,h_a_shots_-17,h_a_shots_-18,h_a_shots_-19,h_a_shots_-3,h_a_shots_-4,h_a_shots_-5,h_a_shots_-6,h_a_shots_-7,h_a_shots_-8,h_a_shots_-9,h_a_shots_-10
0,,,,,,,,,,,,,,,,,,,
33,7.0,,,,,,,,,,,,,,,,,,
47,12.0,7.0,,,,,,,,,,,,,,,,,
75,10.0,12.0,,,,,,,,,,7.0,,,,,,,
78,10.0,12.0,,,,,,,,,,7.0,,,,,,,
107,13.0,13.0,,,,,,,,,,12.0,10.0,12.0,7.0,,,,
129,10.0,13.0,,,,,,,,,,13.0,12.0,10.0,12.0,7.0,,,
139,10.0,13.0,,,,,,,,,,13.0,12.0,10.0,12.0,7.0,,,
168,11.0,8.0,,,,,,,,,,10.0,13.0,13.0,12.0,10.0,12.0,7.0,
192,7.0,11.0,,,,,,,,,,8.0,10.0,13.0,13.0,12.0,10.0,12.0,7.0


In [9]:
ha_df = pd.concat(ha_teams, axis=0).sort_index()
ha_df.tail()

Unnamed: 0,h_a_shots_-1,h_a_shots_-2,h_a_shots_-11,h_a_shots_-12,h_a_shots_-13,h_a_shots_-14,h_a_shots_-15,h_a_shots_-16,h_a_shots_-17,h_a_shots_-18,h_a_shots_-19,h_a_shots_-3,h_a_shots_-4,h_a_shots_-5,h_a_shots_-6,h_a_shots_-7,h_a_shots_-8,h_a_shots_-9,h_a_shots_-10
375,7.0,6.0,9.0,14.0,16.0,15.0,12.0,12.0,12.0,11.0,16.0,10.0,8.0,4.0,10.0,20.0,12.0,8.0,3.0
376,14.0,13.0,16.0,18.0,12.0,21.0,7.0,13.0,17.0,16.0,18.0,13.0,13.0,15.0,11.0,12.0,11.0,14.0,19.0
377,18.0,5.0,4.0,3.0,4.0,11.0,12.0,15.0,9.0,10.0,9.0,3.0,5.0,7.0,7.0,6.0,12.0,13.0,6.0
378,6.0,11.0,6.0,5.0,8.0,8.0,5.0,7.0,6.0,5.0,7.0,5.0,9.0,7.0,7.0,7.0,8.0,8.0,12.0
379,11.0,11.0,12.0,10.0,5.0,11.0,9.0,17.0,12.0,6.0,17.0,13.0,22.0,10.0,13.0,14.0,6.0,10.0,9.0


In [10]:
##!!!!!!!!!!!!!!!!!!!!!!! Change to make ah !!!!!!!!!!!!!!!!!!!!!!!
ah_teams = []
# Get the indices for the team in question
for team, hhaa_set in hhaa_team_games_dict1.items():
    h_indices = hhaa_set[hhaa_set['h'] == team].index
    a_indices = hhaa_set[hhaa_set['a'] == team].index

# # we want to get the home team away record - designated h_a
# # This means we need to get the record based on the previos time the team played away
# # The indexes we want to add columns to are h_indices
# # print(h_indices)
# # The indexes of the times they played away are in a_indices
# # We want to form h_a, so we want to shift the a_indices data down to match the h indices
# # We want tyhe closest lower indices
    shifted_indices = [h_indices[h_indices < number].max() for number in a_indices]
# # print(shifted_indices)
# # We now have
# # team_games[0].loc[h_indices] = team_games[0].loc[shifted_indices][data]
# # We will form the data
# # We want to get the away records so we will grab anything that has got an a_a at the start
    data_cols = [col for col in hhaa_set.columns if col[0:2] == 'h_']
# # print(data_cols)
# # the column labeled a_shots is actually the previous away game, so we need to increment all the columns by 1
    numbered_cols = [col.rsplit('_',1)[0] + '_' + str(int(col.split('_')[-1])-1) for col in data_cols if not col.split('_')[-1].isalpha()]
# # print(numbered_cols)
# # Now deal with the missing -1 value
    new_minus_one = 'h_' + data_cols[0] + '_-1'
# # print(new_minus_one)
# # print(data_cols[0])
    numbered_cols.insert(0, new_minus_one)
# # print(numbered_cols)
# # Now we are getting h_a data, so we need to relabel the first a as a h
    renamed_cols = ['a' + col[1:] for col in numbered_cols]
# # print(renamed_cols)
    # ha_cols = list(hhaa_set.columns) + renamed_cols
# #print(all_cols)
# So now we can have a go at joining the data
    #hhaaha_set = hhaa_set.copy(deep=True)
    #hhaa_set = hhaa_set.reindex(renamed_cols, axis=1)
    ah_set = pd.DataFrame(index=a_indices, columns=renamed_cols)
#     print(ha_set)
    #hhaaha_set = hhaa_set.reindex(all_cols, axis=1)
#team_games[0].head()
#     print(hhaa_set)
    ah_set.loc[a_indices, renamed_cols] = hhaa_set.loc[shifted_indices][data_cols].values
    ah_teams.append(ah_set)
    #hhaa_set.head(15)
    #break
ah_teams[0].head(15)
#team_games[0].loc[shifted_indices, data_cols]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-l

Unnamed: 0,a_h_shots_-1,a_h_shots_-2,a_h_shots_-11,a_h_shots_-12,a_h_shots_-13,a_h_shots_-14,a_h_shots_-15,a_h_shots_-16,a_h_shots_-17,a_h_shots_-18,a_h_shots_-19,a_h_shots_-3,a_h_shots_-4,a_h_shots_-5,a_h_shots_-6,a_h_shots_-7,a_h_shots_-8,a_h_shots_-9,a_h_shots_-10
25,11.0,,,,,,,,,,,,,,,,,,
44,9.0,11.0,,,,,,,,,,,,,,,,,
57,7.0,9.0,,,,,,,,,,11.0,,,,,,,
90,7.0,12.0,,,,,,,,,,7.0,9.0,11.0,,,,,
99,7.0,12.0,,,,,,,,,,7.0,9.0,11.0,,,,,
106,7.0,12.0,,,,,,,,,,7.0,9.0,11.0,,,,,
123,19.0,7.0,,,,,,,,,,12.0,7.0,9.0,11.0,,,,
154,13.0,12.0,,,,,,,,,,19.0,7.0,12.0,7.0,9.0,11.0,,
162,13.0,12.0,,,,,,,,,,19.0,7.0,12.0,7.0,9.0,11.0,,
184,17.0,13.0,,,,,,,,,,12.0,19.0,7.0,12.0,7.0,9.0,11.0,


In [11]:
ah_df = pd.concat(ah_teams, axis=0).sort_index()
# ah_df['h'] = hhaa_
# hhaa_games.head(10)
ah_df.head()

Unnamed: 0,a_h_shots_-1,a_h_shots_-2,a_h_shots_-11,a_h_shots_-12,a_h_shots_-13,a_h_shots_-14,a_h_shots_-15,a_h_shots_-16,a_h_shots_-17,a_h_shots_-18,a_h_shots_-19,a_h_shots_-3,a_h_shots_-4,a_h_shots_-5,a_h_shots_-6,a_h_shots_-7,a_h_shots_-8,a_h_shots_-9,a_h_shots_-10
0,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,


In [12]:
ah_df.tail()

Unnamed: 0,a_h_shots_-1,a_h_shots_-2,a_h_shots_-11,a_h_shots_-12,a_h_shots_-13,a_h_shots_-14,a_h_shots_-15,a_h_shots_-16,a_h_shots_-17,a_h_shots_-18,a_h_shots_-19,a_h_shots_-3,a_h_shots_-4,a_h_shots_-5,a_h_shots_-6,a_h_shots_-7,a_h_shots_-8,a_h_shots_-9,a_h_shots_-10
375,10.0,17.0,12.0,11.0,13.0,17.0,13.0,20.0,20.0,10.0,16.0,8.0,17.0,10.0,12.0,4.0,8.0,13.0,12.0
376,12.0,11.0,12.0,14.0,9.0,10.0,11.0,2.0,4.0,13.0,12.0,9.0,8.0,13.0,4.0,8.0,15.0,6.0,11.0
377,10.0,16.0,15.0,16.0,5.0,17.0,13.0,10.0,7.0,8.0,4.0,11.0,11.0,12.0,12.0,12.0,11.0,6.0,10.0
378,7.0,17.0,12.0,6.0,10.0,27.0,11.0,22.0,27.0,21.0,18.0,18.0,25.0,23.0,12.0,9.0,13.0,13.0,14.0
379,12.0,16.0,14.0,9.0,20.0,21.0,23.0,16.0,21.0,11.0,16.0,6.0,10.0,18.0,11.0,15.0,12.0,14.0,10.0


In [13]:
df = pd.concat([hhaa_games, ah_df, ha_df], axis=1, sort=True).sort_index()#dropna(subset=['a_h_shots_-1', 'h_a_shots_-1'])
df.head(20)

Unnamed: 0,season,date,h,a,h_shots,a_shots,h_h_shots_-1,h_h_shots_-10,h_h_shots_-11,h_h_shots_-12,h_h_shots_-13,h_h_shots_-14,h_h_shots_-15,h_h_shots_-16,h_h_shots_-17,h_h_shots_-18,h_h_shots_-2,h_h_shots_-3,h_h_shots_-4,h_h_shots_-5,h_h_shots_-6,h_h_shots_-7,h_h_shots_-8,h_h_shots_-9,result,a_a_shots_-1,a_a_shots_-10,a_a_shots_-11,a_a_shots_-12,a_a_shots_-13,a_a_shots_-14,a_a_shots_-15,a_a_shots_-16,a_a_shots_-17,a_a_shots_-18,a_a_shots_-2,a_a_shots_-3,a_a_shots_-4,a_a_shots_-5,a_a_shots_-6,a_a_shots_-7,a_a_shots_-8,a_a_shots_-9,a_h_shots_-1,a_h_shots_-2,a_h_shots_-11,a_h_shots_-12,a_h_shots_-13,a_h_shots_-14,a_h_shots_-15,a_h_shots_-16,a_h_shots_-17,a_h_shots_-18,a_h_shots_-19,a_h_shots_-3,a_h_shots_-4,a_h_shots_-5,a_h_shots_-6,a_h_shots_-7,a_h_shots_-8,a_h_shots_-9,a_h_shots_-10,h_a_shots_-1,h_a_shots_-2,h_a_shots_-11,h_a_shots_-12,h_a_shots_-13,h_a_shots_-14,h_a_shots_-15,h_a_shots_-16,h_a_shots_-17,h_a_shots_-18,h_a_shots_-19,h_a_shots_-3,h_a_shots_-4,h_a_shots_-5,h_a_shots_-6,h_a_shots_-7,h_a_shots_-8,h_a_shots_-9,h_a_shots_-10
0,2009-2010,2009-08-15,aston-villa,wigan-athletic,11.0,14.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2009-2010,2009-08-15,blackburn-rovers,manchester-city,17.0,8.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2009-2010,2009-08-15,bolton-wanderers,sunderland,11.0,20.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2009-2010,2009-08-15,chelsea,hull-city,26.0,7.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2009-2010,2009-08-15,everton,arsenal,8.0,15.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,2009-2010,2009-08-15,portsmouth,fulham,16.0,9.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,2009-2010,2009-08-15,stoke-city,burnley,12.0,9.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,2009-2010,2009-08-15,wolverhampton-wanderers,west-ham-united,19.0,16.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,2009-2010,2009-08-16,manchester-united,birmingham-city,26.0,6.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,2009-2010,2009-08-16,tottenham-hotspur,liverpool,17.0,6.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [14]:
df.tail()

Unnamed: 0,season,date,h,a,h_shots,a_shots,h_h_shots_-1,h_h_shots_-10,h_h_shots_-11,h_h_shots_-12,h_h_shots_-13,h_h_shots_-14,h_h_shots_-15,h_h_shots_-16,h_h_shots_-17,h_h_shots_-18,h_h_shots_-2,h_h_shots_-3,h_h_shots_-4,h_h_shots_-5,h_h_shots_-6,h_h_shots_-7,h_h_shots_-8,h_h_shots_-9,result,a_a_shots_-1,a_a_shots_-10,a_a_shots_-11,a_a_shots_-12,a_a_shots_-13,a_a_shots_-14,a_a_shots_-15,a_a_shots_-16,a_a_shots_-17,a_a_shots_-18,a_a_shots_-2,a_a_shots_-3,a_a_shots_-4,a_a_shots_-5,a_a_shots_-6,a_a_shots_-7,a_a_shots_-8,a_a_shots_-9,a_h_shots_-1,a_h_shots_-2,a_h_shots_-11,a_h_shots_-12,a_h_shots_-13,a_h_shots_-14,a_h_shots_-15,a_h_shots_-16,a_h_shots_-17,a_h_shots_-18,a_h_shots_-19,a_h_shots_-3,a_h_shots_-4,a_h_shots_-5,a_h_shots_-6,a_h_shots_-7,a_h_shots_-8,a_h_shots_-9,a_h_shots_-10,h_a_shots_-1,h_a_shots_-2,h_a_shots_-11,h_a_shots_-12,h_a_shots_-13,h_a_shots_-14,h_a_shots_-15,h_a_shots_-16,h_a_shots_-17,h_a_shots_-18,h_a_shots_-19,h_a_shots_-3,h_a_shots_-4,h_a_shots_-5,h_a_shots_-6,h_a_shots_-7,h_a_shots_-8,h_a_shots_-9,h_a_shots_-10
375,2009-2010,2010-05-09,west-ham-united,manchester-city,12.0,17.0,9.0,6.0,11.0,9.0,16.0,14.0,15.0,20.0,8.0,17.0,10.0,17.0,17.0,18.0,20.0,13.0,7.0,13.0,draw,3.0,9.0,14.0,9.0,14.0,5.0,9.0,10.0,12.0,8.0,20.0,14.0,13.0,8.0,9.0,17.0,8.0,12.0,10.0,17.0,12.0,11.0,13.0,17.0,13.0,20.0,20.0,10.0,16.0,8.0,17.0,10.0,12.0,4.0,8.0,13.0,12.0,7.0,6.0,9.0,14.0,16.0,15.0,12.0,12.0,12.0,11.0,16.0,10.0,8.0,4.0,10.0,20.0,12.0,8.0,3.0
376,2009-2010,2010-05-09,manchester-united,stoke-city,18.0,4.0,15.0,12.0,16.0,18.0,20.0,13.0,17.0,21.0,10.0,26.0,9.0,10.0,33.0,12.0,21.0,25.0,24.0,23.0,hwin,5.0,13.0,5.0,7.0,7.0,8.0,8.0,10.0,13.0,6.0,3.0,9.0,12.0,8.0,14.0,10.0,4.0,7.0,12.0,11.0,12.0,14.0,9.0,10.0,11.0,2.0,4.0,13.0,12.0,9.0,8.0,13.0,4.0,8.0,15.0,6.0,11.0,14.0,13.0,16.0,18.0,12.0,21.0,7.0,13.0,17.0,16.0,18.0,13.0,13.0,15.0,11.0,12.0,11.0,14.0,19.0
377,2009-2010,2010-05-09,wolverhampton-wanderers,sunderland,14.0,11.0,14.0,12.0,12.0,8.0,8.0,8.0,11.0,12.0,24.0,19.0,10.0,9.0,12.0,11.0,8.0,6.0,9.0,11.0,hwin,10.0,7.0,8.0,9.0,14.0,14.0,4.0,8.0,9.0,20.0,9.0,7.0,10.0,6.0,7.0,5.0,7.0,13.0,10.0,16.0,15.0,16.0,5.0,17.0,13.0,10.0,7.0,8.0,4.0,11.0,11.0,12.0,12.0,12.0,11.0,6.0,10.0,18.0,5.0,4.0,3.0,4.0,11.0,12.0,15.0,9.0,10.0,9.0,3.0,5.0,7.0,7.0,6.0,12.0,13.0,6.0
378,2009-2010,2010-05-09,hull-city,liverpool,11.0,19.0,14.0,9.0,8.0,12.0,16.0,7.0,11.0,16.0,12.0,9.0,14.0,11.0,7.0,7.0,8.0,7.0,8.0,12.0,draw,12.0,7.0,15.0,9.0,6.0,15.0,12.0,19.0,26.0,6.0,16.0,4.0,11.0,6.0,9.0,6.0,4.0,12.0,7.0,17.0,12.0,6.0,10.0,27.0,11.0,22.0,27.0,21.0,18.0,18.0,25.0,23.0,12.0,9.0,13.0,13.0,14.0,6.0,11.0,6.0,5.0,8.0,8.0,5.0,7.0,6.0,5.0,7.0,5.0,9.0,7.0,7.0,7.0,8.0,8.0,12.0
379,2009-2010,2010-05-09,everton,portsmouth,21.0,10.0,15.0,19.0,15.0,12.0,9.0,13.0,24.0,18.0,26.0,8.0,10.0,18.0,18.0,11.0,7.0,11.0,13.0,24.0,hwin,13.0,7.0,9.0,14.0,9.0,8.0,12.0,17.0,9.0,9.0,6.0,8.0,7.0,13.0,7.0,12.0,10.0,11.0,12.0,16.0,14.0,9.0,20.0,21.0,23.0,16.0,21.0,11.0,16.0,6.0,10.0,18.0,11.0,15.0,12.0,14.0,10.0,11.0,11.0,12.0,10.0,5.0,11.0,9.0,17.0,12.0,6.0,17.0,13.0,22.0,10.0,13.0,14.0,6.0,10.0,9.0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 380 entries, 0 to 379
Data columns (total 81 columns):
season           380 non-null object
date             380 non-null datetime64[ns]
h                380 non-null object
a                380 non-null object
h_shots          380 non-null float64
a_shots          380 non-null float64
h_h_shots_-1     360 non-null float64
h_h_shots_-10    180 non-null float64
h_h_shots_-11    160 non-null float64
h_h_shots_-12    140 non-null float64
h_h_shots_-13    120 non-null float64
h_h_shots_-14    100 non-null float64
h_h_shots_-15    80 non-null float64
h_h_shots_-16    60 non-null float64
h_h_shots_-17    40 non-null float64
h_h_shots_-18    20 non-null float64
h_h_shots_-2     340 non-null float64
h_h_shots_-3     320 non-null float64
h_h_shots_-4     300 non-null float64
h_h_shots_-5     280 non-null float64
h_h_shots_-6     260 non-null float64
h_h_shots_-7     240 non-null float64
h_h_shots_-8     220 non-null float64
h_h_shots_-9     200 

In [18]:
save_fp = PROCESSED_DATA_DIR / league / seasons[0] / str(league + '--' + seasons[0] + '.csv')
df.to_csv(save_fp, index=False)

In [19]:
df_load = pd.read_csv(save_fp, parse_dates = ['date'], index_col=None)
df_load.head()

Unnamed: 0,season,date,h,a,h_shots,a_shots,h_h_shots_-1,h_h_shots_-10,h_h_shots_-11,h_h_shots_-12,h_h_shots_-13,h_h_shots_-14,h_h_shots_-15,h_h_shots_-16,h_h_shots_-17,h_h_shots_-18,h_h_shots_-2,h_h_shots_-3,h_h_shots_-4,h_h_shots_-5,h_h_shots_-6,h_h_shots_-7,h_h_shots_-8,h_h_shots_-9,result,a_a_shots_-1,a_a_shots_-10,a_a_shots_-11,a_a_shots_-12,a_a_shots_-13,a_a_shots_-14,a_a_shots_-15,a_a_shots_-16,a_a_shots_-17,a_a_shots_-18,a_a_shots_-2,a_a_shots_-3,a_a_shots_-4,a_a_shots_-5,a_a_shots_-6,a_a_shots_-7,a_a_shots_-8,a_a_shots_-9,a_h_shots_-1,a_h_shots_-2,a_h_shots_-11,a_h_shots_-12,a_h_shots_-13,a_h_shots_-14,a_h_shots_-15,a_h_shots_-16,a_h_shots_-17,a_h_shots_-18,a_h_shots_-19,a_h_shots_-3,a_h_shots_-4,a_h_shots_-5,a_h_shots_-6,a_h_shots_-7,a_h_shots_-8,a_h_shots_-9,a_h_shots_-10,h_a_shots_-1,h_a_shots_-2,h_a_shots_-11,h_a_shots_-12,h_a_shots_-13,h_a_shots_-14,h_a_shots_-15,h_a_shots_-16,h_a_shots_-17,h_a_shots_-18,h_a_shots_-19,h_a_shots_-3,h_a_shots_-4,h_a_shots_-5,h_a_shots_-6,h_a_shots_-7,h_a_shots_-8,h_a_shots_-9,h_a_shots_-10
0,2009-2010,2009-08-15,aston-villa,wigan-athletic,11.0,14.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2009-2010,2009-08-15,blackburn-rovers,manchester-city,17.0,8.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2009-2010,2009-08-15,bolton-wanderers,sunderland,11.0,20.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2009-2010,2009-08-15,chelsea,hull-city,26.0,7.0,,,,,,,,,,,,,,,,,,,hwin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2009-2010,2009-08-15,everton,arsenal,8.0,15.0,,,,,,,,,,,,,,,,,,,awin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [20]:
df_load.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 81 columns):
season           380 non-null object
date             380 non-null datetime64[ns]
h                380 non-null object
a                380 non-null object
h_shots          380 non-null float64
a_shots          380 non-null float64
h_h_shots_-1     360 non-null float64
h_h_shots_-10    180 non-null float64
h_h_shots_-11    160 non-null float64
h_h_shots_-12    140 non-null float64
h_h_shots_-13    120 non-null float64
h_h_shots_-14    100 non-null float64
h_h_shots_-15    80 non-null float64
h_h_shots_-16    60 non-null float64
h_h_shots_-17    40 non-null float64
h_h_shots_-18    20 non-null float64
h_h_shots_-2     340 non-null float64
h_h_shots_-3     320 non-null float64
h_h_shots_-4     300 non-null float64
h_h_shots_-5     280 non-null float64
h_h_shots_-6     260 non-null float64
h_h_shots_-7     240 non-null float64
h_h_shots_-8     220 non-null float64
h_h_shots_-9     200 

## Functions to Cut Data to Defined Shape prior to PreProcessing

In [30]:
import re

def get_n_past_games(df_orig, past_games=2, dropna_rows=True):
    """
    Restrict historical records to the last n games
    If n = 2, then will get
    home team
        last 2 games played at home ie. h_h_feature-1, h_h_feature-2
        last 2 games played away ie h_a_feature-1, h_a_feature-2
    away team
        last 2 games played at home ie a_h_feature-1, a_h_feature-2
        last 2 games played away ie a_a_feature-1, a_a_feature-2
    where -1 is the most recent game prior to the current game, and -2 is the game before that
    the current games result is in result
    """
    df = df_orig.copy(deep=True)
    keeper_cols = [col for col in df.columns if not col.split('_')[-1].isalpha() \
               and int(col.split('_')[-1]) > -(past_games+1) \
               or col.isalpha() \
               or 'odds' in col]
    keeper_cols = keeper_cols + ['result']#, 'h_ftgoals', 'a_ftgoals']
    
#     # Rename the target columns to start with prefix 'res_'
#     res_cols = [col for col in df.columns if not col.split('_')[-1].isalpha() \
#                 and int(col.split('_')[-1]) == 0]
#     res_cols.append('result')
#     res_cols = [col.replace('0', 'res') for col in res_cols]
#     print(res_cols)
#     print(res_cols)
    df = df[keeper_cols]
    return df

df_load = pd.read_csv(save_fp, parse_dates = ['date'], index_col=None) 
df_cut = get_n_past_games(df_load)
# # Restrict Historical records to last 4 games per team
# df_cut = df_load.copy(deep=True)
# hist_games = 2
# keeper_cols = [col for col in df.columns if not col.split('_')[-1].isalpha() \
#                and int(col.split('_')[-1]) > -(hist_games+1) \
#                and int(col.split('_')[-1]) < 0 \
#                or col.isalpha() \
#                or 'odds' in col]
# df_cut = df_cut[keeper_cols]
df_cut.head()

Unnamed: 0,season,date,h,a,h_h_shots_-1,h_h_shots_-2,result,a_a_shots_-1,a_a_shots_-2,a_h_shots_-1,a_h_shots_-2,h_a_shots_-1,h_a_shots_-2,result.1
0,2009-2010,2009-08-15,aston-villa,wigan-athletic,,,awin,,,,,,,awin
1,2009-2010,2009-08-15,blackburn-rovers,manchester-city,,,awin,,,,,,,awin
2,2009-2010,2009-08-15,bolton-wanderers,sunderland,,,awin,,,,,,,awin
3,2009-2010,2009-08-15,chelsea,hull-city,,,hwin,,,,,,,hwin
4,2009-2010,2009-08-15,everton,arsenal,,,awin,,,,,,,awin


In [31]:
df_cut.tail()

Unnamed: 0,season,date,h,a,h_h_shots_-1,h_h_shots_-2,result,a_a_shots_-1,a_a_shots_-2,a_h_shots_-1,a_h_shots_-2,h_a_shots_-1,h_a_shots_-2,result.1
375,2009-2010,2010-05-09,west-ham-united,manchester-city,9.0,10.0,draw,3.0,20.0,10.0,17.0,7.0,6.0,draw
376,2009-2010,2010-05-09,manchester-united,stoke-city,15.0,9.0,hwin,5.0,3.0,12.0,11.0,14.0,13.0,hwin
377,2009-2010,2010-05-09,wolverhampton-wanderers,sunderland,14.0,10.0,hwin,10.0,9.0,10.0,16.0,18.0,5.0,hwin
378,2009-2010,2010-05-09,hull-city,liverpool,14.0,14.0,draw,12.0,16.0,7.0,17.0,6.0,11.0,draw
379,2009-2010,2010-05-09,everton,portsmouth,15.0,10.0,hwin,13.0,6.0,12.0,16.0,11.0,11.0,hwin


In [None]:
import re

def get_n_past_games(df_orig, past_games=2, dropna_rows=True):
    """
    Restrict historical records to the last n games
    If n = 2, then will get
    home team
        last 2 games played at home ie. h_h_feature-1, h_h_feature-2
        last 2 games played away ie h_a_feature-1, h_a_feature-2
    away team
        last 2 games played at home ie a_h_feature-1, a_h_feature-2
        last 2 games played away ie a_a_feature-1, a_a_feature-2
    where -1 is the most recent game prior to the current game, and -2 is the game before that
    the current games result is in result
    """
    df = df_orig.copy(deep=True)
    keeper_cols = [col for col in df.columns if not col.split('_')[-1].isalpha() \
               and int(col.split('_')[-1]) > -(hist_games+1) \
               and int(col.split('_')[-1]) < 0 \
               or col.isalpha() \
               or 'odds' in col]
df_cut = df_cut[keeper_cols]
    

# Restrict Historical records to last 4 games per team
df_cut = df_load.copy(deep=True)
hist_games = 2
keeper_cols = [col for col in df.columns if not col.split('_')[-1].isalpha() \
               and int(col.split('_')[-1]) > -(hist_games+1) \
               and int(col.split('_')[-1]) < 0 \
               or col.isalpha() \
               or 'odds' in col]
df_cut = df_cut[keeper_cols]
df_cut.head(10)