In [225]:
# Import libraries
import pandas as pd
import os
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
import matplotlib.pyplot as plt

DATA_PATH = 'data/'

In [237]:
def create_df(path):
	df = (pd.read_csv(path)
		.assign(Date=lambda df: pd.to_datetime(df.Date))
		.dropna(axis=0)  # Drop rows with NAs
		.sort_values('Date')
		.reset_index(drop=True)
		.assign(Id=lambda df: list(df.index + 1)))
	return df

In [238]:
df = create_df(os.path.join(DATA_PATH, 'season0708.csv'))
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,GBH,GBD,GBA,IWH,IWD,IWA,LBH,LBD,LBA,SBH,SBD,SBA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,BSH,BSD,BSA,Bb1X2,BbMxH,BbAvH,BbMxD,BbAvD,BbMxA,BbAvA,BbOU,BbMx>2.5,BbAv>2.5,BbMx<2.5,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,Id
0,E0,2007-01-12,Blackburn,Newcastle,3,1,H,0,0,D,M Atkinson,13,20,6,8,21,17,1,1,2,3,0,0,1.9,3.4,4.2,1.9,3.2,3.8,1.9,3.3,4.0,2.0,3.2,3.3,1.72,3.2,4.33,1.9,3.25,3.75,1.83,3.2,3.75,1.8,3.2,4.33,1.9,3.25,4.0,1.8,3.25,4.2,38,1.95,1.88,3.4,3.27,4.5,4.06,32,2.08,1.93,1.9,1.81,20,-0.5,1.97,1.94,2.01,1.96,1
1,E0,2007-01-12,Wigan,Man City,1,1,D,1,1,D,M Riley,5,4,4,1,13,10,2,4,2,2,1,0,3.0,3.25,2.4,3.25,3.15,2.1,3.1,3.2,2.25,2.8,3.1,2.3,3.0,3.2,2.1,3.0,3.2,2.2,2.8,3.1,2.25,3.0,3.2,2.2,3.0,3.25,2.25,2.88,3.25,2.25,37,3.25,3.01,3.25,3.17,2.4,2.28,31,2.27,2.05,1.73,1.68,19,0.25,1.83,1.8,2.16,2.08,2
2,E0,2007-01-12,Sunderland,Derby,1,0,H,0,0,D,M Halsey,20,12,9,9,12,15,5,3,1,1,0,0,1.66,3.6,5.5,1.65,3.4,5.0,1.68,3.5,5.0,1.7,3.4,4.3,1.66,3.3,4.5,1.65,3.5,4.75,1.62,3.3,5.0,1.67,3.25,5.0,1.65,3.5,5.0,1.67,3.5,4.5,39,1.76,1.66,3.7,3.47,6.0,5.16,33,2.1,1.95,1.85,1.79,21,-0.75,2.04,1.97,1.96,1.91,3
3,E0,2007-01-12,Reading,Middlesbrough,1,1,D,0,0,D,A Wiley,12,16,7,11,11,16,5,4,2,1,0,0,2.0,3.3,4.0,1.95,3.12,3.8,2.0,3.25,3.75,2.0,3.2,3.3,1.83,3.2,3.75,1.9,3.25,3.75,1.8,3.25,3.8,1.91,3.25,3.75,1.9,3.25,4.0,1.91,3.2,3.75,38,2.1,1.97,3.35,3.23,4.2,3.77,32,2.01,1.9,1.97,1.84,20,-0.5,2.08,2.03,1.91,1.85,4
4,E0,2007-01-12,Portsmouth,Everton,0,0,D,0,0,D,P Walton,20,4,6,2,14,13,9,0,1,2,0,0,2.2,3.25,3.4,2.1,3.15,3.25,2.25,3.2,3.1,2.1,3.1,3.2,2.1,3.2,3.0,2.1,3.2,3.2,2.0,3.1,3.3,2.1,3.2,3.2,2.1,3.2,3.4,2.1,3.2,3.2,38,2.31,2.17,3.3,3.21,3.5,3.21,32,2.15,2.01,1.75,1.71,20,-0.25,1.99,1.94,1.98,1.95,5


In [239]:
# Define a function which restructures our DataFrame
def create_multiline_df(old_stats_df):
    # Create a list of columns we want and their mappings to more interpretable names
    home_stats_cols = ['Date', 'HomeTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'B365H', 'B365D', 'B365A']
    
    away_stats_cols = ['Date', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'B365H', 'B365D', 'B365A']
    
    stats_cols_mapping = ['Date', 'Team', 'GoalsFor', 'GoalsAgainst', 'Result', 'ShotsFor', 'ShotsAgainst', 
                          'HomeOdds', 'DrawOdds', 'AwayOdds']
    # Create a dictionary of the old column names to new column names
    home_mapping = {old_col: new_col for old_col, new_col in zip(home_stats_cols, stats_cols_mapping)}
    away_mapping = {old_col: new_col for old_col, new_col in zip(away_stats_cols, stats_cols_mapping)}
    
    # Put each team onto an individual row
    multi_line_stats = (old_stats_df[['Id'] + home_stats_cols] # Filter for only the home team columns
                    .rename(columns=home_mapping) # Rename the columns
                    .assign(HomeGame=1) # Assign homeGame=1 so that we can use a general function later
                    .append((old_stats_df[['Id'] + away_stats_cols]) # Append the away team columns
                            .rename(columns=away_mapping) # Rename the away team columns
                            .assign(HomeGame=0), sort=True)
                    .sort_values(by='Id') # Sort the values
                    .reset_index(drop=True))
    return multi_line_stats

In [240]:
# Define a function which creates an EMA DataFrame from the stats DataFrame
def create_stats_features_ema(stats, span):
    # Create restructured dataframe so we can calculate EMA
    multiline_stats = create_multiline_df(stats)
    # Create a copy of the DataFrame
    ema_features = multiline_stats[['Date', 'Id', 'Team', 'Result', 'HomeOdds', 
                          'DrawOdds', 'AwayOdds', 'HomeGame']].copy()
    
    # Get the columns that we want to create EMA for
    feature_names = multiline_stats.drop(columns=['Date', 'Id', 'Team', 'Result', 
                                        'HomeOdds', 'DrawOdds', 
                                        'AwayOdds', 'HomeGame']).columns
    
    # Loop over the features
    for feature_name in feature_names:
        feature_ema = (multiline_stats.groupby('Team')[feature_name] # Calculate the EMA
                                                  .transform(lambda row: row.ewm(span=span, min_periods=2)
                                                             .mean()
                                                             .shift(1))) # Shift the data down 1 so we don't leak data
        ema_features[feature_name] = feature_ema # Add the new feature to the DataFrame
    return ema_features

# Then we add our weighted average to each row
df = create_stats_features_ema(df, 5)
df.tail(10)

Unnamed: 0,Date,Id,Team,Result,HomeOdds,DrawOdds,AwayOdds,HomeGame,GoalsAgainst,GoalsFor,ShotsAgainst,ShotsFor
536,2008-12-04,269,West Ham,H,2.1,3.4,3.5,0,1.05805,2.142355,9.602396,14.905801
537,2008-12-04,269,Bolton,H,2.1,3.4,3.5,1,0.875958,1.213653,7.547952,17.475556
538,2008-12-04,270,Aston Villa,A,6.0,4.0,1.57,0,1.108883,2.301887,8.79476,10.835318
539,2008-12-04,270,Derby,A,6.0,4.0,1.57,1,1.796987,2.29804,9.117437,9.448525
540,2008-12-04,271,Newcastle,D,2.2,3.3,3.4,0,0.640129,3.832199,6.757282,20.574623
541,2008-12-04,271,Portsmouth,D,2.2,3.3,3.4,1,1.255045,1.634899,9.074099,14.38914
542,2008-12-04,272,Fulham,A,1.9,3.4,4.33,0,1.095222,1.162731,9.171321,12.84519
543,2008-12-04,272,Reading,A,1.9,3.4,4.33,1,1.268756,1.653676,9.290036,11.605683
544,2008-12-04,273,Sunderland,A,2.2,3.3,3.4,1,1.046948,0.785437,11.572209,11.287799
545,2008-12-04,273,Man City,A,2.2,3.3,3.4,0,0.851621,2.708234,8.216533,14.438162


In [221]:
# def restructure_stats_features(stats_features):
#     non_features = ['HomeGame', 'Team', 'Id']

#     stats_features_restructured = (stats_features.query('HomeGame == 1')
#                                     .rename(columns={col: 'f_' + col + 'Home' for col in stats_features.columns if col not in non_features})
#                                     .rename(columns={'Team': 'HomeTeam'})
#                                     .pipe(pd.merge, (stats_features.query('HomeGame == 0')
#                                                         .rename(columns={'Team': 'AwayTeam'})
#                                                         .rename(columns={col: 'f_' + col + 'Away' for col in stats_features.columns 
#                                                                          if col not in non_features})), on=['Id'])
#                                     .pipe(pd.merge, df[['Id', 'Result']], on='Id')
#                                     .dropna())
#     return stats_features_restructured


# df = restructure_stats_features(df)
# df.head()

In [243]:
def restructure_stats_features(stats_features):
    non_features = ['HomeGame', 'Team', 'Id']

    stats_features_restructured = (stats_features.query('HomeGame == 1')
                                    .rename(columns={col: col + 'Home' for col in stats_features.columns if col not in non_features})
                                    .rename(columns={'Team': 'HomeTeam'})
                                    .pipe(pd.merge, (stats_features.query('HomeGame == 0')
                                                        .rename(columns={'Team': 'AwayTeam'})
                                                        .rename(columns={col: col + 'Away' for col in stats_features.columns 
                                                                         if col not in non_features})), on=['Id'])
                                    .pipe(pd.merge, df[['Id', 'Result']], on='Id')
                                    .dropna()
                                    .drop_duplicates())
    return stats_features_restructured

df = restructure_stats_features(df).head()

UndefinedVariableError: name 'HomeGame' is not defined

In [244]:
df.columns

Index(['DateHome', 'Id', 'HomeTeam', 'ResultHome', 'HomeOddsHome',
       'DrawOddsHome', 'AwayOddsHome', 'HomeGame_x', 'GoalsAgainstHome',
       'GoalsForHome', 'ShotsAgainstHome', 'ShotsForHome', 'DateAway',
       'AwayTeam', 'ResultAway', 'HomeOddsAway', 'DrawOddsAway',
       'AwayOddsAway', 'HomeGame_y', 'GoalsAgainstAway', 'GoalsForAway',
       'ShotsAgainstAway', 'ShotsForAway', 'Result'],
      dtype='object')

In [224]:
df.head()

Unnamed: 0,f_DateHome,Id,HomeTeam,f_ResultHome,f_HomeOddsHome,f_DrawOddsHome,f_AwayOddsHome,HomeGame_x,f_GoalsAgainstHome,f_GoalsForHome,f_ShotsAgainstHome,f_ShotsForHome,f_DateAway,AwayTeam,f_ResultAway,f_HomeOddsAway,f_DrawOddsAway,f_AwayOddsAway,HomeGame_y,f_GoalsAgainstAway,f_GoalsForAway,f_ShotsAgainstAway,f_ShotsForAway,Result
40,2007-05-12,21,Newcastle,D,6.0,3.75,1.61,1,2.8,1.8,14.0,13.6,2007-05-12,Arsenal,D,6.0,3.75,1.61,0,2.0,1.6,11.6,14.0,D
42,2007-08-12,22,Chelsea,H,1.16,6.5,21.0,1,1.2,0.4,9.4,11.6,2007-08-12,Sunderland,H,1.16,6.5,21.0,0,0.0,1.0,9.6,12.2,H
44,2007-08-12,23,Man United,H,1.11,8.5,26.0,1,0.8,2.0,11.2,15.4,2007-08-12,Derby,H,1.11,8.5,26.0,0,0.0,1.6,9.6,17.0,H
46,2007-08-12,24,Newcastle,H,1.8,3.5,4.75,1,1.947368,1.421053,10.210526,12.842105,2007-08-12,Birmingham,H,1.8,3.5,4.75,0,1.8,2.6,7.0,16.4,H
48,2007-08-12,25,Reading,H,5.5,3.6,1.66,1,1.0,2.2,14.2,10.2,2007-08-12,Liverpool,H,5.5,3.6,1.66,0,0.0,1.6,9.0,18.2,H


In [245]:
df = df[['DateHome', 'Id', 'HomeTeam', 'HomeOddsHome',
       'DrawOddsHome', 'AwayOddsHome', 'GoalsAgainstHome',
       'GoalsForHome', 'ShotsAgainstHome', 'ShotsForHome',
       'AwayTeam', 'GoalsAgainstAway', 'GoalsForAway',
       'ShotsAgainstAway', 'ShotsForAway', 'Result']]

In [246]:
df.columns

Index(['DateHome', 'Id', 'HomeTeam', 'HomeOddsHome', 'DrawOddsHome',
       'AwayOddsHome', 'GoalsAgainstHome', 'GoalsForHome', 'ShotsAgainstHome',
       'ShotsForHome', 'AwayTeam', 'GoalsAgainstAway', 'GoalsForAway',
       'ShotsAgainstAway', 'ShotsForAway', 'Result'],
      dtype='object')