In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import log_loss
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn import ensemble, metrics, model_selection as ms, preprocessing, tree
from sklearn import linear_model as lm
import statsmodels.api as sm
import statsmodels.formula.api as smf
pd.set_option('display.max_columns', 20)
from sklearn.pipeline import Pipeline
from os import listdir
from os.path import isfile, join

In [2]:
data_folder = "C:/Users/Byron/Documents/Football Predictions/UnderstatPredictions/Datasets"

In [3]:
#read in files
results_df = pd.read_csv(data_folder+'/UnderstatResultsAll.csv') #contains 2014 - 2018 team stats

In [4]:
results_df.head()

Unnamed: 0,league,season,matchid,matchdate,fixture,hometeam,awayteam,homegoals,awaygoals,hxG,axG,season_id,league_teams_count,season_gw,HomePoints,AwayPoints
0,Bundesliga,2014/15,5447,2014-08-22 19:30:00,Bayern Munich vs Wolfsburg,Bayern Munich,Wolfsburg,2.0,1.0,2.57012,1.19842,1,18,1,3,0
1,Bundesliga,2014/15,5448,2014-08-23 14:30:00,Hoffenheim vs Augsburg,Hoffenheim,Augsburg,2.0,0.0,1.52873,0.280777,2,18,1,3,0
2,Bundesliga,2014/15,5449,2014-08-23 14:30:00,Hannover 96 vs Schalke 04,Hannover 96,Schalke 04,2.0,1.0,1.17979,0.956665,3,18,1,3,0
3,Bundesliga,2014/15,5450,2014-08-23 14:30:00,Hertha Berlin vs Werder Bremen,Hertha Berlin,Werder Bremen,2.0,2.0,1.75585,1.19453,4,18,1,1,1
4,Bundesliga,2014/15,5451,2014-08-23 14:30:00,Eintracht Frankfurt vs Freiburg,Eintracht Frankfurt,Freiburg,1.0,0.0,1.75331,1.38084,5,18,1,3,0


## Restructure Fixtures Data into Teams Data
Separate home and away values for each respective team

In [5]:
def restructure_home(main_df, hometeam,awayteam,homegoals,awaygoals,xg_home, xg_away,points):
    new_df = main_df.copy()
    new_df = new_df.rename(columns={hometeam: "Team"
                                   ,awayteam: "Opponent"
                                   ,homegoals: "GoalsFor"
                                   ,awaygoals: "GoalsAgainst"
                                   ,xg_home: "xG"
                                   ,xg_away: "xGA"
                                   ,points: "Points"})
    new_df['HomeGame']=1
    return new_df

def restructure_away(main_df, hometeam,awayteam,homegoals,awaygoals,xg_home, xg_away,points):
    new_df = main_df.copy()
    new_df = new_df.rename(columns={awayteam: "Team"
                                   ,hometeam: "Opponent"
                                   ,awaygoals: "GoalsFor"
                                   ,homegoals: "GoalsAgainst"
                                   ,xg_away: "xG"
                                   ,xg_home: "xGA"
                                   ,points: "Points"})
    new_df['HomeGame']=0
    return new_df



In [110]:
#create teams DF
home_df = restructure_home(results_df,"hometeam","awayteam","homegoals","awaygoals","hxG","axG","HomePoints")
away_df = restructure_away(results_df,"hometeam","awayteam","homegoals","awaygoals","hxG","axG","HomePoints")

column_order = list(home_df.columns)
away_df = away_df[column_order]
teams_df = home_df.append(away_df)
teams_df.head()

Unnamed: 0,league,season,matchid,matchdate,fixture,Team,Opponent,GoalsFor,GoalsAgainst,xG,xGA,season_id,league_teams_count,season_gw,Points,AwayPoints,HomeGame
0,Bundesliga,2014/15,5447,2014-08-22 19:30:00,Bayern Munich vs Wolfsburg,Bayern Munich,Wolfsburg,2.0,1.0,2.57012,1.19842,1,18,1,3,0,1
1,Bundesliga,2014/15,5448,2014-08-23 14:30:00,Hoffenheim vs Augsburg,Hoffenheim,Augsburg,2.0,0.0,1.52873,0.280777,2,18,1,3,0,1
2,Bundesliga,2014/15,5449,2014-08-23 14:30:00,Hannover 96 vs Schalke 04,Hannover 96,Schalke 04,2.0,1.0,1.17979,0.956665,3,18,1,3,0,1
3,Bundesliga,2014/15,5450,2014-08-23 14:30:00,Hertha Berlin vs Werder Bremen,Hertha Berlin,Werder Bremen,2.0,2.0,1.75585,1.19453,4,18,1,1,1,1
4,Bundesliga,2014/15,5451,2014-08-23 14:30:00,Eintracht Frankfurt vs Freiburg,Eintracht Frankfurt,Freiburg,1.0,0.0,1.75331,1.38084,5,18,1,3,0,1


## Sort Data by Season and Team in order of fixtures

In [111]:
teams_df.sort_values(by=['league','season','Team','matchdate'],inplace = True)
teams_df.reset_index(inplace = True,drop = True)

In [112]:
#crete order of fixtures for each team
teams_df['season_team_id']= teams_df.groupby(['league','season','Team']).cumcount()+1

In [113]:
teams_df.head()

Unnamed: 0,league,season,matchid,matchdate,fixture,Team,Opponent,GoalsFor,GoalsAgainst,xG,xGA,season_id,league_teams_count,season_gw,Points,AwayPoints,HomeGame,season_team_id
0,Bundesliga,2014/15,5448,2014-08-23 14:30:00,Hoffenheim vs Augsburg,Augsburg,Hoffenheim,0.0,2.0,0.280777,1.52873,2,18,1,3,0,0,1
1,Bundesliga,2014/15,5456,2014-08-29 19:30:00,Augsburg vs Borussia Dortmund,Augsburg,Borussia Dortmund,2.0,3.0,1.00422,2.30342,10,18,2,0,3,1,2
2,Bundesliga,2014/15,5418,2014-09-14 14:30:00,Eintracht Frankfurt vs Augsburg,Augsburg,Eintracht Frankfurt,1.0,0.0,1.00028,0.845902,26,18,3,0,3,0,3
3,Bundesliga,2014/15,5425,2014-09-20 14:30:00,Augsburg vs Werder Bremen,Augsburg,Werder Bremen,4.0,2.0,2.91201,2.02462,33,18,4,3,0,1,4
4,Bundesliga,2014/15,5434,2014-09-24 19:00:00,Bayer Leverkusen vs Augsburg,Augsburg,Bayer Leverkusen,0.0,1.0,0.571159,1.65235,42,18,5,3,0,0,5


In [77]:
teams_df_qa = teams_df.groupby(["league","season","season_gw"]).agg({"fixture":"nunique"
                                                      ,"Team":"nunique"}).reset_index()
teams_df_qa[teams_df_qa['league']=='La Liga']

Unnamed: 0,league,season,season_gw,fixture,Team
432,La Liga,2014/15,1,10,20
433,La Liga,2014/15,2,10,20
434,La Liga,2014/15,3,10,20
435,La Liga,2014/15,4,10,20
436,La Liga,2014/15,5,10,20
...,...,...,...,...,...
655,La Liga,2019/20,34,10,20
656,La Liga,2019/20,35,10,20
657,La Liga,2019/20,36,10,20
658,La Liga,2019/20,37,10,20


In [78]:
teams_df[(teams_df['league']=="Bundesliga") &
         (teams_df['season_gw']==1) &
        (teams_df['season']=='2014/15')]

Unnamed: 0,league,season,matchid,matchdate,fixture,Team,Opponent,GoalsFor,GoalsAgainst,xG,xGA,season_id,league_teams_count,season_gw,Points,AwayPoints,HomeGame,season_team_id
0,Bundesliga,2014/15,5448,2014-08-23 14:30:00,Hoffenheim vs Augsburg,Augsburg,Hoffenheim,0.0,2.0,0.280777,1.52873,2,18,1,3,0,0,1
34,Bundesliga,2014/15,5453,2014-08-23 17:30:00,Borussia Dortmund vs Bayer Leverkusen,Bayer Leverkusen,Borussia Dortmund,2.0,0.0,1.14915,0.482324,7,18,1,0,3,0,1
68,Bundesliga,2014/15,5447,2014-08-22 19:30:00,Bayern Munich vs Wolfsburg,Bayern Munich,Wolfsburg,2.0,1.0,2.57012,1.19842,1,18,1,3,0,1,1
102,Bundesliga,2014/15,5453,2014-08-23 17:30:00,Borussia Dortmund vs Bayer Leverkusen,Borussia Dortmund,Bayer Leverkusen,0.0,2.0,0.482324,1.14915,7,18,1,0,3,1,1
136,Bundesliga,2014/15,5455,2014-08-24 16:30:00,Borussia M.Gladbach vs VfB Stuttgart,Borussia M.Gladbach,VfB Stuttgart,1.0,1.0,2.40939,0.433199,9,18,1,1,1,1,1
170,Bundesliga,2014/15,5451,2014-08-23 14:30:00,Eintracht Frankfurt vs Freiburg,Eintracht Frankfurt,Freiburg,1.0,0.0,1.75331,1.38084,5,18,1,3,0,1,1
204,Bundesliga,2014/15,5452,2014-08-23 14:30:00,FC Cologne vs Hamburger SV,FC Cologne,Hamburger SV,0.0,0.0,0.522276,0.913337,6,18,1,1,1,1,1
238,Bundesliga,2014/15,5451,2014-08-23 14:30:00,Eintracht Frankfurt vs Freiburg,Freiburg,Eintracht Frankfurt,0.0,1.0,1.38084,1.75331,5,18,1,3,0,0,1
272,Bundesliga,2014/15,5452,2014-08-23 14:30:00,FC Cologne vs Hamburger SV,Hamburger SV,FC Cologne,0.0,0.0,0.913337,0.522276,6,18,1,1,1,0,1
306,Bundesliga,2014/15,5449,2014-08-23 14:30:00,Hannover 96 vs Schalke 04,Hannover 96,Schalke 04,2.0,1.0,1.17979,0.956665,3,18,1,3,0,1,1


## Create moving averages using different techniques
Create exponentially weighted moving averages - one variation that resets each season and one variation that doesn't. <br>
Create moving averages - one variation that resets each season and one variation that doesn't.

In [114]:
def ema_no_reset(stats_df,span,feature_list):
    '''
    Calculates an exponential moving average for each team for the time span required.
    Does not reset after each season
    '''
    new_df = stats_df.copy()
    for feature_name in feature_list:
        new_feature_name = feature_name+"_av"
        #print(feature_name)
        feature_ema = (stats_df.groupby(["Team"])[feature_name]  # Calculate the EMA
                       .transform(lambda row: row.ewm(span=span, min_periods=2)
                                  .mean()
                                  .shift(1)
                                 ))
        new_df[new_feature_name] = feature_ema
    return new_df

def ema_season_reset(stats_df,span,feature_list):
    '''
    Calculates an exponential moving average for each team for the time span required.
    reset after each season
    '''
    new_df = stats_df.copy()
    for feature_name in feature_list:
        new_feature_name = feature_name+"_av"
        #print(feature_name)
        feature_ema = (stats_df.groupby(["Team",'season'])[feature_name]  # Calculate the EMA
                       .transform(lambda row: row.ewm(span=span, min_periods=2)
                                  .mean()
                                  .shift(1)
                                 ))
        new_df[new_feature_name] = feature_ema
    return new_df

def moving_average_season_reset(stats_df,span,feature_list):
    '''
    Calculates an exponential moving average for each team for the time span required.
    Does not reset after each season
    '''
    new_df = stats_df.copy()
    for feature_name in feature_list:
        new_feature_name = feature_name+"_av"
        #print(feature_name)
        feature_ema = (stats_df.groupby(["Team",'season'])[feature_name]  # Calculate the EMA
                       .transform(lambda row: row.rolling(window=span, min_periods=2)
                                  .mean()
                                  .shift(1)
                                 ))
        new_df[new_feature_name] = feature_ema
    return new_df

def moving_average_no_reset(stats_df,span,feature_list):
    '''
    Calculates an exponential moving average for each team for the time span required.
    Does not reset after each season
    '''
    new_df = stats_df.copy()
    for feature_name in feature_list:
        new_feature_name = feature_name+"_av"
        #print(feature_name)
        feature_ema = (stats_df.groupby(["Team"])[feature_name]  # Calculate the EMA
                       .transform(lambda row: row.rolling(window=span, min_periods=2)
                                  .mean()
                                  .shift(1)
                                 ))
        new_df[new_feature_name] = feature_ema
    return new_df

## Create leaguewide moving averages

In [115]:
def season_average(stats_df,feature_list):
    new_df =pd.DataFrame()
    '''
    Calculates an exponential moving average for each team for the time span required.
    Requires a moving average funciton to have been run first which determines whether or not there was a reset
    '''
    for feature_name in feature_list:
        new_feature_name = feature_name+"_gw"
        #print(feature_name)
        feature_ema = (stats_df.groupby(["league","season","season_gw"])[feature_name]  # Calculate the EMA
                       .transform(lambda row: row.mean()
#                                   .shift(1)
                                 ))
        new_df[new_feature_name] = feature_ema
    return new_df

In [126]:
ema_features =["xG","xGA","GoalsFor","GoalsAgainst"]
xg_ema = ema_no_reset(teams_df,36,ema_features)
xg_ema['AverageMethod'] = 'EMA No reset'

#ema season reset
xg_ema_reset = ema_season_reset(teams_df,36,ema_features)
xg_ema_reset['AverageMethod'] = 'EMA Season reset'

#moving average no reset
xg_ma = moving_average_no_reset(teams_df,36,ema_features)
xg_ma['AverageMethod'] = 'Moving Average No reset'

#moving average season reset
xg_ma_reset = moving_average_season_reset(teams_df,36,ema_features)
xg_ma_reset['AverageMethod'] = 'Moving Average Season reset'

#calculate gameweek averages
gw_features = ["xG_av","xGA_av","GoalsFor_av","GoalsAgainst_av"]
season_average_no_reset = season_average(xg_ma,gw_features)#no reset
season_average_season_reset = season_average(xg_ma_reset,gw_features)



In [128]:
#append season gw columns
xg_ema = pd.concat([xg_ema,season_average_no_reset],axis = 1, sort = False)
xg_ma = pd.concat([xg_ma,season_average_no_reset],axis = 1, sort = False)

xg_ema_reset = pd.concat([xg_ema_reset,season_average_season_reset],axis = 1, sort = False)
xg_ma_reset = pd.concat([xg_ma_reset,season_average_season_reset],axis = 1, sort = False)

In [131]:
teams_weekly_summary = pd.concat([xg_ema,xg_ma,xg_ema_reset,xg_ma_reset])
teams_weekly_summary.head()

Unnamed: 0,league,season,matchid,matchdate,fixture,Team,Opponent,GoalsFor,GoalsAgainst,xG,...,season_team_id,xG_av,xGA_av,GoalsFor_av,GoalsAgainst_av,AverageMethod,xG_av_gw,xGA_av_gw,GoalsFor_av_gw,GoalsAgainst_av_gw
0,Bundesliga,2014/15,5448,2014-08-23 14:30:00,Hoffenheim vs Augsburg,Augsburg,Hoffenheim,0.0,2.0,0.280777,...,1,,,,,EMA No reset,,,,
1,Bundesliga,2014/15,5456,2014-08-29 19:30:00,Augsburg vs Borussia Dortmund,Augsburg,Borussia Dortmund,2.0,3.0,1.00422,...,2,,,,,EMA No reset,,,,
2,Bundesliga,2014/15,5418,2014-09-14 14:30:00,Eintracht Frankfurt vs Augsburg,Augsburg,Eintracht Frankfurt,1.0,0.0,1.00028,...,3,0.652546,1.926835,1.027778,2.513889,EMA No reset,1.289009,1.289009,1.25,1.25
3,Bundesliga,2014/15,5425,2014-09-20 14:30:00,Augsburg vs Werder Bremen,Augsburg,Werder Bremen,4.0,2.0,2.91201,...,4,0.774955,1.546326,1.017999,1.628953,EMA No reset,1.323501,1.323501,1.314815,1.314815
4,Bundesliga,2014/15,5434,2014-09-24 19:00:00,Bayer Leverkusen vs Augsburg,Augsburg,Bayer Leverkusen,0.0,1.0,0.571159,...,5,1.354542,1.676043,1.826742,1.729584,EMA No reset,1.292722,1.292722,1.333333,1.333333


In [132]:
teams_weekly_summary['AttStr'] = teams_weekly_summary['xG_av']/teams_weekly_summary['xG_av_gw']
teams_weekly_summary['DefStr'] = teams_weekly_summary['xGA_av']/teams_weekly_summary['xGA_av_gw']

In [134]:
#output
teams_weekly_summary.to_csv(data_folder + "/UnderstatAttDefStr.csv",index = False)

In [121]:
print(season_average_season_reset.info())
print(teams_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21912 entries, 0 to 21911
Data columns (total 4 columns):
xG_gw              20734 non-null float64
xGA_gw             20734 non-null float64
GoalsFor_gw        20734 non-null float64
GoalsAgainst_gw    20734 non-null float64
dtypes: float64(4)
memory usage: 684.9 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21912 entries, 0 to 21911
Data columns (total 18 columns):
league                21912 non-null object
season                21912 non-null object
matchid               21912 non-null int64
matchdate             21912 non-null object
fixture               21912 non-null object
Team                  21912 non-null object
Opponent              21912 non-null object
GoalsFor              20718 non-null float64
GoalsAgainst          20718 non-null float64
xG                    20718 non-null float64
xGA                   20718 non-null float64
season_id             21912 non-null int64
league_teams_count    21912 non-null i

In [133]:
teams_weekly_summary.head()

Unnamed: 0,league,season,matchid,matchdate,fixture,Team,Opponent,GoalsFor,GoalsAgainst,xG,...,xGA_av,GoalsFor_av,GoalsAgainst_av,AverageMethod,xG_av_gw,xGA_av_gw,GoalsFor_av_gw,GoalsAgainst_av_gw,AttStr,DefStr
0,Bundesliga,2014/15,5448,2014-08-23 14:30:00,Hoffenheim vs Augsburg,Augsburg,Hoffenheim,0.0,2.0,0.280777,...,,,,EMA No reset,,,,,,
1,Bundesliga,2014/15,5456,2014-08-29 19:30:00,Augsburg vs Borussia Dortmund,Augsburg,Borussia Dortmund,2.0,3.0,1.00422,...,,,,EMA No reset,,,,,,
2,Bundesliga,2014/15,5418,2014-09-14 14:30:00,Eintracht Frankfurt vs Augsburg,Augsburg,Eintracht Frankfurt,1.0,0.0,1.00028,...,1.926835,1.027778,2.513889,EMA No reset,1.289009,1.289009,1.25,1.25,0.506239,1.494818
3,Bundesliga,2014/15,5425,2014-09-20 14:30:00,Augsburg vs Werder Bremen,Augsburg,Werder Bremen,4.0,2.0,2.91201,...,1.546326,1.017999,1.628953,EMA No reset,1.323501,1.323501,1.314815,1.314815,0.585534,1.16836
4,Bundesliga,2014/15,5434,2014-09-24 19:00:00,Bayer Leverkusen vs Augsburg,Augsburg,Bayer Leverkusen,0.0,1.0,0.571159,...,1.676043,1.826742,1.729584,EMA No reset,1.292722,1.292722,1.333333,1.333333,1.047821,1.296523


In [41]:
test_ema = (xg_ma.groupby(["league","season","season_gw"])["xG_av"].transform(lambda row: row.mean()))
test_ema
#xg_ma['xG_gw_av'] = test_ema
#xg_ma[xg_ma['season_gw']==3]

0             NaN
1             NaN
2        1.289009
3        1.323501
4        1.292722
           ...   
21907    1.467025
21908    1.465242
21909    1.468056
21910    1.478207
21911    1.484021
Name: xG_av, Length: 21912, dtype: float64

In [42]:
for i in range(1,37):
    print(i+1,xg_ma[(xg_ma['league']=='Bundesliga') &
         (xg_ma['season']=='2014/15') &
         (xg_ma['season_gw']<=i)]['xG'].mean())

2 1.2168043333333334
3 1.2890093333333335
4 1.3235012592592597
5 1.2927220555555559
6 1.3034657222222221
7 1.2598887314814813
8 1.2891021587301585
9 1.2815624722222223
10 1.303429679012346
11 1.3038506833333336
12 1.2890660954545456
13 1.2934271384259264
14 1.3320607944444456
15 1.3377768924603182
16 1.3444145292592604
17 1.3400262239583343
18 1.3367526617647068
19 1.337506248148149
20 1.321819278947369
21 1.3138189627777783
22 1.3126177740740748
23 1.3108889484848498
24 1.3229000251207745
25 1.3135314939814828
26 1.3184686008888897
27 1.3129754602564108
28 1.3183919308641991
29 1.322322086111112
30 1.3205612766283532
31 1.321027741481482
32 1.323333443369176
33 1.321628255902778
34 1.3160427978114482
35 1.3120588102941175
36 1.3120588102941175
37 1.3120588102941175


In [52]:
test_features = ["xG_av"]
print(test_features)
test_df = season_average(xg_ma,36,test_features)
test_df.groupby(['league','season','season_gw']).agg({"xG_av_gw":"min"
                                                     ,"xG_av":"mean"}).reset_index().head(50)

['xG_av']


Unnamed: 0,league,season,season_gw,xG_av_gw,xG_av
0,Bundesliga,2014/15,1,,
1,Bundesliga,2014/15,2,,
2,Bundesliga,2014/15,3,1.289009,1.289009
3,Bundesliga,2014/15,4,1.323501,1.323501
4,Bundesliga,2014/15,5,1.292722,1.292722
5,Bundesliga,2014/15,6,1.303466,1.303466
6,Bundesliga,2014/15,7,1.259889,1.259889
7,Bundesliga,2014/15,8,1.289102,1.289102
8,Bundesliga,2014/15,9,1.281562,1.281562
9,Bundesliga,2014/15,10,1.30343,1.30343


In [45]:
span = 30
for feature_name in ["xG","xGA"]:
    new_feature_name = feature_name+"_av"
    #print(feature_name)
    feature_ema = (teams_df.groupby(["Team"])[feature_name]  # Calculate the EMA
                   .transform(lambda row: row.ewm(span=span, min_periods=2)
                              .mean()
                              .shift(1)
                             ))
    teams_df[new_feature_name] = feature_ema

In [46]:
#feature_ema
teams_df

Unnamed: 0,league,season,matchid,matchdate,fixture,Team,Opponent,GoalsFor,GoalsAgainst,xG,xGA,season_id,league_teams_count,season_gw,Points,AwayPoints,HomeGame,season_team_id,xG_av,xGA_av
0,Bundesliga,2014/15,5448,2014-08-23 14:30:00,Hoffenheim vs Augsburg,Augsburg,Hoffenheim,0.0,2.0,0.280777,1.528730,2,18,1,3,0,0,1,,
1,Bundesliga,2014/15,5456,2014-08-29 19:30:00,Augsburg vs Borussia Dortmund,Augsburg,Borussia Dortmund,2.0,3.0,1.004220,2.303420,10,18,1,0,3,1,2,,
2,Bundesliga,2014/15,5418,2014-09-14 14:30:00,Eintracht Frankfurt vs Augsburg,Augsburg,Eintracht Frankfurt,1.0,0.0,1.000280,0.845902,26,18,2,0,3,0,3,0.654556,1.928986
3,Bundesliga,2014/15,5425,2014-09-20 14:30:00,Augsburg vs Werder Bremen,Augsburg,Werder Bremen,4.0,2.0,2.912010,2.024620,33,18,2,3,0,1,4,0.777563,1.543631
4,Bundesliga,2014/15,5434,2014-09-24 19:00:00,Bayer Leverkusen vs Augsburg,Augsburg,Bayer Leverkusen,0.0,1.0,0.571159,1.652350,42,18,3,3,0,0,5,1.365681,1.676161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21907,Serie A,2019/20,13428,2020-04-26 15:00:00,Verona vs Atalanta,Verona,Atalanta,,,,,340,20,18,0,0,1,34,1.435084,1.399695
21908,Serie A,2019/20,13437,2020-05-03 15:00:00,Torino vs Verona,Verona,Torino,,,,,349,20,18,0,0,0,35,1.435084,1.399695
21909,Serie A,2019/20,13446,2020-05-10 15:00:00,Verona vs Lazio,Verona,Lazio,,,,,358,20,18,0,0,1,36,1.435084,1.399695
21910,Serie A,2019/20,13452,2020-05-17 15:00:00,Verona vs SPAL 2013,Verona,SPAL 2013,,,,,364,20,19,0,0,1,37,1.435084,1.399695


In [None]:
def ema_no_reset(stats, span):
    feature_cols = [col for col in ff_multi_line.columns if 'f_' in col]
    feature_cols.remove('f_Team')
    feature_cols.remove('f_HmGame')
    #ema_features = stats[['matchid', 'f_Team', 'gameweek','season','f_HmGame','fixture', 'matchdate','gw_no']].copy()
    ema_features = stats[['f_Team','season','f_HmGame','fixture','gw_no']].copy()
    
    for feature_name in set(feature_cols):
        #print(feature_name)
        feature_ema = (stats.groupby(['f_Team'])[feature_name]  # Calculate the EMA
                       .transform(lambda row: row.ewm(span=span, min_periods=2)
                                  .mean()
                                  .shift(1)
                                 ))
        ema_features[feature_name] = feature_ema
    #create some ratios
    ema_features['f_netxg'] = ema_features['f_US xG']-ema_features['f_US xG Conceded']
    ema_features['f_xGRatio'] = ema_features['f_US xG']/(ema_features['f_US xG']+ema_features['f_US xG Conceded'])
    ema_features['f_ShotsRatio'] = ema_features['f_Goal Attempts']/(ema_features['f_Goal Attempts']+ema_features['f_Shots Conceded'])
    ema_features['f_ShOnTargetRatio'] = ema_features['f_Shots On Target']/(ema_features['f_Goal Attempts'])
    return ema_features

In [None]:
# moves dataset from multiline to single line. Requires Home game to determine Home and away teams,
# requires season and fixture to join on

def restructurebetfair(stats_features):
    non_features = ['season', 'gameweek', 'matchid', 'fixture', 'matchdate','gw_no']
    stats_restructured_home = (stats_features.query('f_HmGame== 1')
                               .rename(columns ={'f_Team':'Team', 'f_HmGame':'HmGame'})
                               .rename(columns={col: col + '_Home' for col in stats_features.columns if col not in non_features})
                              )
    #print(stats_restructured_home.head())
    stats_restructured_away = (stats_features.query('f_HmGame == 0')
                               .rename(columns ={'f_Team':'Team', 'f_HmGame':'HmGame'})
                               .rename(columns={col:col + '_Away' for col in stats_features.columns if col not in non_features})
                              )
    stats_restructured_all = stats_restructured_home.merge(stats_restructured_away, on=['season', 'fixture'], how='inner') 
    #stats_restructured_all = stats_restructured_all.merge(ff_single_line[['matchid','result', 'Home Win', 'Away Win', 'Draw','f_Goals_Home', 'f_Goals_Away']]
    #                                                      .rename(columns = {'f_Goals_Home':'t_Goals_Home', 'f_Goals_Away':'t_Goals_Away'}), on = 'matchid').dropna()#
    stats_restructured_all['gw_no_y'] = stats_restructured_all['gw_no_y'].astype('str')
    stats_restructured_all['gw_no_y'] = np.where(stats_restructured_all['gw_no_y'].str.len()==1, '0' + stats_restructured_all['gw_no_y'],stats_restructured_all['gw_no_y'])
    stats_restructured_all['season_gw'] = stats_restructured_all['season'].astype('str')+stats_restructured_all['gw_no_y']
    #stats_restructured_all = stats_restructred_all.rename(columns = {'f_Home_Gogal'})
    #stats_restructured_away = (stats_features.query('Home == 0').rename(columns={col: 'f_' + col + '_Home' for col in stats_features.columns if col not in non_features}))
    return stats_restructured_all