In [1]:
import pandas as pd
import numpy as np
import os
import csv
from datetime import datetime
from scrapers import *

DATA_PATH = 'data/'

# Get the data

First we need to get league data for the last 13 seasons. We will use seasons 07/08 to 17/18 for training, 18/19 will be for validation. We will use matches the matches that have been played so far in season 19/20 (~120 matches) as our test data.

The functions below scrape data from "football-data.co.uk" and save them as seperate csv files.

In [2]:
# Uncomment lines belwo to scrape data. N.B. there is an error saving season 0405,
# this is ok we only need 07/08 onwards

# scraper = LeagueScraper()
# scraper.get_league_data()
# scraper.close_driver()

# Create league stats dataset

With the data downloaded we can start to create our dataset. We will create a dataset of league stats first. 

We will use data for the last 12 seasons for training our model. This should give us enough data to make good predictions, going any further back and the data might not as relevant. We will use the current season 19/20 for test data.

Due to the nature of football data being time-series data (ie: matches occur over the course of a season) we will use season 1819 as our validation data. I found this to be better than using cross-validation.

In [3]:
df1 = pd.read_csv(os.path.join(DATA_PATH, 'season0708.csv'))
df2 = pd.read_csv(os.path.join(DATA_PATH, 'season0809.csv'))
df3 = pd.read_csv(os.path.join(DATA_PATH, 'season0910.csv'))
df4 = pd.read_csv(os.path.join(DATA_PATH, 'season1011.csv'))
df5 = pd.read_csv(os.path.join(DATA_PATH, 'season1112.csv'))
df6 = pd.read_csv(os.path.join(DATA_PATH, 'season1213.csv'))
df7 = pd.read_csv(os.path.join(DATA_PATH, 'season1314.csv'))
df8 = pd.read_csv(os.path.join(DATA_PATH, 'season1415.csv'))
df9 = pd.read_csv(os.path.join(DATA_PATH, 'season1516.csv'))
df10 = pd.read_csv(os.path.join(DATA_PATH, 'season1617.csv'))
df11 = pd.read_csv(os.path.join(DATA_PATH, 'season1718.csv'))
df12 = pd.read_csv(os.path.join(DATA_PATH, 'season1819.csv'))
df13 = pd.read_csv(os.path.join(DATA_PATH, 'season1920.csv'))

Some of the dfs have missing data on the last row. We will remove these missing rows first. We will also convert the data from string to datetime object.

In [4]:
all_dfs = (df1, df2, df3, df4, df5, df6, df7, 
           df8, df9, df10, df11, df12, df13)
for df in all_dfs:
    df.dropna(subset=['Date'], axis=0, how='all', inplace=True)

In [5]:
def parse_date(date):
    """
    Converts date from string to datetime object.
    """
    return datetime.strptime(date, '%d/%m/%y').date()

def parse_date_other(date):
    """
    Converts date when strptime layout is different
    """
    return datetime.strptime(date, '%d/%m/%Y').date()
    
df1.Date = df1.Date.apply(parse_date)
df2.Date = df2.Date.apply(parse_date)
df3.Date = df3.Date.apply(parse_date)
df4.Date = df4.Date.apply(parse_date)
df5.Date = df5.Date.apply(parse_date)
df6.Date = df6.Date.apply(parse_date)
df7.Date = df7.Date.apply(parse_date)
df8.Date = df8.Date.apply(parse_date)
df9.Date = df9.Date.apply(parse_date)
df10.Date = df10.Date.apply(parse_date)
df11.Date = df11.Date.apply(parse_date_other)
df12.Date = df12.Date.apply(parse_date_other)
df13.Date = df13.Date.apply(parse_date_other)

In [6]:
# Get only the columns we need to get stats
cols = ['Date', 'HomeTeam', 'AwayTeam', 'HS', 'AS', 
        'FTHG','FTAG', 'FTR', 'B365H', 'B365D', 'B365A', 'season']

playing_stats1 = df1[cols]
playing_stats2 = df2[cols]
playing_stats3 = df3[cols]
playing_stats4 = df4[cols]
playing_stats5 = df5[cols]
playing_stats6 = df6[cols]
playing_stats7 = df7[cols]
playing_stats8 = df8[cols]
playing_stats9 = df9[cols]
playing_stats10 = df10[cols]
playing_stats11 = df11[cols]
playing_stats12 = df12[cols]
playing_stats13 = df13[cols]

In [7]:
def get_matchweek(playing_stat):
    """
    Adds matchweek feature to dataset
    """
    j = 1
    MatchWeek = []
    for i in range(len(playing_stat)):
        MatchWeek.append(j)
        if ((i + 1)% 10) == 0:
            j += 1
    playing_stat['MW'] = MatchWeek
    return playing_stat

playing_stats1 = get_matchweek(playing_stats1)
playing_stats2 = get_matchweek(playing_stats2)
playing_stats3 = get_matchweek(playing_stats3)
playing_stats4 = get_matchweek(playing_stats4)
playing_stats5 = get_matchweek(playing_stats5)
playing_stats6 = get_matchweek(playing_stats6)
playing_stats7 = get_matchweek(playing_stats7)
playing_stats8 = get_matchweek(playing_stats8)
playing_stats9 = get_matchweek(playing_stats9)
playing_stats10 = get_matchweek(playing_stats10)
playing_stats11 = get_matchweek(playing_stats11)
playing_stats12 = get_matchweek(playing_stats12)
playing_stats13 = get_matchweek(playing_stats13)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [8]:
# Gets the goals scored agg arranged by teams and matchweek
def get_goals_scored(playing_stat):
    # Get the number of matchweeks in the season
    mw = max(playing_stat['MW'])
    
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    
    # build dict where value is a list of goals scored per match
    for i in range(len(playing_stat)):
        HTGS = playing_stat.iloc[i]['FTHG']
        ATGS = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGS)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGS)
    
    # Create a dataframe for goals scored where rows are teams and cols are matchweek.
    GoalsScored = pd.DataFrame(data=teams, index = [i for i in range(mw)]).T
    GoalsScored[0] = 0
    # Aggregate to get uptil that point
    for i in range(2, mw):
        GoalsScored[i] = GoalsScored[i] + GoalsScored[i-1]
    return GoalsScored

# Gets the goals conceded agg arranged by teams and matchweek
def get_goals_conceded(playing_stat):
    # Get the number of matchweeks in the season
    mw = max(playing_stat['MW'])
    
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    
    # build dict where value is a list of goals conceded per match
    for i in range(len(playing_stat)):
        ATGC = playing_stat.iloc[i]['FTHG']
        HTGC = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGC)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGC)
    
    # Create a dataframe for goals conceded where rows are teams and cols are matchweek.
    GoalsConceded = pd.DataFrame(data=teams, index = [i for i in range(mw)]).T
    GoalsConceded[0] = 0
    # Aggregate to get uptil that point
    for i in range(1, mw):
        GoalsConceded[i] = GoalsConceded[i] + GoalsConceded[i-1]
    return GoalsConceded

def get_goal_stats(playing_stat):
    GC = get_goals_conceded(playing_stat)
    GS = get_goals_scored(playing_stat)
   
    j = 0
    HTGS = []
    ATGS = []
    HTGC = []
    ATGC = []

    for i in range(len(playing_stat)):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTGS.append(GS.loc[ht][j])
        ATGS.append(GS.loc[at][j])
        HTGC.append(GC.loc[ht][j])
        ATGC.append(GC.loc[at][j])
        
        if ((i + 1)% 10) == 0:
            j += 1
        
    playing_stat['HTGS'] = HTGS
    playing_stat['ATGS'] = ATGS
    playing_stat['HTGC'] = HTGC
    playing_stat['ATGC'] = ATGC
    
    return playing_stat

# Apply to each dataset
playing_stats1 = get_goal_stats(playing_stats1)
playing_stats2 = get_goal_stats(playing_stats2)
playing_stats3 = get_goal_stats(playing_stats3)
playing_stats4 = get_goal_stats(playing_stats4)
playing_stats5 = get_goal_stats(playing_stats5)
playing_stats6 = get_goal_stats(playing_stats6)
playing_stats7 = get_goal_stats(playing_stats7)
playing_stats8 = get_goal_stats(playing_stats8)
playing_stats9 = get_goal_stats(playing_stats9)
playing_stats10 = get_goal_stats(playing_stats10)
playing_stats11 = get_goal_stats(playing_stats11)
playing_stats12 = get_goal_stats(playing_stats12)
playing_stats13 = get_goal_stats(playing_stats13)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

In [9]:
# Retrospective points
def get_points(result):
    if result == 'W':
        return 3
    elif result == 'D':
        return 1
    else:
        return 0
    
def get_cumulative_points(matchres, mw):
    matchres_points = matchres.applymap(get_points)
    for i in range(2, mw+1):
        matchres_points[i] = matchres_points[i] + matchres_points[i-1]
        
    matchres_points.insert(column =0, loc = 0, value = [0*i for i in range(20)])
    return matchres_points

def get_match_result(playing_stat):
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []

    # build dict where value is list of match results
    for i in range(len(playing_stat)):
        if playing_stat.iloc[i].FTR == 'H':
            teams[playing_stat.iloc[i].HomeTeam].append('W')
            teams[playing_stat.iloc[i].AwayTeam].append('L')
        elif playing_stat.iloc[i].FTR == 'A':
            teams[playing_stat.iloc[i].AwayTeam].append('W')
            teams[playing_stat.iloc[i].HomeTeam].append('L')
        else:
            teams[playing_stat.iloc[i].AwayTeam].append('D')
            teams[playing_stat.iloc[i].HomeTeam].append('D')
            
    return pd.DataFrame(data=teams, index = [i for i in range(1, max(playing_stat['MW']+1))]).T

def get_agg_points(playing_stat):
    matchres = get_match_result(playing_stat)
    cum_pts = get_cumulative_points(matchres, max(playing_stat['MW']))
    HTP = []
    ATP = []
    j = 0
    for i in range(len(playing_stat)):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTP.append(cum_pts.loc[ht][j])
        ATP.append(cum_pts.loc[at][j])

        if ((i + 1)% 10) == 0:
            j += 1
            
    playing_stat['HTP'] = HTP
    playing_stat['ATP'] = ATP
    return playing_stat

playing_stats1 = get_agg_points(playing_stats1)
playing_stats2 = get_agg_points(playing_stats2)
playing_stats3 = get_agg_points(playing_stats3)
playing_stats4 = get_agg_points(playing_stats4)
playing_stats5 = get_agg_points(playing_stats5)
playing_stats6 = get_agg_points(playing_stats6)
playing_stats7 = get_agg_points(playing_stats7)
playing_stats8 = get_agg_points(playing_stats8)
playing_stats9 = get_agg_points(playing_stats9)
playing_stats10 = get_agg_points(playing_stats10)
playing_stats11 = get_agg_points(playing_stats11)
playing_stats12 = get_agg_points(playing_stats12)
playing_stats13 = get_agg_points(playing_stats13)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
def get_form(playing_stat,num):
    form = get_match_result(playing_stat)
    form_final = form.copy()
    for i in range(num, max(playing_stat['MW'])+1):
        form_final[i] = ''
        j = 0
        while j < num:
            form_final[i] += form[i-j]
            j += 1
    return form_final

def add_form(playing_stat,num):
    form = get_form(playing_stat,num)
    h = ['M' for i in range(num * 10)]  # since form is not available for n MW (n*10)
    a = ['M' for i in range(num * 10)]
    
    j = num
    for i in range((num*10),playing_stat.shape[0]):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        
        past = form.loc[ht][j]  # get past n results
        h.append(past[num-1])   # 0 index is most recent
        
        past = form.loc[at][j]  # get past n results.
        a.append(past[num-1])   # 0 index is most recent
        
        if ((i + 1)% 10) == 0:
            j = j + 1

    playing_stat['HM' + str(num)] = h                 
    playing_stat['AM' + str(num)] = a

    
    return playing_stat


def add_form_df(playing_statistics):
    playing_statistics = add_form(playing_statistics,1)
    playing_statistics = add_form(playing_statistics,2)
    playing_statistics = add_form(playing_statistics,3)
    playing_statistics = add_form(playing_statistics,4)
    playing_statistics = add_form(playing_statistics,5)
    return playing_statistics

playing_stats1 = add_form_df(playing_stats1)
playing_stats2 = add_form_df(playing_stats2)
playing_stats3 = add_form_df(playing_stats3)
playing_stats4 = add_form_df(playing_stats4)
playing_stats5 = add_form_df(playing_stats5)
playing_stats6 = add_form_df(playing_stats6)
playing_stats7 = add_form_df(playing_stats7)
playing_stats8 = add_form_df(playing_stats8)
playing_stats9 = add_form_df(playing_stats9)
playing_stats10 = add_form_df(playing_stats10)
playing_stats11 = add_form_df(playing_stats11)
playing_stats12 = add_form_df(playing_stats12)
playing_stats13 = add_form_df(playing_stats13)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
# Rearranging columns
cols = ['Date', 'season', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTGS', 'ATGS', 
        'HTGC', 'ATGC', 'HTP', 'ATP', 'B365H', 'B365D', 
        'B365A', 'HM1', 'HM2', 'HM3', 'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5', 'MW']

playing_stats1 = playing_stats1[cols]
playing_stats2 = playing_stats2[cols]
playing_stats3 = playing_stats3[cols]
playing_stats4 = playing_stats4[cols]
playing_stats5 = playing_stats5[cols]
playing_stats6 = playing_stats6[cols]
playing_stats7 = playing_stats7[cols]
playing_stats8 = playing_stats8[cols]
playing_stats9 = playing_stats9[cols]
playing_stats10 = playing_stats10[cols]
playing_stats11 = playing_stats11[cols]
playing_stats12 = playing_stats12[cols]
playing_stats13 = playing_stats13[cols]

The function below adds previous league position as a feature. If the team has been promoted we will use 18 as a default.

To scrape this information from wikipedia uncomment the code below.

In [12]:
# scraper = LeaguePositionScraper()
# scraper.run()
# scraper.close_driver()

In [13]:
standings = pd.read_csv(DATA_PATH + 'league_standings.csv')

def get_league_pos(playing_stat, standings, year):
    HomeTeamLP = []
    AwayTeamLP = []
        
    for i in range(len(playing_stat)): 
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        # If team was promoted set to default pos of 18th
        try:
            LP = standings.loc[(standings['Team'] == ht) & (standings['Season'] == year )].Position.item()
            HomeTeamLP.append(LP)
        except ValueError as e:
            HomeTeamLP.append(18)
        
        try:
            LP = standings.loc[(standings['Team'] == at) & (standings['Season'] == year )].Position.item()
            AwayTeamLP.append(LP)
        except ValueError as e:
            AwayTeamLP.append(18)

    playing_stat['HomeTeamLP'] = HomeTeamLP
    playing_stat['AwayTeamLP'] = AwayTeamLP

    return playing_stat

playing_stats1 = get_league_pos(playing_stats1, standings, 6)
playing_stats2 = get_league_pos(playing_stats2, standings, 7)
playing_stats3 = get_league_pos(playing_stats3, standings, 8)
playing_stats4 = get_league_pos(playing_stats4, standings, 9)
playing_stats5 = get_league_pos(playing_stats5, standings, 10)
playing_stats6 = get_league_pos(playing_stats6, standings, 11)
playing_stats7 = get_league_pos(playing_stats7, standings, 12)
playing_stats8 = get_league_pos(playing_stats8, standings, 13)
playing_stats9 = get_league_pos(playing_stats9, standings, 14)
playing_stats10 = get_league_pos(playing_stats10, standings, 15)
playing_stats11 = get_league_pos(playing_stats11, standings, 16)
playing_stats12 = get_league_pos(playing_stats12, standings, 17)
playing_stats13 = get_league_pos(playing_stats13, standings, 17)

  if sys.path[0] == '':


In [14]:
# Concatenate seasons and add gameId so we can index data easier
playing_stats = (pd.concat([playing_stats1,
                          playing_stats2,
                          playing_stats3,
                          playing_stats4,
                          playing_stats5,
                          playing_stats6,
                          playing_stats7,
                          playing_stats8,
                          playing_stats9,
                          playing_stats10,
                          playing_stats11,
                          playing_stats12,
                          playing_stats13], ignore_index=True)
                         .assign(gameId=lambda df: list(df.index + 1))
                         .sort_values('gameId'))

In [15]:
# Gets the form points.
def get_form_points(string):
    total = 0
    for letter in string:
        total += get_points(letter)
    return total

playing_stats['HTFormPtsStr'] = playing_stats['HM1'] + playing_stats['HM2'] + playing_stats['HM3'] + playing_stats['HM4'] + playing_stats['HM5']
playing_stats['ATFormPtsStr'] = playing_stats['AM1'] + playing_stats['AM2'] + playing_stats['AM3'] + playing_stats['AM4'] + playing_stats['AM5']

playing_stats['HTFormPts'] = playing_stats['HTFormPtsStr'].apply(get_form_points)
playing_stats['ATFormPts'] = playing_stats['ATFormPtsStr'].apply(get_form_points)

In [16]:
# Get doal difference
playing_stats['HTGD'] = playing_stats['HTGS'] - playing_stats['HTGC']
playing_stats['ATGD'] = playing_stats['ATGS'] - playing_stats['ATGC']

# Diff in points
playing_stats['DiffPts'] = playing_stats['HTP'] - playing_stats['ATP']

# Difference in form points, last 5 games
playing_stats['DiffFormPts'] = playing_stats['HTFormPts'] - playing_stats['ATFormPts']

# Uncomment this line to create unscaled df for predictions
# this_season = playing_stats.loc[playing_stats['season'] == 1920]
# this_season.to_csv(os.path.join(DATA_PATH, 'season1920_data.csv'))

# Diff in last years league positions
playing_stats['DiffLP'] = playing_stats['HomeTeamLP'] - playing_stats['AwayTeamLP']

In [17]:
# Scale DiffPts , DiffFormPts, HTGD, ATGD by Matchweek.
cols = ['HTGD','ATGD','DiffPts','DiffFormPts','HTP','ATP']
playing_stats.MW = playing_stats.MW.astype(float)

for col in cols:
    playing_stats[col] = playing_stats[col] / playing_stats.MW

In [18]:
playing_stats = playing_stats.drop(['FTHG', 'FTAG', 'HomeTeamLP', 'AwayTeamLP'], 1)
playing_stats.to_csv(os.path.join(DATA_PATH, 'league_data.csv'))

# Create EMA dataset

We will now create a dataset of exponential moving-averages(EMA). To do this we first need to split the data so that each team is on a seperate row, instead of a match per row. This will make it easier to get the moving average. We then reassemble the data back to a match per row.

In [19]:
# Run this once to concatenate all seasons together
# df1 = pd.read_csv(os.path.join(DATA_PATH, 'season0708.csv'))
# df2 = pd.read_csv(os.path.join(DATA_PATH, 'season0809.csv'))
# df3 = pd.read_csv(os.path.join(DATA_PATH, 'season0910.csv'))
# df4 = pd.read_csv(os.path.join(DATA_PATH, 'season1011.csv'))
# df5 = pd.read_csv(os.path.join(DATA_PATH, 'season1112.csv'))
# df6 = pd.read_csv(os.path.join(DATA_PATH, 'season1213.csv'))
# df7 = pd.read_csv(os.path.join(DATA_PATH, 'season1314.csv'))
# df8 = pd.read_csv(os.path.join(DATA_PATH, 'season1415.csv'))
# df9 = pd.read_csv(os.path.join(DATA_PATH, 'season1516.csv'))
# df10 = pd.read_csv(os.path.join(DATA_PATH, 'season1617.csv'))
# df11 = pd.read_csv(os.path.join(DATA_PATH, 'season1718.csv'))
# df12 = pd.read_csv(os.path.join(DATA_PATH, 'season1819.csv'))
# df13 = pd.read_csv(os.path.join(DATA_PATH, 'season1920.csv'))

# df = pd.concat([df1, df2, df3, df4, df5, df6, df7,
#                 df8, df9, df10, df11, df12, df13],
#                ignore_index=True, sort=False)
# df.to_csv(os.path.join(DATA_PATH, 'all_seasons_joined.csv'))

In [27]:
def create_df(path):
    """
    Function to convert date to datetime and sort by gameId
    """
    df = (pd.read_csv(path, dtype={'season': str})
         .assign(Date=lambda df: pd.to_datetime(df.Date))
         .pipe(lambda df: df.dropna(thresh=len(df) - 2, axis=1))  # Drop cols with NAs
         .dropna(axis=0)  # Drop rows with NAs
         .rename(columns={'Unnamed: 0': 'gameId'})
         .sort_values('gameId')
         .reset_index(drop=True)
         )
    return df

df = create_df(os.path.join(DATA_PATH, 'all_seasons_joined.csv'))

In [28]:
def create_multiline_df_stats(old_stats_df):
    # Create a list of columns we want and their mappings to more interpretable names
    home_stats_cols = ['Date', 'season', 'HomeTeam', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY',
                       'HR', 'AR']
    
    away_stats_cols = ['Date', 'season', 'AwayTeam', 'FTAG', 'FTHG', 'HTAG', 'HTHG', 'AS', 'HS', 'AST', 'HST', 'AF', 'HF', 'AC', 'HC', 'AY', 'HY',
                       'AR', 'HR']
    
    stats_cols_mapping = ['Date', 'season', 'Team', 'goalsFor', 'goalsAgainst', 'halfTimeGoalsFor', 'halfTimeGoalsAgainst', 'shotsFor',
                          'shotsAgainst', 'shotsOnTargetFor', 'shotsOnTargetAgainst', 'freesFor', 'freesAgainst', 
                          'cornersFor', 'cornersAgainst', 'yellowsFor', 'yellowsAgainst', 'redsFor', 'redsAgainst']
    
    # Create a dictionary of the old column names to new column names
    home_mapping = {old_col: new_col for old_col, new_col in zip(home_stats_cols, stats_cols_mapping)}
    away_mapping = {old_col: new_col for old_col, new_col in zip(away_stats_cols, stats_cols_mapping)}
    
    # Put each team onto an individual row
    multi_line_stats = (old_stats_df[['gameId'] + home_stats_cols] # Filter for only the home team columns
                    .rename(columns=home_mapping) # Rename the columns
                    .assign(homeGame=1) # Assign homeGame=1 so that we can use a general function later
                    .append((old_stats_df[['gameId'] + away_stats_cols]) # Append the away team columns
                            .rename(columns=away_mapping) # Rename the away team columns
                            .assign(homeGame=0), sort=True)
                    .sort_values(by='gameId') # Sort the values
                    .reset_index(drop=True))
    return multi_line_stats

In [29]:
def create_stats_features_ema(stats, span):
    # Create a restructured DataFrames so that we can calculate EMA
    multi_line_stats = create_multiline_df_stats(stats)
    
    # Create a copy of the DataFrame
    ema_features = multi_line_stats[['Date', 'season', 'gameId', 'Team', 'homeGame']].copy()
    
    # Get the columns that we want to create EMA for
    feature_names = multi_line_stats.drop(columns=['Date', 'season', 'gameId', 'Team', 'homeGame']).columns
    
    # Loop over the features
    for feature_name in feature_names:
        feature_ema = (multi_line_stats.groupby('Team')[feature_name] # Calculate the EMA
                                                  .transform(lambda row: row.ewm(span=span, min_periods=2)
                                                             .mean()
                                                             .shift(1))) # Shift the data down 1 so we don't leak data
        ema_features[feature_name] = feature_ema # Add the new feature to the DataFrame
    return ema_features

In [30]:
# Add weighted average to each row with a span of 50.
df = create_stats_features_ema(df, 50)
df.tail()

Unnamed: 0,Date,season,gameId,Team,homeGame,cornersAgainst,cornersFor,freesAgainst,freesFor,goalsAgainst,...,halfTimeGoalsAgainst,halfTimeGoalsFor,redsAgainst,redsFor,shotsAgainst,shotsFor,shotsOnTargetAgainst,shotsOnTargetFor,yellowsAgainst,yellowsFor
9373,2019-11-23,1920,4688,Man City,1,2.394241,8.241461,8.321039,9.099733,0.796715,...,0.430477,1.210825,0.030523,0.051923,6.672742,19.254371,2.803048,6.79608,1.463633,1.536028
9374,2019-11-24,1920,4689,Man United,0,4.366679,5.596742,11.869497,10.826574,1.169153,...,0.415808,0.834202,0.044394,0.047382,11.412037,14.247559,4.033247,5.391701,2.12967,1.980482
9375,2019-11-24,1920,4689,Sheffield United,1,6.506138,6.033564,8.08464,10.255848,0.718892,...,0.316085,0.468612,0.0,0.077736,11.463558,10.422463,3.458262,3.160077,1.402459,1.858845
9376,2019-11-24,1920,4690,Aston Villa,1,6.980479,4.315249,11.577326,10.92212,1.902351,...,0.641784,0.439997,0.140163,0.103278,15.400326,11.139602,4.965691,3.893277,1.708601,1.789916
9377,2019-11-24,1920,4690,Newcastle,0,6.074955,4.169981,9.14416,10.013072,1.334469,...,0.51165,0.623525,0.01644,0.091794,13.1789,11.479283,4.157185,3.785123,1.609159,1.545347


The span parameter controls the decay for EMA. I tried various settings and found 50 to be the most useful.

Now that we have added the EMA stats we can restructure the dataset back to having a match on each row. We then save it as a csv file. 

In [31]:
def restructure_stats_features(stats_features):
    non_features = ['homeGame', 'Team', 'gameId']

    stats_features_restructured = (stats_features.query('homeGame == 1')
                                    .rename(columns={col: 'f_' + col + 'Home' for col in stats_features.columns if col not in non_features})
                                    .rename(columns={'Team': 'HomeTeam'})
                                    .pipe(pd.merge, (stats_features.query('homeGame == 0')
                                                        .rename(columns={'Team': 'AwayTeam'})
                                                        .rename(columns={col: 'f_' + col + 'Away' for col in stats_features.columns 
                                                                         if col not in non_features})), on=['gameId'])
                                    .dropna())
    return stats_features_restructured

df = restructure_stats_features(df)
df.tail()

Unnamed: 0,f_DateHome,f_seasonHome,gameId,HomeTeam,homeGame_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,...,f_halfTimeGoalsAgainstAway,f_halfTimeGoalsForAway,f_redsAgainstAway,f_redsForAway,f_shotsAgainstAway,f_shotsForAway,f_shotsOnTargetAgainstAway,f_shotsOnTargetForAway,f_yellowsAgainstAway,f_yellowsForAway
4684,2019-11-23,1920,4686,Everton,1,4.565659,6.002476,10.557303,11.607718,1.295098,...,0.88935,0.374763,0.069598,0.033448,14.703841,11.103013,5.259448,3.439076,1.311019,1.576155
4685,2019-11-23,1920,4687,Watford,1,5.55167,4.676105,8.990351,11.080792,1.731395,...,0.761675,0.591053,0.083338,0.014158,15.168061,10.171129,4.825411,3.546055,1.173971,1.876547
4686,2019-11-23,1920,4688,Man City,1,2.394241,8.241461,8.321039,9.099733,0.796715,...,0.452702,0.859091,0.00636,0.008065,9.071821,16.309098,3.200263,5.812427,1.681836,1.526415
4687,2019-11-24,1920,4689,Sheffield United,1,6.506138,6.033564,8.08464,10.255848,0.718892,...,0.415808,0.834202,0.044394,0.047382,11.412037,14.247559,4.033247,5.391701,2.12967,1.980482
4688,2019-11-24,1920,4690,Aston Villa,1,6.980479,4.315249,11.577326,10.92212,1.902351,...,0.51165,0.623525,0.01644,0.091794,13.1789,11.479283,4.157185,3.785123,1.609159,1.545347


In [32]:
df.to_csv(os.path.join(DATA_PATH, 'EMA_data.csv'))