In [1]:
import pandas as pd
import numpy as np
import os
import csv
from datetime import datetime

DATA_PATH = 'data/'

In [2]:
# TODO: Make scraper module so that we can get data from inside Jupyter Notebook
# Get league data and league_standings

Once we have the data downloaded we can start to create our dataset. We will begin by using data for the last 12 seasons. This should give us enough data to make good predictions, going any further back and the data might not as relevant. 

Due to the nature of football data being time-series data (ie: matches occur over the course of a season) we will be using full seasons (or two seasons) for our test data. We can also use a repeated K-fold to check our accuracy. 

We will first load our data as seperate seasons. We are removing any rows containing NaNs and converting Date to a datetime object, we are also adding Id column.

In [3]:
# Run this once to concatenate all seasons together
# df1 = pd.read_csv(os.path.join(DATA_PATH, 'season0506.csv'))
# df2 = pd.read_csv(os.path.join(DATA_PATH, 'season0607.csv'))
df3 = pd.read_csv(os.path.join(DATA_PATH, 'season0708.csv'))
df4 = pd.read_csv(os.path.join(DATA_PATH, 'season0809.csv'))
df5 = pd.read_csv(os.path.join(DATA_PATH, 'season0910.csv'))
df6 = pd.read_csv(os.path.join(DATA_PATH, 'season1011.csv'))
df7 = pd.read_csv(os.path.join(DATA_PATH, 'season1112.csv'))
df8 = pd.read_csv(os.path.join(DATA_PATH, 'season1213.csv'))
df9 = pd.read_csv(os.path.join(DATA_PATH, 'season1314.csv'))
df10 = pd.read_csv(os.path.join(DATA_PATH, 'season1415.csv'))
df11 = pd.read_csv(os.path.join(DATA_PATH, 'season1516.csv'))
df12 = pd.read_csv(os.path.join(DATA_PATH, 'season1617.csv'))
df13 = pd.read_csv(os.path.join(DATA_PATH, 'season1718.csv'))
df14 = pd.read_csv(os.path.join(DATA_PATH, 'season1819.csv'))

df = pd.concat([df3, df4, df5, df6,
                df7, df8, df9, df10,df11,df12,df13,df14],
               ignore_index=True, sort=False)
df.to_csv(os.path.join(DATA_PATH, 'seasons_joined.csv'))

In [122]:
# def create_df(path):
#     """
#     Function to convert date to datetime and add 'Id' column
#     """
#     df = (pd.read_csv(path)
#          .dropna(how='all')
#          .assign(Date=lambda df: pd.to_datetime(df.Date))
#          )
#     return df

In [4]:
def create_df(path):
    """
    Function to convert date to datetime and add 'Id' column
    """
    df = (pd.read_csv(path)
         .assign(Date=lambda df: pd.to_datetime(df.Date))
         .pipe(lambda df: df.dropna(thresh=len(df) - 2, axis=1))  # Drop cols with NAs
         .dropna(axis=0)  # Drop rows with NAs
         .sort_values('Date')
         .reset_index(drop=True)
         .assign(gameId=lambda df: list(df.index + 1))
         )
    return df

In [5]:
df = create_df(os.path.join(DATA_PATH, 'seasons_joined.csv'))

In [6]:
df.shape

(4559, 48)

In [7]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Div', 'Date', 'HomeTeam', 'AwayTeam',
       'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS',
       'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'B365H',
       'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'VCH', 'VCD', 'VCA', 'Bb1X2',
       'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD', 'BbMxA', 'BbAvA', 'BbOU',
       'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5', 'season', 'gameId'],
      dtype='object')

In [9]:
# Define a function which restructures our DataFrame
def create_multiline_df_stats(old_stats_df):
    # Create a list of columns we want and their mappings to more interpretable names
    home_stats_cols = ['Date', 'season', 'HomeTeam', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY',
                       'HR', 'AR']
    
    away_stats_cols = ['Date', 'season', 'AwayTeam', 'FTAG', 'FTHG', 'HTAG', 'HTHG', 'AS', 'HS', 'AST', 'HST', 'AF', 'HF', 'AC', 'HC', 'AY', 'HY',
                       'AR', 'HR']
    
    stats_cols_mapping = ['Date', 'season', 'Team', 'goalsFor', 'goalsAgainst', 'halfTimeGoalsFor', 'halfTimeGoalsAgainst', 'shotsFor',
                          'shotsAgainst', 'shotsOnTargetFor', 'shotsOnTargetAgainst', 'freesFor', 'freesAgainst', 
                          'cornersFor', 'cornersAgainst', 'yellowsFor', 'yellowsAgainst', 'redsFor', 'redsAgainst']
    
    # Create a dictionary of the old column names to new column names
    home_mapping = {old_col: new_col for old_col, new_col in zip(home_stats_cols, stats_cols_mapping)}
    away_mapping = {old_col: new_col for old_col, new_col in zip(away_stats_cols, stats_cols_mapping)}
    
    # Put each team onto an individual row
    multi_line_stats = (old_stats_df[['gameId'] + home_stats_cols] # Filter for only the home team columns
                    .rename(columns=home_mapping) # Rename the columns
                    .assign(homeGame=1) # Assign homeGame=1 so that we can use a general function later
                    .append((old_stats_df[['gameId'] + away_stats_cols]) # Append the away team columns
                            .rename(columns=away_mapping) # Rename the away team columns
                            .assign(homeGame=0), sort=True)
                    .sort_values(by='gameId') # Sort the values
                    .reset_index(drop=True))
    return multi_line_stats

In [10]:
# Define a function which creates an EMA DataFrame from the stats DataFrame
def create_stats_features_ema(stats, span):
    # Create a restructured DataFrames so that we can calculate EMA
    multi_line_stats = create_multiline_df_stats(stats)
    
    # Create a copy of the DataFrame
    ema_features = multi_line_stats[['Date', 'season', 'gameId', 'Team', 'homeGame']].copy()
    
    # Get the columns that we want to create EMA for
    feature_names = multi_line_stats.drop(columns=['Date', 'season', 'gameId', 'Team', 'homeGame']).columns
    
    # Loop over the features
    for feature_name in feature_names:
        feature_ema = (multi_line_stats.groupby('Team')[feature_name] # Calculate the EMA
                                                  .transform(lambda row: row.ewm(span=span, min_periods=2)
                                                             .mean()
                                                             .shift(1))) # Shift the data down 1 so we don't leak data
        ema_features[feature_name] = feature_ema # Add the new feature to the DataFrame
    return ema_features

In [11]:
# Then we add our weighted average to each row
df = create_stats_features_ema(df, 20)
df.head()

Unnamed: 0,Date,season,gameId,Team,homeGame,cornersAgainst,cornersFor,freesAgainst,freesFor,goalsAgainst,...,halfTimeGoalsAgainst,halfTimeGoalsFor,redsAgainst,redsFor,shotsAgainst,shotsFor,shotsOnTargetAgainst,shotsOnTargetFor,yellowsAgainst,yellowsFor
0,2007-01-09,708,1,Reading,1,,,,,,...,,,,,,,,,,
1,2007-01-09,708,1,West Ham,0,,,,,,...,,,,,,,,,,
2,2007-01-09,708,2,Wigan,0,,,,,,...,,,,,,,,,,
3,2007-01-09,708,2,Newcastle,1,,,,,,...,,,,,,,,,,
4,2007-01-09,708,3,Middlesbrough,1,,,,,,...,,,,,,,,,,


In [12]:
df.columns

Index(['Date', 'season', 'gameId', 'Team', 'homeGame', 'cornersAgainst',
       'cornersFor', 'freesAgainst', 'freesFor', 'goalsAgainst', 'goalsFor',
       'halfTimeGoalsAgainst', 'halfTimeGoalsFor', 'redsAgainst', 'redsFor',
       'shotsAgainst', 'shotsFor', 'shotsOnTargetAgainst', 'shotsOnTargetFor',
       'yellowsAgainst', 'yellowsFor'],
      dtype='object')

In [13]:
pd.DataFrame(df.groupby('Team')
               .goalsFor
               .mean()
               .sort_values(ascending=False)[:10])

Unnamed: 0_level_0,goalsFor
Team,Unnamed: 1_level_1
Man City,2.014375
Arsenal,1.920351
Chelsea,1.882293
Man United,1.877568
Liverpool,1.85434
Tottenham,1.711347
Blackpool,1.506314
Everton,1.439871
Leicester,1.402128
Bournemouth,1.283564


In [14]:
def restructure_stats_features(stats_features):
    non_features = ['homeGame', 'Team', 'gameId']

    stats_features_restructured = (stats_features.query('homeGame == 1')
                                    .rename(columns={col: 'f_' + col + 'Home' for col in stats_features.columns if col not in non_features})
                                    .rename(columns={'Team': 'HomeTeam'})
                                    .pipe(pd.merge, (stats_features.query('homeGame == 0')
                                                        .rename(columns={'Team': 'AwayTeam'})
                                                        .rename(columns={col: 'f_' + col + 'Away' for col in stats_features.columns 
                                                                         if col not in non_features})), on=['gameId'])
                                    .dropna())
    return stats_features_restructured

df = restructure_stats_features(df)
df.head()

Unnamed: 0,f_DateHome,f_seasonHome,gameId,HomeTeam,homeGame_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,...,f_halfTimeGoalsAgainstAway,f_halfTimeGoalsForAway,f_redsAgainstAway,f_redsForAway,f_shotsAgainstAway,f_shotsForAway,f_shotsOnTargetAgainstAway,f_shotsOnTargetForAway,f_yellowsAgainstAway,f_yellowsForAway
21,2007-03-11,708,22,Wigan,1,6.375,1.525,12.375,15.375,1.0,...,0.0,0.0,0.0,0.0,6.625,17.15,3.1,6.575,3.525,2.9
22,2007-03-11,708,23,Aston Villa,1,11.006661,2.965029,10.367194,15.465445,1.866778,...,0.95,0.0,0.0,0.0,20.0,10.1,9.475,6.15,0.525,1.0
23,2007-03-11,708,24,Blackburn,1,2.05,3.625,9.65,22.05,0.475,...,0.0,2.0,0.0,0.0,6.95,21.575,3.0,12.1,2.05,0.0
24,2007-03-11,708,25,Everton,1,9.95,2.85,11.15,11.575,0.475,...,0.95,0.525,0.525,0.0,15.1,7.95,8.05,3.475,0.475,1.95
26,2007-03-11,708,27,Middlesbrough,1,5.0,7.325,11.475,12.2,0.525,...,1.664446,0.933389,0.0,0.367194,8.803497,16.462115,5.531224,9.696087,1.664446,0.664446


In [15]:
df.shape

(4504, 41)

In [16]:
# df = df[['DateHome', 'Id', 'HomeTeam', 'HomeOddsHome',
#        'DrawOddsHome', 'AwayOddsHome', 'GoalsAgainstHome',
#        'GoalsForHome', 'ShotsAgainstHome', 'ShotsForHome',
#        'AwayTeam', 'GoalsAgainstAway', 'GoalsForAway',
#        'ShotsAgainstAway', 'ShotsForAway', 'Result']]

In [17]:

df.to_csv(os.path.join(DATA_PATH, 'EMA_data.csv'))