In [55]:
import pandas as pd
import numpy as np
import os
import csv
from datetime import datetime

DATA_PATH = 'data/'

In [56]:
# TODO: Make scraper module so that we can get data from inside Jupyter Notebook
# Get league data and league_standings

Once we have the data downloaded we can start to create our dataset. We will begin by using data for the last 12 seasons. This should give us enough data to make good predictions, going any further back and the data might not as relevant. 

Due to the nature of football data being time-series data (ie: matches occur over the course of a season) we will be using full seasons (or two seasons) for our test data. We can also use a repeated K-fold to check our accuracy. 

We will first load our data as seperate seasons. We are removing any rows containing NaNs and converting Date to a datetime object, we are also adding a gameId column so that we can process our data easier.

In [57]:
# Run this once to concatenate all seasons together
# df1 = pd.read_csv(os.path.join(DATA_PATH, 'season0708.csv'))
# df2 = pd.read_csv(os.path.join(DATA_PATH, 'season0809.csv'))
# df3 = pd.read_csv(os.path.join(DATA_PATH, 'season0910.csv'))
# df4 = pd.read_csv(os.path.join(DATA_PATH, 'season1011.csv'))
# df5 = pd.read_csv(os.path.join(DATA_PATH, 'season1112.csv'))
# df6 = pd.read_csv(os.path.join(DATA_PATH, 'season1213.csv'))
# df7 = pd.read_csv(os.path.join(DATA_PATH, 'season1314.csv'))
# df8 = pd.read_csv(os.path.join(DATA_PATH, 'season1415.csv'))
# df9 = pd.read_csv(os.path.join(DATA_PATH, 'season1516.csv'))
# df10 = pd.read_csv(os.path.join(DATA_PATH, 'season1617.csv'))
# df11 = pd.read_csv(os.path.join(DATA_PATH, 'season1718.csv'))
# df12 = pd.read_csv(os.path.join(DATA_PATH, 'season1819.csv'))
# df13 = pd.read_csv(os.path.join(DATA_PATH, 'season1920.csv'))

# df = pd.concat([df1, df2, df3, df4, df5, df6, df7,
#                 df8, df9, df10, df11, df12, df13],
#                ignore_index=True, sort=False)
# df.to_csv(os.path.join(DATA_PATH, 'all_seasons_joined.csv'))

In [58]:
def create_df(path):
    """
    Function to convert date to datetime and add 'Id' column
    """
    df = (pd.read_csv(path, dtype={'season': str})
         .assign(Date=lambda df: pd.to_datetime(df.Date))
         .pipe(lambda df: df.dropna(thresh=len(df) - 2, axis=1))  # Drop cols with NAs
         .dropna(axis=0)  # Drop rows with NAs
         .rename(columns={'Unnamed: 0': 'gameId'})
         .sort_values('gameId')
         .reset_index(drop=True)
         )
    return df

In [59]:
df = create_df(os.path.join(DATA_PATH, 'all_seasons_joined.csv'))

In [46]:
df.columns

Index(['gameId', 'Unnamed: 0.1', 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG',
       'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST',
       'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D',
       'B365A', 'BWH', 'BWD', 'BWA', 'VCH', 'VCD', 'VCA', 'season'],
      dtype='object')

In order to add exponential moving averages we first need to restructure our dataset so that every row is a seperate team, rather than a match.

In [61]:
# Define a function which restructures our DataFrame
def create_multiline_df_stats(old_stats_df):
    # Create a list of columns we want and their mappings to more interpretable names
    home_stats_cols = ['Date', 'season', 'HomeTeam', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY',
                       'HR', 'AR']
    
    away_stats_cols = ['Date', 'season', 'AwayTeam', 'FTAG', 'FTHG', 'HTAG', 'HTHG', 'AS', 'HS', 'AST', 'HST', 'AF', 'HF', 'AC', 'HC', 'AY', 'HY',
                       'AR', 'HR']
    
    stats_cols_mapping = ['Date', 'season', 'Team', 'goalsFor', 'goalsAgainst', 'halfTimeGoalsFor', 'halfTimeGoalsAgainst', 'shotsFor',
                          'shotsAgainst', 'shotsOnTargetFor', 'shotsOnTargetAgainst', 'freesFor', 'freesAgainst', 
                          'cornersFor', 'cornersAgainst', 'yellowsFor', 'yellowsAgainst', 'redsFor', 'redsAgainst']
    
    # Create a dictionary of the old column names to new column names
    home_mapping = {old_col: new_col for old_col, new_col in zip(home_stats_cols, stats_cols_mapping)}
    away_mapping = {old_col: new_col for old_col, new_col in zip(away_stats_cols, stats_cols_mapping)}
    
    # Put each team onto an individual row
    multi_line_stats = (old_stats_df[['gameId'] + home_stats_cols] # Filter for only the home team columns
                    .rename(columns=home_mapping) # Rename the columns
                    .assign(homeGame=1) # Assign homeGame=1 so that we can use a general function later
                    .append((old_stats_df[['gameId'] + away_stats_cols]) # Append the away team columns
                            .rename(columns=away_mapping) # Rename the away team columns
                            .assign(homeGame=0), sort=True)
                    .sort_values(by='gameId') # Sort the values
                    .reset_index(drop=True))
    return multi_line_stats

In [62]:
# Define a function which creates an EMA DataFrame from the stats DataFrame
def create_stats_features_ema(stats, span):
    # Create a restructured DataFrames so that we can calculate EMA
    multi_line_stats = create_multiline_df_stats(stats)
    
    # Create a copy of the DataFrame
    ema_features = multi_line_stats[['Date', 'season', 'gameId', 'Team', 'homeGame']].copy()
    
    # Get the columns that we want to create EMA for
    feature_names = multi_line_stats.drop(columns=['Date', 'season', 'gameId', 'Team', 'homeGame']).columns
    
    # Loop over the features
    for feature_name in feature_names:
        feature_ema = (multi_line_stats.groupby('Team')[feature_name] # Calculate the EMA
                                                  .transform(lambda row: row.ewm(span=span, min_periods=2)
                                                             .mean()
                                                             .shift(1))) # Shift the data down 1 so we don't leak data
        ema_features[feature_name] = feature_ema # Add the new feature to the DataFrame
    return ema_features

In [63]:
# Add weighted average to each row with a span of 50.
df = create_stats_features_ema(df, 50)
df.tail()

Unnamed: 0,Date,season,gameId,Team,homeGame,cornersAgainst,cornersFor,freesAgainst,freesFor,goalsAgainst,...,halfTimeGoalsAgainst,halfTimeGoalsFor,redsAgainst,redsFor,shotsAgainst,shotsFor,shotsOnTargetAgainst,shotsOnTargetFor,yellowsAgainst,yellowsFor
9373,2019-11-23,1920,4688,Man City,1,2.394241,8.241461,8.321039,9.099733,0.796715,...,0.430477,1.210825,0.030523,0.051923,6.672742,19.254371,2.803048,6.79608,1.463633,1.536028
9374,2019-11-24,1920,4689,Man United,0,4.366679,5.596742,11.869497,10.826574,1.169153,...,0.415808,0.834202,0.044394,0.047382,11.412037,14.247559,4.033247,5.391701,2.12967,1.980482
9375,2019-11-24,1920,4689,Sheffield United,1,6.506138,6.033564,8.08464,10.255848,0.718892,...,0.316085,0.468612,0.0,0.077736,11.463558,10.422463,3.458262,3.160077,1.402459,1.858845
9376,2019-11-24,1920,4690,Aston Villa,1,6.980479,4.315249,11.577326,10.92212,1.902351,...,0.641784,0.439997,0.140163,0.103278,15.400326,11.139602,4.965691,3.893277,1.708601,1.789916
9377,2019-11-24,1920,4690,Newcastle,0,6.074955,4.169981,9.14416,10.013072,1.334469,...,0.51165,0.623525,0.01644,0.091794,13.1789,11.479283,4.157185,3.785123,1.609159,1.545347


In [64]:
df.columns

Index(['Date', 'season', 'gameId', 'Team', 'homeGame', 'cornersAgainst',
       'cornersFor', 'freesAgainst', 'freesFor', 'goalsAgainst', 'goalsFor',
       'halfTimeGoalsAgainst', 'halfTimeGoalsFor', 'redsAgainst', 'redsFor',
       'shotsAgainst', 'shotsFor', 'shotsOnTargetAgainst', 'shotsOnTargetFor',
       'yellowsAgainst', 'yellowsFor'],
      dtype='object')

In [65]:
pd.DataFrame(df.groupby('Team')
               .goalsFor
               .mean()
               .sort_values(ascending=False)[:10])

Unnamed: 0_level_0,goalsFor
Team,Unnamed: 1_level_1
Man City,2.00181
Arsenal,1.913876
Chelsea,1.879607
Man United,1.858164
Liverpool,1.844956
Tottenham,1.695512
Blackpool,1.500244
Everton,1.438372
Leicester,1.398327
Bournemouth,1.290834


We now need to restructure our dataset back to having a match on each row as this will be a much easier format for machine learning. 

In [66]:
def restructure_stats_features(stats_features):
    non_features = ['homeGame', 'Team', 'gameId']

    stats_features_restructured = (stats_features.query('homeGame == 1')
                                    .rename(columns={col: 'f_' + col + 'Home' for col in stats_features.columns if col not in non_features})
                                    .rename(columns={'Team': 'HomeTeam'})
                                    .pipe(pd.merge, (stats_features.query('homeGame == 0')
                                                        .rename(columns={'Team': 'AwayTeam'})
                                                        .rename(columns={col: 'f_' + col + 'Away' for col in stats_features.columns 
                                                                         if col not in non_features})), on=['gameId'])
                                    .dropna())
    return stats_features_restructured

df = restructure_stats_features(df)
df.tail()

Unnamed: 0,f_DateHome,f_seasonHome,gameId,HomeTeam,homeGame_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,...,f_halfTimeGoalsAgainstAway,f_halfTimeGoalsForAway,f_redsAgainstAway,f_redsForAway,f_shotsAgainstAway,f_shotsForAway,f_shotsOnTargetAgainstAway,f_shotsOnTargetForAway,f_yellowsAgainstAway,f_yellowsForAway
4684,2019-11-23,1920,4686,Everton,1,4.565659,6.002476,10.557303,11.607718,1.295098,...,0.88935,0.374763,0.069598,0.033448,14.703841,11.103013,5.259448,3.439076,1.311019,1.576155
4685,2019-11-23,1920,4687,Watford,1,5.55167,4.676105,8.990351,11.080792,1.731395,...,0.761675,0.591053,0.083338,0.014158,15.168061,10.171129,4.825411,3.546055,1.173971,1.876547
4686,2019-11-23,1920,4688,Man City,1,2.394241,8.241461,8.321039,9.099733,0.796715,...,0.452702,0.859091,0.00636,0.008065,9.071821,16.309098,3.200263,5.812427,1.681836,1.526415
4687,2019-11-24,1920,4689,Sheffield United,1,6.506138,6.033564,8.08464,10.255848,0.718892,...,0.415808,0.834202,0.044394,0.047382,11.412037,14.247559,4.033247,5.391701,2.12967,1.980482
4688,2019-11-24,1920,4690,Aston Villa,1,6.980479,4.315249,11.577326,10.92212,1.902351,...,0.51165,0.623525,0.01644,0.091794,13.1789,11.479283,4.157185,3.785123,1.609159,1.545347


In [68]:
len(df.loc[df['f_seasonHome'] == '1920'])

128

In [69]:
df.to_csv(os.path.join(DATA_PATH, 'EMA_data.csv'))