In [2]:
import os
import pandas as pd
from scrapers import *
from keras.models import load_model
from sklearn.preprocessing import StandardScaler

DATA_PATH = 'data/'

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Predict upcoming games

To predict upcoming games we will need to:
1. Scrape upcoming fixtures with odds
2. Concatentate these onto results so far for season 19/20
3. Use the same functions previously to create data for our model

# Get EMA stats

The most tricky part is getting EMA stats for the upcoming fixtures, the league stats should be straight forward. 

In [3]:
scraper = FixtureScraper()
fixtures = scraper.get_fixtures()
scraper.close_driver()

fixtures.replace({'Man Utd': 'Man United',
                  'C Palace': 'Crystal Palace',
                  'Sheff Utd': 'Sheffield United',
                  'Sheffield Utd': 'Sheffield United'},
                   inplace=True)

In [4]:
df_ema = (pd.read_csv(os.path.join(DATA_PATH, 'all_seasons_joined.csv'))
                    .assign(Date=lambda df: pd.to_datetime(df.Date))
                    .pipe(lambda df: df.dropna(thresh=len(df) - 2, axis=1))  # Drop cols with NAs
                    .dropna(axis=0)  # Drop rows with NAs
                    .sort_values('Unnamed: 0')
                    .append(fixtures, sort=True)
                    .reset_index(drop=True)
                    .assign(gameId=lambda df: list(df.index + 1))
                    )

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df_ema.tail()

Unnamed: 0.2,AC,AF,AR,AS,AST,AY,AwayTeam,B365A,B365D,B365H,...,HY,HomeTeam,Referee,Unnamed: 0,Unnamed: 0.1,VCA,VCD,VCH,season,gameId
4694,,,,,,,Watford,3.5,3.4,2.05,...,,Southampton,,,,,,,1920,4695
4695,,,,,,,Arsenal,1.75,3.8,4.2,...,,Norwich,,,,,,,1920,4696
4696,,,,,,,Sheffield United,4.0,3.4,1.9,...,,Wolves,,,,,,,1920,4697
4697,,,,,,,Aston Villa,6.5,4.5,1.44,...,,Man United,,,,,,,1920,4698
4698,,,,,,,Everton,5.0,3.8,1.65,...,,Leicester,,,,,,,1920,4699


In [6]:
# Drop betting odds as we can use league stats function to get these
df_ema.drop(['B365H', 'B365D', 'B365A'], 1, inplace=True)

In [7]:
def create_multiline_df_stats(old_stats_df):
    # Create a list of columns we want and their mappings to more interpretable names
    home_stats_cols = ['Date', 'season', 'HomeTeam', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY',
                       'HR', 'AR']
    
    away_stats_cols = ['Date', 'season', 'AwayTeam', 'FTAG', 'FTHG', 'HTAG', 'HTHG', 'AS', 'HS', 'AST', 'HST', 'AF', 'HF', 'AC', 'HC', 'AY', 'HY',
                       'AR', 'HR']
    
    stats_cols_mapping = ['Date', 'season', 'Team', 'goalsFor', 'goalsAgainst', 'halfTimeGoalsFor', 
                          'halfTimeGoalsAgainst', 'shotsFor', 'shotsAgainst', 'shotsOnTargetFor',
                          'shotsOnTargetAgainst', 'freesFor', 'freesAgainst', 'cornersFor', 
                          'cornersAgainst', 'yellowsFor', 'yellowsAgainst', 'redsFor', 'redsAgainst']
    
    # Create a dictionary of the old column names to new column names
    home_mapping = {old_col: new_col for old_col, new_col in zip(home_stats_cols, stats_cols_mapping)}
    away_mapping = {old_col: new_col for old_col, new_col in zip(away_stats_cols, stats_cols_mapping)}
    
    # Put each team onto an individual row
    multi_line_stats = (old_stats_df[['gameId'] + home_stats_cols] # Filter for only the home team columns
                    .rename(columns=home_mapping) # Rename the columns
                    .assign(homeGame=1) # Assign homeGame=1 so that we can use a general function later
                    .append((old_stats_df[['gameId'] + away_stats_cols]) # Append the away team columns
                            .rename(columns=away_mapping) # Rename the away team columns
                            .assign(homeGame=0), sort=True)
                    .sort_values(by='gameId') # Sort the values
                    .reset_index(drop=True))
    return multi_line_stats

In [8]:
def create_stats_features_ema(stats, span):
    # Create a restructured DataFrames so that we can calculate EMA
    multi_line_stats = create_multiline_df_stats(stats)
    
    # Create a copy of the DataFrame
    ema_features = multi_line_stats[['Date', 'season', 'gameId', 'Team', 'homeGame']].copy()
    
    # Get the columns that we want to create EMA for
    feature_names = multi_line_stats.drop(columns=['Date', 'season', 'gameId', 'Team', 'homeGame']).columns
    
    # Loop over the features
    for feature_name in feature_names:
        feature_ema = (multi_line_stats.groupby('Team')[feature_name] # Calculate the EMA
                                                  .transform(lambda row: row.ewm(span=span, min_periods=2)
                                                             .mean()
                                                             .shift(1))) # Shift the data down 1 so we don't leak data
        ema_features[feature_name] = feature_ema # Add the new feature to the DataFrame
    return ema_features

In [9]:
df_ema = create_stats_features_ema(df_ema, 50)
df_ema.tail()

Unnamed: 0,Date,season,gameId,Team,homeGame,cornersAgainst,cornersFor,freesAgainst,freesFor,goalsAgainst,...,halfTimeGoalsAgainst,halfTimeGoalsFor,redsAgainst,redsFor,shotsAgainst,shotsFor,shotsOnTargetAgainst,shotsOnTargetFor,yellowsAgainst,yellowsFor
9393,30/11/19,1920,4697,Wolves,1,5.337415,5.232778,9.18389,10.95792,1.325211,...,0.664515,0.486696,0.123622,0.083231,12.844085,11.969167,4.395726,4.095568,1.840712,1.854479
9394,30/11/19,1920,4698,Aston Villa,0,6.824382,4.381318,11.51547,11.003606,1.945397,...,0.616616,0.461958,0.134666,0.099228,15.227764,11.173344,4.967037,4.054325,1.720029,1.758939
9395,30/11/19,1920,4698,Man United,1,4.430731,5.494909,11.91383,10.79416,1.240951,...,0.438717,0.801489,0.042653,0.045524,11.435094,14.120204,4.188806,5.37634,2.085369,1.981248
9396,30/11/19,1920,4699,Leicester,1,4.738357,6.083126,10.369843,9.637158,1.014663,...,0.566415,0.570298,0.137931,0.061986,10.755939,13.98075,3.479016,5.0507,1.692849,1.236327
9397,30/11/19,1920,4699,Everton,0,4.661124,6.041594,10.574664,11.623101,1.322742,...,0.428907,0.623753,0.094323,0.104197,9.98769,13.30182,3.673723,4.596235,1.973455,1.517746


In [10]:
def restructure_stats_features(stats_features):
    non_features = ['homeGame', 'Team', 'gameId']

    stats_features_restructured = (stats_features.query('homeGame == 1')
                                    .rename(columns={col: 'f_' + col + 'Home' for col in stats_features.columns if col not in non_features})
                                    .rename(columns={'Team': 'HomeTeam'})
                                    .pipe(pd.merge, (stats_features.query('homeGame == 0')
                                                        .rename(columns={'Team': 'AwayTeam'})
                                                        .rename(columns={col: 'f_' + col + 'Away' for col in stats_features.columns 
                                                                         if col not in non_features})), on=['gameId'])
                                    .dropna())
    return stats_features_restructured

df_ema = restructure_stats_features(df_ema)
df_ema.tail()

Unnamed: 0,f_DateHome,f_seasonHome,gameId,HomeTeam,homeGame_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,...,f_halfTimeGoalsAgainstAway,f_halfTimeGoalsForAway,f_redsAgainstAway,f_redsForAway,f_shotsAgainstAway,f_shotsForAway,f_shotsOnTargetAgainstAway,f_shotsOnTargetForAway,f_yellowsAgainstAway,f_yellowsForAway
4694,30/11/19,1920,4695,Southampton,1,6.122353,4.885765,9.429858,11.235308,1.987825,...,0.802975,0.373721,0.039294,0.107389,13.166968,11.909246,5.111947,3.678857,1.572426,2.055737
4695,30/11/19,1920,4696,Norwich,1,6.25627,4.943509,10.230639,10.486938,1.867071,...,0.656924,0.72912,0.039354,0.078882,14.52167,12.727828,5.01615,4.547976,1.961493,2.154146
4696,30/11/19,1920,4697,Wolves,1,5.337415,5.232778,9.18389,10.95792,1.325211,...,0.285518,0.52,0.0,0.070219,11.418729,10.575018,3.607355,3.62812,1.460244,1.775791
4697,30/11/19,1920,4698,Man United,1,4.430731,5.494909,11.91383,10.79416,1.240951,...,0.616616,0.461958,0.134666,0.099228,15.227764,11.173344,4.967037,4.054325,1.720029,1.758939
4698,30/11/19,1920,4699,Leicester,1,4.738357,6.083126,10.369843,9.637158,1.014663,...,0.428907,0.623753,0.094323,0.104197,9.98769,13.30182,3.673723,4.596235,1.973455,1.517746


In [11]:
df_ema.drop(['f_DateHome', 'f_seasonHome', 'HomeTeam',
               'homeGame_x', 'f_yellowsAgainstAway', 'f_yellowsForAway',
               'f_halfTimeGoalsAgainstHome', 'f_halfTimeGoalsForHome','f_halfTimeGoalsAgainstAway', 'f_halfTimeGoalsForAway',
               'f_yellowsAgainstHome', 'f_yellowsForHome', 'f_DateAway', 'f_seasonAway', 
               'AwayTeam', 'homeGame_y'], 1, inplace=True)

In [12]:
df_ema.columns

Index(['gameId', 'f_cornersAgainstHome', 'f_cornersForHome',
       'f_freesAgainstHome', 'f_freesForHome', 'f_goalsAgainstHome',
       'f_goalsForHome', 'f_redsAgainstHome', 'f_redsForHome',
       'f_shotsAgainstHome', 'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway'],
      dtype='object')

# Get League stats

To get the league stats we will use the same 'features' DataFrame that we used to create EMA stats. We can then use the same functions we used previously to create league data.

In [13]:
def get_points(goals_scored, goals_conceded):
    if goals_scored > goals_conceded: 
        points = 3
    elif goals_conceded == goals_scored:
        points = 1
    else:
        points = 9
    return points

def get_result(goals_scored, goals_conceded):
    if goals_scored > goals_conceded: 
        result = 'W'
    elif goals_conceded == goals_scored:
        result = 'D'
    else:
        result = 'L'
    return result


def get_last_result(team):
    """
    Helper function which takes a team-name as a string and returns goal stats from
    their most recent match
    """
    with open(DATA_PATH + 'season1920.csv') as f:
        for row in reversed(list(csv.reader(f))):
            if team == row[4]:
                goals_scored = int(row[6])
                goals_conceded = int(row[7])

                return(goals_scored, goals_conceded)
            elif team == row[5]:
                goals_scored = int(row[7])
                goals_conceded = int(row[6])

                return goals_scored, goals_conceded

In [14]:
def create_row(home_team, away_team, home_odds, draw_odds, away_odds, matchweek):
    # First get last result data
    home_goals_scored, home_goals_against = get_last_result(home_team)
    home_result = get_result(home_goals_scored, home_goals_against)
    home_points = get_points(home_goals_scored, home_goals_against)
    
    away_goals_scored, away_goals_against = get_last_result(away_team)
    away_result = get_result(away_goals_scored, away_goals_against)
    away_points = get_points(away_goals_scored, away_goals_against)
    
    # Combine this with team data we already have, I created a csv
    # using existing fucntions without scaling.   
    with open(os.path.join(DATA_PATH, 'season1920_data.csv')) as f:
        reader = csv.reader(f)
        next(reader, None)
        for row in reader:
            if home_team == row[3]:
                htp = (int(row[12]) + home_points) / matchweek
                hm1 = home_result
                hm2 = row[17]
                hm3 = row[18]
                htgd = (float(row[35]) + home_goals_scored - home_goals_against) / matchweek
                ht_prev_pos = int(row[28])
                ht_form_pts = int(row[33])
                
            elif away_team == row[3]:
                atp = (int(row[12]) + away_points) / matchweek
                am1 = away_result
                am2 = row[17]
                am3 = row[18]
                atgd = (float(row[35]) + away_goals_scored - away_goals_against) / matchweek
                at_prev_pos = int(row[28])
                at_form_pts = int(row[33])
                
            elif home_team == row[4]:
                htp = (int(row[13]) + home_points) / matchweek
                hm1 = home_result
                hm2 = row[22]
                hm3 = row[23]
                htgd = (float(row[36]) + home_goals_scored - home_goals_against) / matchweek
                ht_prev_pos = int(row[29])
                ht_form_pts = int(row[34])
            elif away_team == row[4]:
                atp = (int(row[13]) + away_points) / matchweek
                am1 = away_result
                am2 = row[22]
                am3 = row[23]
                atgd = (float(row[36]) + away_goals_scored - away_goals_against) / matchweek
                at_prev_pos = int(row[29])
                at_form_pts = int(row[34])

        diff_lp = ht_prev_pos - at_prev_pos
        diff_points = (atp - htp) / matchweek
        diff_form_points = (ht_form_pts - at_form_pts) / matchweek
          
        row = [htp, atp, home_odds, draw_odds, away_odds, 
               hm1, hm2, hm3, am1, am2, am3, htgd, atgd, 
               diff_points, diff_form_points, diff_lp]
        
        # print(row)
        return row

In [15]:
headers = ['HTP', 'ATP', 'B365H', 'B365D', 'B365A', 
           'HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3', 
           'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP']

with open(os.path.join(DATA_PATH, 'fixtures_league_data.csv'), 'w') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    for i, row in fixtures.iterrows():
        # Last item in row is MatchWeek, entering manually for now
        row = create_row(row.HomeTeam, row.AwayTeam, row.B365H, 
                         row.B365D, row.B365A, 14.0)
        writer.writerow(row)

In [16]:
len(df_ema.columns)

25

# Combine datasets


In [17]:
# Load league data from csv file
league_data = pd.read_csv('data/fixtures_league_data.csv')
league_data.drop(['HM1', 'HM2', 'HM3', 
                  'AM1', 'AM2', 'AM3'], 1, inplace=True)

In [18]:
len(league_data.columns)

10

In [19]:
len(league_data)

10

In [20]:
# We only need upcoming fixtures from EMA stats
df_ema1920 = df_ema.tail(len(fixtures)).reset_index()

In [21]:
df = pd.concat([df_ema1920, league_data], axis=1)

In [22]:
df.drop(['index', 'gameId', ], 1, inplace=True)
df

Unnamed: 0,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_redsAgainstHome,f_redsForHome,f_shotsAgainstHome,f_shotsForHome,...,HTP,ATP,B365H,B365D,B365A,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,6.072016,4.124099,9.295369,10.01256,1.399784,1.218812,0.015796,0.088194,13.132669,11.460488,...,0.928571,2.0,13.0,7.5,1.16,-0.428571,1.285714,0.076531,-0.357143,9
1,5.045049,5.40636,10.505461,9.841266,1.205957,1.661953,0.026282,0.124352,12.718355,13.88848,...,1.142857,1.785714,1.36,5.0,7.5,0.0,-0.071429,0.045918,-0.071429,-9
2,3.59274,5.717618,10.775021,8.962049,1.193844,1.813473,0.00611,0.007749,9.304298,16.100898,...,2.285714,1.571429,1.25,5.25,13.0,0.928571,-0.142857,-0.05102,0.928571,-13
3,4.05936,6.469869,9.174339,8.413665,0.754456,2.318409,0.006359,0.023234,8.950306,15.711005,...,2.428571,1.714286,1.2,6.5,13.0,1.142857,-0.5,-0.05102,0.285714,-11
4,6.159495,4.639604,10.166842,9.872408,1.525016,1.345427,0.080069,0.013602,15.161468,10.203643,...,1.071429,1.714286,2.25,3.2,3.25,-0.071429,-0.5,0.045918,-0.214286,-11
5,6.122353,4.885765,9.429858,11.235308,1.987825,1.134511,0.104997,0.102143,14.266798,11.997247,...,0.642857,1.0,2.05,3.4,3.5,-1.071429,-1.071429,0.02551,-0.142857,3
6,6.25627,4.943509,10.230639,10.486938,1.867071,0.932813,0.066865,0.032134,14.833278,11.177506,...,0.714286,1.285714,4.2,3.8,1.75,-0.857143,-0.142857,0.040816,-0.357143,12
7,5.337415,5.232778,9.18389,10.95792,1.325211,1.305084,0.123622,0.083231,12.844085,11.969167,...,1.142857,1.214286,1.9,3.4,4.0,0.142857,0.285714,0.005102,0.071429,0
8,4.430731,5.494909,11.91383,10.79416,1.240951,1.531578,0.042653,0.045524,11.435094,14.120204,...,1.0,0.857143,1.44,4.5,6.5,0.0,-0.071429,-0.010204,-0.142857,-16
9,4.738357,6.083126,10.369843,9.637158,1.014663,1.837836,0.137931,0.061986,10.755939,13.98075,...,1.857143,1.428571,1.65,3.8,5.0,1.642857,-0.5,-0.030612,0.571429,1


# Make Predictions

Now that we have prepared the data we can load our model and make predictions.

In [23]:
scaler = StandardScaler()
X = scaler.fit_transform(df)

In [24]:
model = load_model('25Nov19.h5')




In [25]:
predictions = model.predict(X)
predictions

array([[8.6416298e-04, 2.4121420e-02, 9.7501445e-01],
       [4.9304920e-01, 2.8723446e-01, 2.1971639e-01],
       [8.6349219e-01, 1.2675029e-01, 9.7575868e-03],
       [9.8903263e-01, 1.0140523e-02, 8.2687690e-04],
       [5.9478873e-01, 2.8379083e-01, 1.2142048e-01],
       [3.7165302e-01, 3.6037287e-01, 2.6797411e-01],
       [5.5383142e-02, 2.3890764e-01, 7.0570928e-01],
       [4.2574670e-02, 2.7717835e-01, 6.8024695e-01],
       [2.6989689e-01, 4.9470967e-01, 2.3539343e-01],
       [9.8742563e-01, 1.1315062e-02, 1.2593878e-03]], dtype=float32)

In [26]:
for i in range(len(X)):
    pred = np.argmax(predictions[i])
    if pred == 0:
        odds_diff = predictions[i][pred] - (1/league_data['B365H'][i])
        print(i+1, 'Prediction:', pred, 'Odds diff:', odds_diff)
    elif pred == 1:
        odds_diff = predictions[i][pred] - (1/league_data['B365D'][i])
        print(i, 'Prediction:', pred, 'Odds diff:', odds_diff)
    else:
        odds_diff = predictions[i][pred] - (1/league_data['B365A'][i])
        print(i, 'Prediction:', pred, 'Odds diff:', odds_diff)

0 Prediction: 2 Odds diff: 0.11294548264865212
2 Prediction: 0 Odds diff: -0.24224491329754094
3 Prediction: 0 Odds diff: 0.06349219083786006
4 Prediction: 0 Odds diff: 0.1556992928187052
5 Prediction: 0 Odds diff: 0.1503442857000563
6 Prediction: 0 Odds diff: -0.11615185766685304
6 Prediction: 2 Odds diff: 0.13428070715495521
7 Prediction: 2 Odds diff: 0.4302469491958618
8 Prediction: 1 Odds diff: 0.27248744832144844
10 Prediction: 0 Odds diff: 0.3813650192636432
