In [1]:
import numpy as np
import random
import os
# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)
os.environ['PYTHONHASHSEED'] = str(42)

import pandas as pd
from scrapers import *
from keras.models import load_model
from sklearn.preprocessing import StandardScaler

DATA_PATH = 'data/'

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Predict upcoming games

To predict upcoming games we will need to:
1. Scrape upcoming fixtures with odds
2. Concatentate these onto results so far for season 19/20
3. Use the same functions previously to create data for our model

# Get EMA stats

The most tricky part is getting EMA stats for the upcoming fixtures, the league stats should be straight forward. 

In [58]:
# scraper = FixtureScraper()
# fixtures = scraper.get_fixtures()
# scraper.close_driver()

# fixtures.replace({'Man Utd': 'Man United',
#                   'C Palace': 'Crystal Palace',
#                   'Sheff Utd': 'Sheffield United',
#                   'Sheffield Utd': 'Sheffield United'},
#                    inplace=True)

In [133]:
df_ema = (pd.read_csv(os.path.join(DATA_PATH, 'all_seasons_joined.csv'))
                    .assign(Date=lambda df: pd.to_datetime(df.Date))
                    .pipe(lambda df: df.dropna(thresh=len(df) - 2, axis=1))  # Drop cols with NAs
                    .dropna(axis=0)  # Drop rows with NAs
                    .sort_values('season')
                    .append(fixtures, sort=True)
                    .reset_index(drop=True)
                    .assign(gameId=lambda df: list(df.index + 1))
                    )

In [134]:
df_ema.columns

Index(['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'AwayTeam', 'B365A', 'B365D',
       'B365H', 'BWA', 'BWD', 'BWH', 'Date', 'Div', 'FTAG', 'FTHG', 'FTR',
       'HC', 'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HTR', 'HY', 'HomeTeam',
       'Referee', 'Unnamed: 0', 'Unnamed: 0.1', 'VCA', 'VCD', 'VCH', 'season',
       'gameId'],
      dtype='object')

In [135]:
# Drop betting odds as we can use league stats function to get these
df_ema.drop(['B365H', 'B365D', 'B365A'], 1, inplace=True)

In [136]:
# Define a function which restructures our DataFrame
def create_multiline_df_stats(old_stats_df):
    # Create a list of columns we want and their mappings to more interpretable names
    home_stats_cols = ['Date', 'season', 'HomeTeam', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY',
                       'HR', 'AR']
    
    away_stats_cols = ['Date', 'season', 'AwayTeam', 'FTAG', 'FTHG', 'HTAG', 'HTHG', 'AS', 'HS', 'AST', 'HST', 'AF', 'HF', 'AC', 'HC', 'AY', 'HY',
                       'AR', 'HR']
    
    stats_cols_mapping = ['Date', 'season', 'Team', 'goalsFor', 'goalsAgainst', 'halfTimeGoalsFor', 
                          'halfTimeGoalsAgainst', 'shotsFor', 'shotsAgainst', 'shotsOnTargetFor',
                          'shotsOnTargetAgainst', 'freesFor', 'freesAgainst', 'cornersFor', 
                          'cornersAgainst', 'yellowsFor', 'yellowsAgainst', 'redsFor', 'redsAgainst']
    
    # Create a dictionary of the old column names to new column names
    home_mapping = {old_col: new_col for old_col, new_col in zip(home_stats_cols, stats_cols_mapping)}
    away_mapping = {old_col: new_col for old_col, new_col in zip(away_stats_cols, stats_cols_mapping)}
    
    # Put each team onto an individual row
    multi_line_stats = (old_stats_df[['gameId'] + home_stats_cols] # Filter for only the home team columns
                    .rename(columns=home_mapping) # Rename the columns
                    .assign(homeGame=1) # Assign homeGame=1 so that we can use a general function later
                    .append((old_stats_df[['gameId'] + away_stats_cols]) # Append the away team columns
                            .rename(columns=away_mapping) # Rename the away team columns
                            .assign(homeGame=0), sort=True)
                    .sort_values(by='gameId') # Sort the values
                    .reset_index(drop=True))
    return multi_line_stats

In [137]:
# Define a function which creates an EMA DataFrame from the stats DataFrame
def create_stats_features_ema(stats, span):
    # Create a restructured DataFrames so that we can calculate EMA
    multi_line_stats = create_multiline_df_stats(stats)
    
    # Create a copy of the DataFrame
    ema_features = multi_line_stats[['Date', 'season', 'gameId', 'Team', 'homeGame']].copy()
    
    # Get the columns that we want to create EMA for
    feature_names = multi_line_stats.drop(columns=['Date', 'season', 'gameId', 'Team', 'homeGame']).columns
    
    # Loop over the features
    for feature_name in feature_names:
        feature_ema = (multi_line_stats.groupby('Team')[feature_name] # Calculate the EMA
                                                  .transform(lambda row: row.ewm(span=span, min_periods=2)
                                                             .mean()
                                                             .shift(1))) # Shift the data down 1 so we don't leak data
        ema_features[feature_name] = feature_ema # Add the new feature to the DataFrame
    return ema_features

In [138]:
df_ema = create_stats_features_ema(df_ema, 50)
df_ema.tail()

Unnamed: 0,Date,season,gameId,Team,homeGame,cornersAgainst,cornersFor,freesAgainst,freesFor,goalsAgainst,...,halfTimeGoalsAgainst,halfTimeGoalsFor,redsAgainst,redsFor,shotsAgainst,shotsFor,shotsOnTargetAgainst,shotsOnTargetFor,yellowsAgainst,yellowsFor
9373,23/11/19,1920,4687,Man City,1,2.626776,8.111629,8.29266,9.177157,0.871916,...,0.487576,1.318577,0.027874,0.058707,6.882073,19.122113,2.884055,6.898211,1.440255,1.465864
9374,23/11/19,1920,4688,Man United,0,4.339675,5.604387,12.204488,11.287668,1.209943,...,0.462666,0.90608,0.06232,0.053829,11.777937,13.954955,4.247753,5.524457,2.099192,1.974357
9375,23/11/19,1920,4688,Sheffield United,1,6.293901,6.102289,8.126496,10.815063,0.782634,...,0.367666,0.399923,0.0,0.098825,11.459498,10.234758,3.604352,3.029771,1.304825,1.823662
9376,23/11/19,1920,4689,Aston Villa,1,6.814318,4.369647,11.553388,11.328192,1.762131,...,0.633122,0.423411,0.140659,0.084957,15.382232,11.239456,4.822071,3.983492,1.82082,1.855735
9377,23/11/19,1920,4689,Newcastle,0,6.224506,4.37521,9.477185,10.248687,1.396502,...,0.564611,0.57964,0.006388,0.102513,13.745526,11.076482,4.30394,3.538927,1.607436,1.556353


In [139]:
def restructure_stats_features(stats_features):
    non_features = ['homeGame', 'Team', 'gameId']

    stats_features_restructured = (stats_features.query('homeGame == 1')
                                    .rename(columns={col: 'f_' + col + 'Home' for col in stats_features.columns if col not in non_features})
                                    .rename(columns={'Team': 'HomeTeam'})
                                    .pipe(pd.merge, (stats_features.query('homeGame == 0')
                                                        .rename(columns={'Team': 'AwayTeam'})
                                                        .rename(columns={col: 'f_' + col + 'Away' for col in stats_features.columns 
                                                                         if col not in non_features})), on=['gameId'])
                                    .dropna())
    return stats_features_restructured

df_ema = restructure_stats_features(df_ema)
df_ema.tail()

Unnamed: 0,f_DateHome,f_seasonHome,gameId,HomeTeam,homeGame_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,...,f_halfTimeGoalsAgainstAway,f_halfTimeGoalsForAway,f_redsAgainstAway,f_redsForAway,f_shotsAgainstAway,f_shotsForAway,f_shotsOnTargetAgainstAway,f_shotsOnTargetForAway,f_yellowsAgainstAway,f_yellowsForAway
4684,23/11/19,1920,4685,Everton,1,4.570612,5.869241,11.008853,11.645733,1.415825,...,0.934468,0.428926,0.051606,0.049375,14.963195,10.938155,5.397906,3.575102,1.304914,1.583828
4685,23/11/19,1920,4686,Watford,1,5.563437,5.014713,9.240339,11.171406,1.739056,...,0.773973,0.573157,0.078689,0.007935,16.050848,9.889395,5.1494,3.340314,1.141406,1.80424
4686,23/11/19,1920,4687,Man City,1,2.626776,8.111629,8.29266,9.177157,0.871916,...,0.428059,0.936993,0.00523,0.011826,9.033066,16.434244,3.11812,5.816693,1.860391,1.582246
4687,23/11/19,1920,4688,Sheffield United,1,6.293901,6.102289,8.126496,10.815063,0.782634,...,0.462666,0.90608,0.06232,0.053829,11.777937,13.954955,4.247753,5.524457,2.099192,1.974357
4688,23/11/19,1920,4689,Aston Villa,1,6.814318,4.369647,11.553388,11.328192,1.762131,...,0.564611,0.57964,0.006388,0.102513,13.745526,11.076482,4.30394,3.538927,1.607436,1.556353


In [219]:
df_ema.drop(['f_DateHome', 'f_seasonHome', 'HomeTeam',
               'homeGame_x', 'f_yellowsAgainstAway', 'f_yellowsForAway',
               'f_halfTimeGoalsAgainstHome', 'f_halfTimeGoalsForHome','f_halfTimeGoalsAgainstAway', 'f_halfTimeGoalsForAway',
               'f_yellowsAgainstHome', 'f_yellowsForHome', 'f_DateAway', 'f_seasonAway', 
               'AwayTeam', 'homeGame_y'], 1, inplace=True)

In [220]:
df_ema.columns

Index(['gameId', 'f_cornersAgainstHome', 'f_cornersForHome',
       'f_freesAgainstHome', 'f_freesForHome', 'f_goalsAgainstHome',
       'f_goalsForHome', 'f_redsAgainstHome', 'f_redsForHome',
       'f_shotsAgainstHome', 'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway'],
      dtype='object')

# Get League stats

To get the league stats we will use the same 'features' DataFrame that we used to create EMA stats. We can use the same function we used previously to create league data.

In [70]:
def get_points(goals_scored, goals_conceded):
    if goals_scored > goals_conceded: 
        points = 3
    elif goals_conceded == goals_scored:
        points = 1
    else:
        points = 9
    return points

def get_result(goals_scored, goals_conceded):
    if goals_scored > goals_conceded: 
        result = 'W'
    elif goals_conceded == goals_scored:
        result = 'D'
    else:
        result = 'L'
    return result


def get_last_result(team):
    """
    Helper function which takes a team-name as a string and returns goal stats from
    their most recent match
    """
    with open(DATA_PATH + 'season1920.csv') as f:
        for row in reversed(list(csv.reader(f))):
            if team == row[4]:
                goals_scored = int(row[6])
                goals_conceded = int(row[7])

                return(goals_scored, goals_conceded)
            elif team == row[5]:
                goals_scored = int(row[7])
                goals_conceded = int(row[6])

                return goals_scored, goals_conceded

In [71]:
def create_row(home_team, away_team, home_odds, draw_odds, away_odds, matchweek):
    # First get last result data
    home_goals_scored, home_goals_against = get_last_result(home_team)
    home_result = get_result(home_goals_scored, home_goals_against)
    home_points = get_points(home_goals_scored, home_goals_against)
    
    away_goals_scored, away_goals_against = get_last_result(away_team)
    away_result = get_result(away_goals_scored, away_goals_against)
    away_points = get_points(away_goals_scored, away_goals_against)
    
    # Combine this with team data we already have, I created a csv
    # using existing fucntions without scaling.   
    with open(os.path.join(DATA_PATH, 'season1920_data.csv')) as f:
        reader = csv.reader(f)
        next(reader, None)
        for row in reader:
            if home_team == row[3]:
                htp = (int(row[12]) + home_points) / matchweek
                hm1 = home_result
                hm2 = row[17]
                hm3 = row[18]
                htgd = (float(row[35]) + home_goals_scored - home_goals_against) / matchweek
                ht_prev_pos = int(row[28])
                ht_form_pts = int(row[33])
                
            elif away_team == row[3]:
                atp = (int(row[12]) + away_points) / matchweek
                am1 = away_result
                am2 = row[17]
                am3 = row[18]
                atgd = (float(row[35]) + away_goals_scored - away_goals_against) / matchweek
                at_prev_pos = int(row[28])
                at_form_pts = int(row[33])
                
            elif home_team == row[4]:
                htp = (int(row[13]) + home_points) / matchweek
                hm1 = home_result
                hm2 = row[22]
                hm3 = row[23]
                htgd = (float(row[36]) + home_goals_scored - home_goals_against) / matchweek
                ht_prev_pos = int(row[29])
                ht_form_pts = int(row[34])
            elif away_team == row[4]:
                atp = (int(row[13]) + away_points) / matchweek
                am1 = away_result
                am2 = row[22]
                am3 = row[23]
                atgd = (float(row[36]) + away_goals_scored - away_goals_against) / matchweek
                at_prev_pos = int(row[29])
                at_form_pts = int(row[34])

        diff_lp = ht_prev_pos - at_prev_pos
        diff_points = (atp - htp) / matchweek
        diff_form_points = (ht_form_pts - at_form_pts) / matchweek
        
        # Scale cols by matchweek
        scale_cols = [htgd, atgd, diff_points, diff_form_points, htp, atp]

            
        row = [htp, atp, home_odds, draw_odds, away_odds, 
               hm1, hm2, hm3, am1, am2, am3, htgd, atgd, 
               diff_points, diff_form_points, diff_lp]
        
        # print(row)
        return row

In [72]:
headers = ['HTP', 'ATP', 'B365H', 'B365D', 'B365A', 
           'HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3', 
           'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP']

with open(os.path.join(DATA_PATH, 'fixtures_league_data.csv'), 'w') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    for i, row in fixtures.iterrows():
        # Last item in row is MatchWeek, entering manually for now
        row = create_row(row.HomeTeam, row.AwayTeam, row.B365H, 
                         row.B365D, row.B365A, 13.0)
        writer.writerow(row)

In [73]:
df_ema.columns

Index(['gameId', 'f_cornersAgainstHome', 'f_cornersForHome',
       'f_freesAgainstHome', 'f_freesForHome', 'f_goalsAgainstHome',
       'f_goalsForHome', 'f_redsAgainstHome', 'f_redsForHome',
       'f_shotsAgainstHome', 'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway'],
      dtype='object')

# Combine datasets


In [74]:
# Load league data from csv file
league_data = pd.read_csv('data/fixtures_league_data.csv')
league_data.drop(['HM1', 'HM2', 'HM3', 
                  'AM1', 'AM2', 'AM3'], 1, inplace=True)

In [75]:
league_data.columns

Index(['HTP', 'ATP', 'B365H', 'B365D', 'B365A', 'HTGD', 'ATGD', 'DiffPts',
       'DiffFormPts', 'DiffLP'],
      dtype='object')

In [76]:
# We only need upcoming fixtures from EMA stats
df_ema1920 = df_ema.tail(len(fixtures)).reset_index()

In [77]:
df = pd.concat([df_ema1920, league_data], axis=1)

In [78]:
df.drop(['index', 'gameId', ], 1, inplace=True)

In [79]:
df_ema1920

Unnamed: 0,index,gameId,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_redsAgainstHome,f_redsForHome,...,f_freesAgainstAway,f_freesForAway,f_goalsAgainstAway,f_goalsForAway,f_redsAgainstAway,f_redsForAway,f_shotsAgainstAway,f_shotsForAway,f_shotsOnTargetAgainstAway,f_shotsOnTargetForAway
0,4679,4680,5.253672,5.092266,10.98477,9.513928,1.578367,1.30516,0.012916,0.065,...,10.781989,10.460147,1.17605,1.807382,0.026828,0.105281,13.134934,13.62411,4.824346,4.911338
1,4680,4681,5.250959,6.607073,12.148222,10.658784,1.37141,1.74361,0.048748,0.06179,...,9.559583,10.832573,1.944528,1.076334,0.110849,0.105833,14.125585,12.467635,4.969314,4.235462
2,4681,4682,6.086491,5.305065,11.762375,8.821027,1.605923,1.35872,0.123098,0.021771,...,8.965712,11.314133,1.391347,1.221191,0.096645,0.094822,13.07009,11.909276,4.763435,4.164734
3,4682,4683,5.494016,3.980154,8.994444,11.943442,1.486001,1.058894,0.054871,0.140032,...,10.440937,10.13381,1.012814,1.708772,0.150553,0.09185,11.056777,12.93931,3.582671,4.531583
4,4683,4684,5.692033,5.060605,11.900044,11.244447,1.37315,1.050871,0.150077,0.020889,...,9.04244,8.963756,0.704484,2.36606,0.01566,0.035724,8.776648,16.165724,2.596261,6.239188
5,4684,4685,4.570612,5.869241,11.008853,11.645733,1.415825,1.310385,0.112579,0.113166,...,9.697092,10.476027,1.909196,0.977808,0.051606,0.049375,14.963195,10.938155,5.397906,3.575102
6,4685,4686,5.563437,5.014713,9.240339,11.171406,1.739056,1.080578,0.055694,0.103695,...,10.489977,9.836003,1.676825,1.165746,0.078689,0.007935,16.050848,9.889395,5.1494,3.340314
7,4686,4687,2.626776,8.111629,8.29266,9.177157,0.871916,2.752627,0.027874,0.058707,...,11.677604,9.031427,1.157174,1.93947,0.00523,0.011826,9.033066,16.434244,3.11812,5.816693
8,4687,4688,6.293901,6.102289,8.126496,10.815063,0.782634,1.113173,0.0,0.098825,...,12.204488,11.287668,1.209943,1.612689,0.06232,0.053829,11.777937,13.954955,4.247753,5.524457
9,4688,4689,6.814318,4.369647,11.553388,11.328192,1.762131,0.969415,0.140659,0.084957,...,9.477185,10.248687,1.396502,0.970187,0.006388,0.102513,13.745526,11.076482,4.30394,3.538927


In [80]:
league_data

Unnamed: 0,HTP,ATP,B365H,B365D,B365A,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,1.692308,1.076923,4.1,4.2,1.75,-0.307692,-0.076923,-0.047337,-0.230769,15
1,2.0,1.307692,1.45,4.75,6.5,-0.307692,-1.230769,-0.053254,0.384615,-11
2,1.923077,1.230769,2.75,3.3,2.6,-0.076923,0.153846,-0.053254,-0.230769,-6
3,1.846154,2.0,3.75,3.5,2.0,-0.538462,1.769231,0.011834,-0.230769,6
4,1.846154,2.615385,7.5,4.75,1.4,-0.615385,1.307692,0.059172,-0.461538,14
5,1.076923,1.230769,1.36,5.1,8.0,-0.307692,-1.230769,0.011834,0.230769,-10
6,0.615385,1.153846,2.3,3.4,3.1,-0.769231,-0.076923,0.04142,-0.076923,7
7,2.615385,2.0,1.44,5.0,6.5,1.153846,1.230769,-0.047337,-0.230769,-4
8,1.307692,1.230769,3.5,3.25,2.2,0.307692,0.153846,-0.005917,0.230769,16
9,1.538462,1.153846,2.05,3.4,3.7,-0.153846,-0.384615,-0.029586,0.0,8


# Make Predictions

Now that we have prepared the data we can load our model and make predictions.

In [81]:
scaler = StandardScaler()
X = scaler.fit_transform(df)

In [82]:
model = load_model('17Nov19.h5')

In [83]:
predictions = model.predict(X)

In [84]:
predictions

array([[0.29486272, 0.35174876, 0.35338855],
       [0.89924085, 0.09371573, 0.00704343],
       [0.5833117 , 0.27206782, 0.14462055],
       [0.01344349, 0.13914071, 0.8474158 ],
       [0.00559315, 0.10550558, 0.88890123],
       [0.74585706, 0.18754365, 0.06659932],
       [0.07383332, 0.2220051 , 0.7041616 ],
       [0.97624004, 0.01938005, 0.00437993],
       [0.7944554 , 0.11802914, 0.08751538],
       [0.69937897, 0.22978687, 0.07083414]], dtype=float32)

In [85]:
for i in range(len(X)):
    pred = np.argmax(predictions[i])
    if pred == 0:
        odds_diff = predictions[i][pred] - (1/league_data['B365H'][i])
        print(i, 'Prediction:', pred, 'Odds diff:', odds_diff)
    elif pred == 1:
        odds_diff = predictions[i][pred] - (1/league_data['B365D'][i])
        print(i, 'Prediction:', pred, 'Odds diff:', odds_diff)
    else:
        odds_diff = predictions[i][pred] - (1/league_data['B365A'][i])
        print(i, 'Prediction:', pred, 'Odds diff:', odds_diff)

0 Prediction: 2 Odds diff: -0.21804002353123253
1 Prediction: 0 Odds diff: 0.20958567898848957
2 Prediction: 0 Odds diff: 0.2196753133427013
3 Prediction: 2 Odds diff: 0.3474158048629761
4 Prediction: 2 Odds diff: 0.1746155193873814
5 Prediction: 0 Odds diff: 0.010562942308538159
6 Prediction: 2 Odds diff: 0.3815809392159985
7 Prediction: 0 Odds diff: 0.2817955944273207
8 Prediction: 0 Odds diff: 0.5087411233357021
9 Prediction: 0 Odds diff: 0.21157408923637572


# Sanity check

To try to make sure we have correct data we will make predictions on matches this season that have already happenend and then compare these do our prediction we made previously at ~64%.

To do this we need to first combine EMA_stats with league stats, similar to above process.

In [201]:
league_data = pd.read_csv('data/league_data.csv')
league_data.drop([ 'Date', 'HomeTeam', 'AwayTeam', 
                  'gameId', 'HTGS', 'ATGS', 'HTGC',
                  'ATGC', 'HM1', 'HM2', 'HM3', 'HM4', 'HM5', 'AM1', 
                  'AM2', 'AM3', 'AM4', 'AM5', 'HTFormPts',
                 'ATFormPts', 'HTFormPtsStr', 'ATFormPtsStr'], 1, inplace=True)

In [202]:
league_data = league_data.loc[league_data['season'] == 1920].reset_index(drop=True)
league_data.drop([10, 30], inplace=True)
league_data.reset_index(drop=True)

Unnamed: 0.1,Unnamed: 0,season,FTR,HTP,ATP,B365H,B365D,B365A,MW,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,4560,1920,H,0.000000,0.000000,1.14,10.00,19.00,1.0,0.000000,0.000000,0.000000,0.000000,-14
1,4561,1920,A,0.000000,0.000000,12.00,6.50,1.22,1.0,0.000000,0.000000,0.000000,0.000000,17
2,4562,1920,D,0.000000,0.000000,1.95,3.60,3.60,1.0,0.000000,0.000000,0.000000,0.000000,-6
3,4563,1920,H,0.000000,0.000000,2.62,3.20,2.75,1.0,0.000000,0.000000,0.000000,0.000000,-10
4,4564,1920,D,0.000000,0.000000,3.00,3.25,2.37,1.0,0.000000,0.000000,0.000000,0.000000,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,4675,1920,D,1.083333,1.333333,1.57,4.00,6.00,12.0,-0.083333,0.333333,-0.250000,-0.250000,-15
114,4676,1920,H,1.916667,1.416667,2.00,3.75,3.50,12.0,1.750000,-0.166667,0.500000,0.500000,3
115,4677,1920,H,1.083333,1.250000,1.60,3.75,6.50,12.0,0.000000,-0.416667,-0.166667,-0.333333,-13
116,4678,1920,H,1.083333,0.916667,1.90,3.50,4.20,12.0,0.083333,-0.083333,0.166667,0.166667,0


In [203]:
league_data

Unnamed: 0.1,Unnamed: 0,season,FTR,HTP,ATP,B365H,B365D,B365A,MW,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,4560,1920,H,0.000000,0.000000,1.14,10.00,19.00,1.0,0.000000,0.000000,0.000000,0.000000,-14
1,4561,1920,A,0.000000,0.000000,12.00,6.50,1.22,1.0,0.000000,0.000000,0.000000,0.000000,17
2,4562,1920,D,0.000000,0.000000,1.95,3.60,3.60,1.0,0.000000,0.000000,0.000000,0.000000,-6
3,4563,1920,H,0.000000,0.000000,2.62,3.20,2.75,1.0,0.000000,0.000000,0.000000,0.000000,-10
4,4564,1920,D,0.000000,0.000000,3.00,3.25,2.37,1.0,0.000000,0.000000,0.000000,0.000000,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,4675,1920,D,1.083333,1.333333,1.57,4.00,6.00,12.0,-0.083333,0.333333,-0.250000,-0.250000,-15
116,4676,1920,H,1.916667,1.416667,2.00,3.75,3.50,12.0,1.750000,-0.166667,0.500000,0.500000,3
117,4677,1920,H,1.083333,1.250000,1.60,3.75,6.50,12.0,0.000000,-0.416667,-0.166667,-0.333333,-13
118,4678,1920,H,1.083333,0.916667,1.90,3.50,4.20,12.0,0.083333,-0.083333,0.166667,0.166667,0


In [204]:
league_data.columns

Index(['Unnamed: 0', 'season', 'FTR', 'HTP', 'ATP', 'B365H', 'B365D', 'B365A',
       'MW', 'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP'],
      dtype='object')

In [205]:
league_data.drop(['MW', 'Unnamed: 0', 'season'], 1, inplace=True)
# league_data.reset_index(inplace=True)

In [206]:
league_data.head()

Unnamed: 0,FTR,HTP,ATP,B365H,B365D,B365A,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,H,0.0,0.0,1.14,10.0,19.0,0.0,0.0,0.0,0.0,-14
1,A,0.0,0.0,12.0,6.5,1.22,0.0,0.0,0.0,0.0,17
2,D,0.0,0.0,1.95,3.6,3.6,0.0,0.0,0.0,0.0,-6
3,H,0.0,0.0,2.62,3.2,2.75,0.0,0.0,0.0,0.0,-10
4,D,0.0,0.0,3.0,3.25,2.37,0.0,0.0,0.0,0.0,10


In [207]:
len(league_data.columns)

11

In [208]:
df_ema.columns

Index(['f_DateHome', 'f_seasonHome', 'gameId', 'HomeTeam', 'homeGame_x',
       'f_cornersAgainstHome', 'f_cornersForHome', 'f_freesAgainstHome',
       'f_freesForHome', 'f_goalsAgainstHome', 'f_goalsForHome',
       'f_halfTimeGoalsAgainstHome', 'f_halfTimeGoalsForHome',
       'f_redsAgainstHome', 'f_redsForHome', 'f_shotsAgainstHome',
       'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_yellowsAgainstHome', 'f_yellowsForHome',
       'f_DateAway', 'f_seasonAway', 'AwayTeam', 'homeGame_y',
       'f_cornersAgainstAway', 'f_cornersForAway', 'f_freesAgainstAway',
       'f_freesForAway', 'f_goalsAgainstAway', 'f_goalsForAway',
       'f_halfTimeGoalsAgainstAway', 'f_halfTimeGoalsForAway',
       'f_redsAgainstAway', 'f_redsForAway', 'f_shotsAgainstAway',
       'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway', 'f_yellowsAgainstAway', 'f_yellowsForAway'],
      dtype='object')

In [209]:
len(df_ema)

4632

In [221]:
ema1920 = df_ema.loc[(df_ema['gameId'] > 4559) & (df_ema['gameId'] < 4680)].reset_index()

In [222]:
len(ema1920)

118

In [223]:
len(league_data)

118

In [224]:
df1920 = pd.concat([ema1920, league_data], axis=1)

In [225]:
df1920.columns

Index(['index', 'gameId', 'f_cornersAgainstHome', 'f_cornersForHome',
       'f_freesAgainstHome', 'f_freesForHome', 'f_goalsAgainstHome',
       'f_goalsForHome', 'f_redsAgainstHome', 'f_redsForHome',
       'f_shotsAgainstHome', 'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway', 'FTR', 'HTP', 'ATP', 'B365H', 'B365D',
       'B365A', 'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP'],
      dtype='object')

In [226]:
df1920.drop(['index', 'gameId'], 1, inplace=True)

In [227]:
df1920.columns

Index(['f_cornersAgainstHome', 'f_cornersForHome', 'f_freesAgainstHome',
       'f_freesForHome', 'f_goalsAgainstHome', 'f_goalsForHome',
       'f_redsAgainstHome', 'f_redsForHome', 'f_shotsAgainstHome',
       'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway', 'FTR', 'HTP', 'ATP', 'B365H', 'B365D',
       'B365A', 'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP'],
      dtype='object')

In [228]:
X = df1920.drop(['FTR'], 1)
y_true = df1920['FTR']

In [229]:
X.shape[1]

34

In [230]:
def transform_results(results):
    transformed = []
    for i in range(len(results)):
        if results[i] == 'H':
            transformed.append(0)
        elif results[i] == 'A':
            transformed.append(2)
        else:
            transformed.append(1)
    return np.array(transformed)

y_true = transform_results(y_true)

In [231]:
y_true

array([0, 2, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1, 0, 0, 2, 1, 0, 1, 1, 0, 2,
       2, 2, 2, 2, 0, 2, 2, 1, 1, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 1, 0, 2,
       0, 2, 0, 0, 1, 1, 2, 0, 0, 2, 0, 1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 0,
       0, 2, 0, 1, 0, 0, 0, 2, 1, 2, 0, 2, 2, 0, 0, 0, 1, 0, 0, 1, 1, 2,
       1, 0, 2, 0, 0, 1, 1, 2, 1, 1, 0, 2, 0, 1, 2, 0, 0, 0, 2, 2, 2, 1,
       2, 0, 0, 0, 2, 1, 0, 0, 0, 0])

In [232]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [233]:
model = load_model('17Nov19.h5')


_, acc = model.evaluate(X, y_true)
acc



0.6083333492279053

In [234]:
y_preds = model.predict(X)

funds = 100
wager = 10
favourites = 0
no_bets = 0
min_diff = 0.03

for i in range(len(X)):
    prediction = np.argmax(y_preds[i])
    print('\nPrediction', prediction)
    print('Actual', y_true[i])
    print('Favourite', np.argmin([df1920['B365H'][i], df1920['B365D'][i], 
                                  df1920['B365A'][i]]))
    print('Prediction proba', y_preds[i])
    print('Home, Draw and Away odds', df1920['B365H'][i],       
          df1920['B365D'][i], df1920['B365A'][i])
    
        

    if prediction == 0:
        odds_diff = y_preds[i][prediction] - (1/df1920['B365H'][i])
        # If odds_diff positive place bet
        if odds_diff > min_diff:
            if prediction == np.argmin([df1920['B365H'][i], df1920['B365D'][i], 
                                  df1920['B365A'][i]]):
                favourites +=1
                
            if  prediction == y_true[i]:
                funds += (wager * df1920['B365H'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
    elif prediction == 1:
        odds_diff = y_preds[i][prediction] - (1/df1920['B365D'][i])
        if odds_diff > min_diff:
            if prediction == np.argmin([df1920['B365H'][i], df1920['B365D'][i], 
                                  df1920['B365A'][i]]):
                favourites +=1
            if  prediction == y_true[i]:
                funds +=( wager * df1920['B365D'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
    else:
        odds_diff = y_preds[i][prediction] - (1/df1920['B365A'][i])
        if odds_diff >  min_diff:
            if prediction == np.argmin([df1920['B365H'][i], df1920['B365D'][i], 
                                  df1920['B365A'][i]]):
                favourites +=1
            if  prediction == y_true[i]:
                funds += (wager * df1920['B365A'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1

        
    
    
    print('Funds', funds)
    
print(f'Betted on favourite {favourites} times out of {len(X)} matches.')
print(f'No bet placed {no_bets} times')
         


Prediction 0
Actual 0
Favourite 0
Prediction proba [0.5430389  0.25684983 0.20011123]
Home, Draw and Away odds 1.14 10.0 19.0
Funds 100

Prediction 2
Actual 2
Favourite 2
Prediction proba [0.08087105 0.22293802 0.69619095]
Home, Draw and Away odds 12.0 6.5 1.22
Funds 100

Prediction 0
Actual 1
Favourite 0
Prediction proba [0.37600842 0.30000865 0.32398286]
Home, Draw and Away odds 1.95 3.6 3.6
Funds 100

Prediction 0
Actual 0
Favourite 0
Prediction proba [0.57025194 0.30299962 0.12674843]
Home, Draw and Away odds 2.62 3.2 2.75
Funds 116.2

Prediction 2
Actual 1
Favourite 2
Prediction proba [0.3167887  0.26941282 0.4137985 ]
Home, Draw and Away odds 3.0 3.25 2.37
Funds 116.2

Prediction 0
Actual 2
Favourite 0
Prediction proba [0.37060124 0.29953223 0.3298665 ]
Home, Draw and Away odds 1.9 3.4 4.0
Funds 116.2

Prediction 0
Actual 0
Favourite 0
Prediction proba [0.35839924 0.32007053 0.3215302 ]
Home, Draw and Away odds 1.3 5.25 10.0
Funds 116.2

Prediction 0
Actual 1
Favourite 0
Predict

In [51]:
headers = ['f_cornersAgainstHome', 'f_cornersForHome', 'f_freesAgainstHome',
       'f_freesForHome', 'f_goalsAgainstHome', 'f_goalsForHome',
       'f_redsAgainstHome', 'f_redsForHome', 'f_shotsAgainstHome',
       'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway', 'HTP', 'ATP', 'B365H', 'B365D',
       'B365A', 'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP']
X_testdf = pd.DataFrame(np.load('X_test.npy'), columns=headers)

In [52]:
Xdf = pd.DataFrame(X, columns=headers)

In [53]:
X_testdf.describe()

Unnamed: 0,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_redsAgainstHome,f_redsForHome,f_shotsAgainstHome,f_shotsForHome,...,HTP,ATP,B365H,B365D,B365A,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
count,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,...,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0
mean,-9.803834e-16,-1.034954e-15,1.110223e-16,-1.885497e-15,-4.120997e-16,3.951641e-16,3.32126e-16,-1.138449e-16,-3.067226e-16,1.264525e-15,...,1.881734e-16,-9.596843000000001e-17,3.481208e-17,8.467803000000001e-17,-3.8105110000000004e-17,-9.40867e-18,2.069907e-17,-7.526936e-18,-4.469118e-17,1.4113000000000002e-17
std,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,...,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264
min,-2.651698,-1.867422,-3.740896,-2.391947,-2.281603,-2.286785,-1.301356,-1.493707,-2.511603,-1.491278,...,-1.681081,-1.628375,-0.7393457,-0.6997541,-0.774073,-2.267245,-2.482586,-2.444337,-2.54,-1.852559
25%,-0.4630868,-0.6672563,-0.6466079,-0.9385521,-0.5124221,-0.7049074,-0.8190233,-0.8160852,-0.4117919,-0.6166028,...,-0.6080218,-0.5516837,-0.4927866,-0.5337934,-0.5208719,-0.5284336,-0.5258274,-0.5722641,-0.557791,-0.6454197
50%,0.1828175,-0.08307034,-0.03821255,0.2685508,0.02620386,-0.3485719,-0.08429974,-0.09866711,0.2178389,-0.2828743,...,0.00766787,-0.01333816,-0.2873206,-0.3816627,-0.3218917,-0.04052848,0.04017724,0.01891675,0.008554501,0.01301999
75%,0.6451789,0.4945008,0.8621345,0.6766743,0.7968037,0.6409257,0.4420337,0.7526667,0.5824583,0.3175506,...,0.5705841,0.5923005,0.006202119,0.2130298,0.002921717,0.4162338,0.492981,0.5115674,0.4333136,0.8909396
max,2.066293,3.026837,1.499792,2.138627,1.967815,2.946465,3.071933,2.433209,2.134145,3.096948,...,2.751885,2.678389,4.71235,4.776949,4.312455,2.928426,2.951058,2.38364,2.727013,1.878599


In [54]:
Xdf.describe()

Unnamed: 0,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_redsAgainstHome,f_redsForHome,f_shotsAgainstHome,f_shotsForHome,...,HTP,ATP,B365H,B365D,B365A,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
count,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,...,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0
mean,-1.027427e-15,-1.059416e-15,2.916688e-17,-1.836572e-15,-3.970459e-16,3.650564e-16,3.067226e-16,-1.138449e-16,4.74197e-16,1.162912e-15,...,7.903283000000001e-17,5.6452020000000004e-18,1.533613e-16,-1.119632e-16,-1.110223e-16,7.526936e-18,-1.1290400000000001e-17,1.975821e-17,4.704334999999999e-19,-8.938236e-18
std,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,...,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264
min,-2.651698,-1.867422,-3.740896,-2.391947,-2.281603,-2.286785,-1.301356,-1.493707,-2.511603,-1.491278,...,-1.749821,-1.697553,-0.7540051,-0.6953932,-0.785161,-2.2712,-2.468874,-2.439951,-2.534558,-1.887751
25%,-0.4630868,-0.6672563,-0.6466079,-0.9385521,-0.5124221,-0.7049074,-0.8190233,-0.8160852,-0.4117919,-0.6166028,...,-0.5999776,-0.5785902,-0.4931491,-0.5214958,-0.5235452,-0.5360211,-0.5443738,-0.5688403,-0.5537888,-0.6644356
50%,0.1828175,-0.08307034,-0.03821255,0.2685508,0.02620386,-0.3485719,-0.08429974,-0.09866711,0.2178389,-0.2828743,...,-0.02505606,-0.04855505,-0.2757691,-0.4055642,-0.3179521,-0.04913508,0.048711,0.02203668,0.01214533,0.002827386
75%,0.6451789,0.4945008,0.8621345,0.6766743,0.7968037,0.6409257,0.4420337,0.7526667,0.5824583,0.3175506,...,0.5498655,0.5698193,0.03477378,0.2393055,0.01765605,0.4377509,0.5005852,0.5144342,0.4365959,0.8925114
max,2.066293,3.026837,1.499792,2.138627,1.967815,2.946465,3.071933,2.433209,2.134145,3.096948,...,2.777687,2.699775,5.013811,5.043222,4.47041,2.913618,2.953616,2.385545,2.728629,1.893406


In [122]:
test_df = pd.read_csv('data/all_seasons_joined.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [123]:
test_df = test_df.loc[test_df['season'] == 1920]

In [124]:
len(test_df)

120

In [130]:
test_df[30:31]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
4591,4591,30,E0,31/08/2019,Southampton,Man United,1.0,1.0,D,0.0,...,1.91,0.5,1.94,1.99,1.94,1.99,1.95,2.0,1.92,1.97
