In [2]:
import os
import pandas as pd
from scrapers import *
from keras.models import load_model
from sklearn.preprocessing import StandardScaler

DATA_PATH = 'data/'

# Predict upcoming games

To predict upcoming games we will need to:
1. Scrape upcoming fixtures with odds
2. Concatentate these onto results so far for season 19/20
3. Use the same functions previously to create data for our model

# Get EMA stats

The most tricky part is getting EMA stats for the upcoming fixtures, the league stats should be straight forward. 

In [3]:
scraper = FixtureScraper()
fixtures = scraper.get_fixtures()
scraper.close_driver()

fixtures.replace({'Man Utd': 'Man United',
                  'C Palace': 'Crystal Palace',
                  'Sheff Utd': 'Sheffield United',
                  'Sheffield Utd': 'Sheffield United'},
                   inplace=True)

In [14]:
df_ema = (pd.read_csv(os.path.join(DATA_PATH, 'all_seasons_joined.csv'))
                    .assign(Date=lambda df: pd.to_datetime(df.Date))
                    .pipe(lambda df: df.dropna(thresh=len(df) - 2, axis=1))  # Drop cols with NAs
                    .dropna(axis=0)  # Drop rows with NAs
                    .sort_values('Unnamed: 0')
                    .append(fixtures, sort=True)
                    .reset_index(drop=True)
                    .assign(gameId=lambda df: list(df.index + 1))
                    )

In [17]:
df_ema.tail()

Unnamed: 0.2,AC,AF,AR,AS,AST,AY,AwayTeam,B365A,B365D,B365H,...,HY,HomeTeam,Referee,Unnamed: 0,Unnamed: 0.1,VCA,VCD,VCH,season,gameId
4694,,,,,,,Watford,3.5,3.4,2.05,...,,Southampton,,,,,,,1920,4695
4695,,,,,,,Arsenal,1.8,3.8,4.0,...,,Norwich,,,,,,,1920,4696
4696,,,,,,,Sheffield United,4.0,3.4,1.9,...,,Wolves,,,,,,,1920,4697
4697,,,,,,,Aston Villa,6.5,4.5,1.44,...,,Man United,,,,,,,1920,4698
4698,,,,,,,Everton,5.0,3.8,1.65,...,,Leicester,,,,,,,1920,4699


In [18]:
# Drop betting odds as we can use league stats function to get these
df_ema.drop(['B365H', 'B365D', 'B365A'], 1, inplace=True)

In [19]:
# Define a function which restructures our DataFrame
def create_multiline_df_stats(old_stats_df):
    # Create a list of columns we want and their mappings to more interpretable names
    home_stats_cols = ['Date', 'season', 'HomeTeam', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY',
                       'HR', 'AR']
    
    away_stats_cols = ['Date', 'season', 'AwayTeam', 'FTAG', 'FTHG', 'HTAG', 'HTHG', 'AS', 'HS', 'AST', 'HST', 'AF', 'HF', 'AC', 'HC', 'AY', 'HY',
                       'AR', 'HR']
    
    stats_cols_mapping = ['Date', 'season', 'Team', 'goalsFor', 'goalsAgainst', 'halfTimeGoalsFor', 
                          'halfTimeGoalsAgainst', 'shotsFor', 'shotsAgainst', 'shotsOnTargetFor',
                          'shotsOnTargetAgainst', 'freesFor', 'freesAgainst', 'cornersFor', 
                          'cornersAgainst', 'yellowsFor', 'yellowsAgainst', 'redsFor', 'redsAgainst']
    
    # Create a dictionary of the old column names to new column names
    home_mapping = {old_col: new_col for old_col, new_col in zip(home_stats_cols, stats_cols_mapping)}
    away_mapping = {old_col: new_col for old_col, new_col in zip(away_stats_cols, stats_cols_mapping)}
    
    # Put each team onto an individual row
    multi_line_stats = (old_stats_df[['gameId'] + home_stats_cols] # Filter for only the home team columns
                    .rename(columns=home_mapping) # Rename the columns
                    .assign(homeGame=1) # Assign homeGame=1 so that we can use a general function later
                    .append((old_stats_df[['gameId'] + away_stats_cols]) # Append the away team columns
                            .rename(columns=away_mapping) # Rename the away team columns
                            .assign(homeGame=0), sort=True)
                    .sort_values(by='gameId') # Sort the values
                    .reset_index(drop=True))
    return multi_line_stats

In [20]:
# Define a function which creates an EMA DataFrame from the stats DataFrame
def create_stats_features_ema(stats, span):
    # Create a restructured DataFrames so that we can calculate EMA
    multi_line_stats = create_multiline_df_stats(stats)
    
    # Create a copy of the DataFrame
    ema_features = multi_line_stats[['Date', 'season', 'gameId', 'Team', 'homeGame']].copy()
    
    # Get the columns that we want to create EMA for
    feature_names = multi_line_stats.drop(columns=['Date', 'season', 'gameId', 'Team', 'homeGame']).columns
    
    # Loop over the features
    for feature_name in feature_names:
        feature_ema = (multi_line_stats.groupby('Team')[feature_name] # Calculate the EMA
                                                  .transform(lambda row: row.ewm(span=span, min_periods=2)
                                                             .mean()
                                                             .shift(1))) # Shift the data down 1 so we don't leak data
        ema_features[feature_name] = feature_ema # Add the new feature to the DataFrame
    return ema_features

In [21]:
df_ema = create_stats_features_ema(df_ema, 50)
df_ema.tail()

Unnamed: 0,Date,season,gameId,Team,homeGame,cornersAgainst,cornersFor,freesAgainst,freesFor,goalsAgainst,...,halfTimeGoalsAgainst,halfTimeGoalsFor,redsAgainst,redsFor,shotsAgainst,shotsFor,shotsOnTargetAgainst,shotsOnTargetFor,yellowsAgainst,yellowsFor
9393,30/11/19,1920,4697,Wolves,1,5.337415,5.232778,9.18389,10.95792,1.325211,...,0.664515,0.486696,0.123622,0.083231,12.844085,11.969167,4.395726,4.095568,1.840712,1.854479
9394,30/11/19,1920,4698,Aston Villa,0,6.824382,4.381318,11.51547,11.003606,1.945397,...,0.616616,0.461958,0.134666,0.099228,15.227764,11.173344,4.967037,4.054325,1.720029,1.758939
9395,30/11/19,1920,4698,Man United,1,4.430731,5.494909,11.91383,10.79416,1.240951,...,0.438717,0.801489,0.042653,0.045524,11.435094,14.120204,4.188806,5.37634,2.085369,1.981248
9396,30/11/19,1920,4699,Leicester,1,4.738357,6.083126,10.369843,9.637158,1.014663,...,0.566415,0.570298,0.137931,0.061986,10.755939,13.98075,3.479016,5.0507,1.692849,1.236327
9397,30/11/19,1920,4699,Everton,0,4.661124,6.041594,10.574664,11.623101,1.322742,...,0.428907,0.623753,0.094323,0.104197,9.98769,13.30182,3.673723,4.596235,1.973455,1.517746


In [22]:
def restructure_stats_features(stats_features):
    non_features = ['homeGame', 'Team', 'gameId']

    stats_features_restructured = (stats_features.query('homeGame == 1')
                                    .rename(columns={col: 'f_' + col + 'Home' for col in stats_features.columns if col not in non_features})
                                    .rename(columns={'Team': 'HomeTeam'})
                                    .pipe(pd.merge, (stats_features.query('homeGame == 0')
                                                        .rename(columns={'Team': 'AwayTeam'})
                                                        .rename(columns={col: 'f_' + col + 'Away' for col in stats_features.columns 
                                                                         if col not in non_features})), on=['gameId'])
                                    .dropna())
    return stats_features_restructured

df_ema = restructure_stats_features(df_ema)
df_ema.tail()

Unnamed: 0,f_DateHome,f_seasonHome,gameId,HomeTeam,homeGame_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,...,f_halfTimeGoalsAgainstAway,f_halfTimeGoalsForAway,f_redsAgainstAway,f_redsForAway,f_shotsAgainstAway,f_shotsForAway,f_shotsOnTargetAgainstAway,f_shotsOnTargetForAway,f_yellowsAgainstAway,f_yellowsForAway
4694,30/11/19,1920,4695,Southampton,1,6.122353,4.885765,9.429858,11.235308,1.987825,...,0.802975,0.373721,0.039294,0.107389,13.166968,11.909246,5.111947,3.678857,1.572426,2.055737
4695,30/11/19,1920,4696,Norwich,1,6.25627,4.943509,10.230639,10.486938,1.867071,...,0.656924,0.72912,0.039354,0.078882,14.52167,12.727828,5.01615,4.547976,1.961493,2.154146
4696,30/11/19,1920,4697,Wolves,1,5.337415,5.232778,9.18389,10.95792,1.325211,...,0.285518,0.52,0.0,0.070219,11.418729,10.575018,3.607355,3.62812,1.460244,1.775791
4697,30/11/19,1920,4698,Man United,1,4.430731,5.494909,11.91383,10.79416,1.240951,...,0.616616,0.461958,0.134666,0.099228,15.227764,11.173344,4.967037,4.054325,1.720029,1.758939
4698,30/11/19,1920,4699,Leicester,1,4.738357,6.083126,10.369843,9.637158,1.014663,...,0.428907,0.623753,0.094323,0.104197,9.98769,13.30182,3.673723,4.596235,1.973455,1.517746


In [23]:
df_ema.drop(['f_DateHome', 'f_seasonHome', 'HomeTeam',
               'homeGame_x', 'f_yellowsAgainstAway', 'f_yellowsForAway',
               'f_halfTimeGoalsAgainstHome', 'f_halfTimeGoalsForHome','f_halfTimeGoalsAgainstAway', 'f_halfTimeGoalsForAway',
               'f_yellowsAgainstHome', 'f_yellowsForHome', 'f_DateAway', 'f_seasonAway', 
               'AwayTeam', 'homeGame_y'], 1, inplace=True)

In [24]:
df_ema.columns

Index(['gameId', 'f_cornersAgainstHome', 'f_cornersForHome',
       'f_freesAgainstHome', 'f_freesForHome', 'f_goalsAgainstHome',
       'f_goalsForHome', 'f_redsAgainstHome', 'f_redsForHome',
       'f_shotsAgainstHome', 'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway'],
      dtype='object')

# Get League stats

To get the league stats we will use the same 'features' DataFrame that we used to create EMA stats. We can use the same function we used previously to create league data.

In [25]:
def get_points(goals_scored, goals_conceded):
    if goals_scored > goals_conceded: 
        points = 3
    elif goals_conceded == goals_scored:
        points = 1
    else:
        points = 9
    return points

def get_result(goals_scored, goals_conceded):
    if goals_scored > goals_conceded: 
        result = 'W'
    elif goals_conceded == goals_scored:
        result = 'D'
    else:
        result = 'L'
    return result


def get_last_result(team):
    """
    Helper function which takes a team-name as a string and returns goal stats from
    their most recent match
    """
    with open(DATA_PATH + 'season1920.csv') as f:
        for row in reversed(list(csv.reader(f))):
            if team == row[4]:
                goals_scored = int(row[6])
                goals_conceded = int(row[7])

                return(goals_scored, goals_conceded)
            elif team == row[5]:
                goals_scored = int(row[7])
                goals_conceded = int(row[6])

                return goals_scored, goals_conceded

In [26]:
def create_row(home_team, away_team, home_odds, draw_odds, away_odds, matchweek):
    # First get last result data
    home_goals_scored, home_goals_against = get_last_result(home_team)
    home_result = get_result(home_goals_scored, home_goals_against)
    home_points = get_points(home_goals_scored, home_goals_against)
    
    away_goals_scored, away_goals_against = get_last_result(away_team)
    away_result = get_result(away_goals_scored, away_goals_against)
    away_points = get_points(away_goals_scored, away_goals_against)
    
    # Combine this with team data we already have, I created a csv
    # using existing fucntions without scaling.   
    with open(os.path.join(DATA_PATH, 'season1920_data.csv')) as f:
        reader = csv.reader(f)
        next(reader, None)
        for row in reader:
            if home_team == row[3]:
                htp = (int(row[12]) + home_points) / matchweek
                hm1 = home_result
                hm2 = row[17]
                hm3 = row[18]
                htgd = (float(row[35]) + home_goals_scored - home_goals_against) / matchweek
                ht_prev_pos = int(row[28])
                ht_form_pts = int(row[33])
                
            elif away_team == row[3]:
                atp = (int(row[12]) + away_points) / matchweek
                am1 = away_result
                am2 = row[17]
                am3 = row[18]
                atgd = (float(row[35]) + away_goals_scored - away_goals_against) / matchweek
                at_prev_pos = int(row[28])
                at_form_pts = int(row[33])
                
            elif home_team == row[4]:
                htp = (int(row[13]) + home_points) / matchweek
                hm1 = home_result
                hm2 = row[22]
                hm3 = row[23]
                htgd = (float(row[36]) + home_goals_scored - home_goals_against) / matchweek
                ht_prev_pos = int(row[29])
                ht_form_pts = int(row[34])
            elif away_team == row[4]:
                atp = (int(row[13]) + away_points) / matchweek
                am1 = away_result
                am2 = row[22]
                am3 = row[23]
                atgd = (float(row[36]) + away_goals_scored - away_goals_against) / matchweek
                at_prev_pos = int(row[29])
                at_form_pts = int(row[34])

        diff_lp = ht_prev_pos - at_prev_pos
        diff_points = (atp - htp) / matchweek
        diff_form_points = (ht_form_pts - at_form_pts) / matchweek
          
        row = [htp, atp, home_odds, draw_odds, away_odds, 
               hm1, hm2, hm3, am1, am2, am3, htgd, atgd, 
               diff_points, diff_form_points, diff_lp]
        
        # print(row)
        return row

In [27]:
headers = ['HTP', 'ATP', 'B365H', 'B365D', 'B365A', 
           'HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3', 
           'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP']

with open(os.path.join(DATA_PATH, 'fixtures_league_data.csv'), 'w') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    for i, row in fixtures.iterrows():
        # Last item in row is MatchWeek, entering manually for now
        row = create_row(row.HomeTeam, row.AwayTeam, row.B365H, 
                         row.B365D, row.B365A, 13.0)
        writer.writerow(row)

In [37]:
len(df_ema.columns)

25

# Combine datasets


In [29]:
# Load league data from csv file
league_data = pd.read_csv('data/fixtures_league_data.csv')
league_data.drop(['HM1', 'HM2', 'HM3', 
                  'AM1', 'AM2', 'AM3'], 1, inplace=True)

In [36]:
len(league_data.columns)

10

In [35]:
len(league_data)

10

In [31]:
# We only need upcoming fixtures from EMA stats
df_ema1920 = df_ema.tail(len(fixtures)).reset_index()

In [38]:
df = pd.concat([df_ema1920, league_data], axis=1)

In [39]:
df.drop(['index', 'gameId', ], 1, inplace=True)

In [42]:
df

Unnamed: 0,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_redsAgainstHome,f_redsForHome,f_shotsAgainstHome,f_shotsForHome,...,HTP,ATP,B365H,B365D,B365A,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,6.072016,4.124099,9.295369,10.01256,1.399784,1.218812,0.015796,0.088194,13.132669,11.460488,...,1.0,2.153846,13.0,7.5,1.16,-0.461538,1.384615,0.088757,-0.384615,9
1,5.045049,5.40636,10.505461,9.841266,1.205957,1.661953,0.026282,0.124352,12.718355,13.88848,...,1.230769,1.923077,1.36,5.0,7.5,0.0,-0.076923,0.053254,-0.076923,-9
2,3.59274,5.717618,10.775021,8.962049,1.193844,1.813473,0.00611,0.007749,9.304298,16.100898,...,2.461538,1.692308,1.25,5.25,13.0,1.0,-0.153846,-0.059172,1.0,-13
3,4.05936,6.469869,9.174339,8.413665,0.754456,2.318409,0.006359,0.023234,8.950306,15.711005,...,2.615385,1.846154,1.2,6.5,13.0,1.230769,-0.538462,-0.059172,0.307692,-11
4,6.159495,4.639604,10.166842,9.872408,1.525016,1.345427,0.080069,0.013602,15.161468,10.203643,...,1.153846,1.846154,2.25,3.2,3.25,-0.076923,-0.538462,0.053254,-0.230769,-11
5,6.122353,4.885765,9.429858,11.235308,1.987825,1.134511,0.104997,0.102143,14.266798,11.997247,...,0.692308,1.076923,2.05,3.4,3.5,-1.153846,-1.153846,0.029586,-0.153846,3
6,6.25627,4.943509,10.230639,10.486938,1.867071,0.932813,0.066865,0.032134,14.833278,11.177506,...,0.769231,1.384615,4.0,3.8,1.8,-0.923077,-0.153846,0.047337,-0.384615,12
7,5.337415,5.232778,9.18389,10.95792,1.325211,1.305084,0.123622,0.083231,12.844085,11.969167,...,1.230769,1.307692,1.9,3.4,4.0,0.153846,0.307692,0.005917,0.076923,0
8,4.430731,5.494909,11.91383,10.79416,1.240951,1.531578,0.042653,0.045524,11.435094,14.120204,...,1.076923,0.923077,1.44,4.5,6.5,0.0,-0.076923,-0.011834,-0.153846,-16
9,4.738357,6.083126,10.369843,9.637158,1.014663,1.837836,0.137931,0.061986,10.755939,13.98075,...,2.0,1.538462,1.65,3.8,5.0,1.769231,-0.538462,-0.035503,0.615385,1


# Make Predictions

Now that we have prepared the data we can load our model and make predictions.

In [43]:
scaler = StandardScaler()
X = scaler.fit_transform(df)

In [44]:
model = load_model('25Nov19.h5')




In [45]:
predictions = model.predict(X)

In [46]:
predictions

array([[8.6096564e-04, 2.4032729e-02, 9.7510624e-01],
       [4.9334839e-01, 2.8707755e-01, 2.1957402e-01],
       [8.6348230e-01, 1.2675080e-01, 9.7668786e-03],
       [9.8902148e-01, 1.0151096e-02, 8.2739565e-04],
       [5.9472322e-01, 2.8386506e-01, 1.2141176e-01],
       [3.7121263e-01, 3.6042604e-01, 2.6836139e-01],
       [5.6666505e-02, 2.4153242e-01, 7.0180106e-01],
       [4.2421509e-02, 2.7672592e-01, 6.8085259e-01],
       [2.6983768e-01, 4.9453825e-01, 2.3562405e-01],
       [9.8741376e-01, 1.1324852e-02, 1.2613848e-03]], dtype=float32)

In [47]:
for i in range(len(X)):
    pred = np.argmax(predictions[i])
    if pred == 0:
        odds_diff = predictions[i][pred] - (1/league_data['B365H'][i])
        print(i, 'Prediction:', pred, 'Odds diff:', odds_diff)
    elif pred == 1:
        odds_diff = predictions[i][pred] - (1/league_data['B365D'][i])
        print(i, 'Prediction:', pred, 'Odds diff:', odds_diff)
    else:
        odds_diff = predictions[i][pred] - (1/league_data['B365A'][i])
        print(i, 'Prediction:', pred, 'Odds diff:', odds_diff)

0 Prediction: 2 Odds diff: 0.11303727380160622
1 Prediction: 0 Odds diff: -0.24194572778309087
2 Prediction: 0 Odds diff: 0.06348229646682735
3 Prediction: 0 Odds diff: 0.1556881467501322
4 Prediction: 0 Odds diff: 0.15027878019544816
5 Prediction: 0 Odds diff: -0.11659224658477602
6 Prediction: 2 Odds diff: 0.14624550607469344
7 Prediction: 2 Odds diff: 0.43085259199142456
8 Prediction: 1 Odds diff: 0.2723160253630744
9 Prediction: 0 Odds diff: 0.3813531579393329


# Sanity check

To try to make sure we have correct data we will make predictions on matches this season that have already happenend and then compare these do our prediction we made previously at ~64%.

To do this we need to first combine EMA_stats with league stats, similar to above process.

In [48]:
league_data = pd.read_csv('data/league_data.csv')
league_data.drop([ 'Date', 'HomeTeam', 'AwayTeam', 
                  'gameId', 'HTGS', 'ATGS', 'HTGC',
                  'ATGC', 'HM1', 'HM2', 'HM3', 'HM4', 'HM5', 'AM1', 
                  'AM2', 'AM3', 'AM4', 'AM5', 'HTFormPts',
                 'ATFormPts', 'HTFormPtsStr', 'ATFormPtsStr'], 1, inplace=True)

In [49]:
league_data = league_data.loc[league_data['season'] == 1920].reset_index(drop=True)
league_data.drop([10, 30], inplace=True)
league_data.reset_index(drop=True)

Unnamed: 0.1,Unnamed: 0,season,FTR,HTP,ATP,B365H,B365D,B365A,MW,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,4560,1920,H,0.000000,0.000000,1.14,10.00,19.00,1.0,0.000000,0.000000,0.000000,0.000000,0
1,4561,1920,A,0.000000,0.000000,12.00,6.50,1.22,1.0,0.000000,0.000000,0.000000,0.000000,0
2,4562,1920,D,0.000000,0.000000,1.95,3.60,3.60,1.0,0.000000,0.000000,0.000000,0.000000,0
3,4563,1920,H,0.000000,0.000000,2.62,3.20,2.75,1.0,0.000000,0.000000,0.000000,0.000000,0
4,4564,1920,D,0.000000,0.000000,3.00,3.25,2.37,1.0,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,4685,1920,A,1.076923,0.538462,1.36,5.10,8.00,13.0,-0.538462,-0.923077,0.538462,0.461538,0
124,4686,1920,A,0.615385,1.153846,2.30,3.40,3.10,13.0,-1.153846,-0.076923,-0.538462,0.000000,0
125,4687,1920,H,1.923077,2.000000,1.44,5.00,6.50,13.0,1.384615,1.000000,-0.076923,-0.461538,0
126,4688,1920,D,1.307692,1.230769,3.50,3.40,2.10,13.0,0.307692,0.000000,0.076923,0.153846,0


In [50]:
league_data

Unnamed: 0.1,Unnamed: 0,season,FTR,HTP,ATP,B365H,B365D,B365A,MW,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,4560,1920,H,0.000000,0.000000,1.14,10.00,19.00,1.0,0.000000,0.000000,0.000000,0.000000,0
1,4561,1920,A,0.000000,0.000000,12.00,6.50,1.22,1.0,0.000000,0.000000,0.000000,0.000000,0
2,4562,1920,D,0.000000,0.000000,1.95,3.60,3.60,1.0,0.000000,0.000000,0.000000,0.000000,0
3,4563,1920,H,0.000000,0.000000,2.62,3.20,2.75,1.0,0.000000,0.000000,0.000000,0.000000,0
4,4564,1920,D,0.000000,0.000000,3.00,3.25,2.37,1.0,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,4685,1920,A,1.076923,0.538462,1.36,5.10,8.00,13.0,-0.538462,-0.923077,0.538462,0.461538,0
126,4686,1920,A,0.615385,1.153846,2.30,3.40,3.10,13.0,-1.153846,-0.076923,-0.538462,0.000000,0
127,4687,1920,H,1.923077,2.000000,1.44,5.00,6.50,13.0,1.384615,1.000000,-0.076923,-0.461538,0
128,4688,1920,D,1.307692,1.230769,3.50,3.40,2.10,13.0,0.307692,0.000000,0.076923,0.153846,0


In [51]:
league_data.columns

Index(['Unnamed: 0', 'season', 'FTR', 'HTP', 'ATP', 'B365H', 'B365D', 'B365A',
       'MW', 'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP'],
      dtype='object')

In [52]:
league_data.drop(['MW', 'Unnamed: 0', 'season'], 1, inplace=True)
# league_data.reset_index(inplace=True)

In [53]:
league_data.head()

Unnamed: 0,FTR,HTP,ATP,B365H,B365D,B365A,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,H,0.0,0.0,1.14,10.0,19.0,0.0,0.0,0.0,0.0,0
1,A,0.0,0.0,12.0,6.5,1.22,0.0,0.0,0.0,0.0,0
2,D,0.0,0.0,1.95,3.6,3.6,0.0,0.0,0.0,0.0,0
3,H,0.0,0.0,2.62,3.2,2.75,0.0,0.0,0.0,0.0,0
4,D,0.0,0.0,3.0,3.25,2.37,0.0,0.0,0.0,0.0,0


In [54]:
len(league_data.columns)

11

In [55]:
df_ema.columns

Index(['gameId', 'f_cornersAgainstHome', 'f_cornersForHome',
       'f_freesAgainstHome', 'f_freesForHome', 'f_goalsAgainstHome',
       'f_goalsForHome', 'f_redsAgainstHome', 'f_redsForHome',
       'f_shotsAgainstHome', 'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway'],
      dtype='object')

In [56]:
len(df_ema)

4642

In [57]:
ema1920 = df_ema.loc[(df_ema['gameId'] > 4559) & (df_ema['gameId'] < 4680)].reset_index()

In [58]:
len(ema1920)

118

In [59]:
len(league_data)

128

In [60]:
df1920 = pd.concat([ema1920, league_data], axis=1)

In [40]:
df1920.columns

Index(['index', 'gameId', 'f_cornersAgainstHome', 'f_cornersForHome',
       'f_freesAgainstHome', 'f_freesForHome', 'f_goalsAgainstHome',
       'f_goalsForHome', 'f_redsAgainstHome', 'f_redsForHome',
       'f_shotsAgainstHome', 'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway', 'FTR', 'HTP', 'ATP', 'B365H', 'B365D',
       'B365A', 'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP'],
      dtype='object')

In [41]:
df1920.drop(['index', 'gameId'], 1, inplace=True)

In [42]:
df1920.columns

Index(['f_cornersAgainstHome', 'f_cornersForHome', 'f_freesAgainstHome',
       'f_freesForHome', 'f_goalsAgainstHome', 'f_goalsForHome',
       'f_redsAgainstHome', 'f_redsForHome', 'f_shotsAgainstHome',
       'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway', 'FTR', 'HTP', 'ATP', 'B365H', 'B365D',
       'B365A', 'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP'],
      dtype='object')

In [43]:
X = df1920.drop(['FTR'], 1)
y_true = df1920['FTR']

In [44]:
X.shape[1]

34

In [45]:
def transform_results(results):
    transformed = []
    for i in range(len(results)):
        if results[i] == 'H':
            transformed.append(0)
        elif results[i] == 'A':
            transformed.append(2)
        else:
            transformed.append(1)
    return np.array(transformed)

y_true = transform_results(y_true)

In [46]:
y_true

array([0, 2, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1, 0, 0, 2, 1, 0, 1, 1, 0, 2,
       2, 2, 2, 2, 0, 2, 2, 1, 1, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 1, 0, 2,
       0, 2, 0, 0, 1, 1, 2, 0, 0, 2, 0, 1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 0,
       0, 2, 0, 1, 0, 0, 0, 2, 1, 2, 0, 2, 2, 0, 0, 0, 1, 0, 0, 1, 1, 2,
       1, 0, 2, 0, 0, 1, 1, 2, 1, 1, 0, 2, 0, 1, 2, 0, 0, 0, 2, 2, 2, 1,
       2, 0, 0, 0, 2, 1, 0, 0, 0, 0, 2, 1, 2, 2, 2, 2, 2, 0, 1, 1])

In [47]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [48]:
model = load_model('17Nov19.h5')


_, acc = model.evaluate(X, y_true)
acc



0.5769230723381042

In [49]:
y_preds = model.predict(X)

funds = 100
wager = 10
favourites = 0
no_bets = 0
min_diff = 0.03

for i in range(len(X)):
    prediction = np.argmax(y_preds[i])
    print('\nPrediction', prediction)
    print('Actual', y_true[i])
    print('Favourite', np.argmin([df1920['B365H'][i], df1920['B365D'][i], 
                                  df1920['B365A'][i]]))
    print('Prediction proba', y_preds[i])
    print('Home, Draw and Away odds', df1920['B365H'][i],       
          df1920['B365D'][i], df1920['B365A'][i])
    
        

    if prediction == 0:
        odds_diff = y_preds[i][prediction] - (1/df1920['B365H'][i])
        # If odds_diff positive place bet
        if odds_diff > min_diff:
            if prediction == np.argmin([df1920['B365H'][i], df1920['B365D'][i], 
                                  df1920['B365A'][i]]):
                favourites +=1
                
            if  prediction == y_true[i]:
                funds += (wager * df1920['B365H'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
    elif prediction == 1:
        odds_diff = y_preds[i][prediction] - (1/df1920['B365D'][i])
        if odds_diff > min_diff:
            if prediction == np.argmin([df1920['B365H'][i], df1920['B365D'][i], 
                                  df1920['B365A'][i]]):
                favourites +=1
            if  prediction == y_true[i]:
                funds +=( wager * df1920['B365D'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1
    else:
        odds_diff = y_preds[i][prediction] - (1/df1920['B365A'][i])
        if odds_diff >  min_diff:
            if prediction == np.argmin([df1920['B365H'][i], df1920['B365D'][i], 
                                  df1920['B365A'][i]]):
                favourites +=1
            if  prediction == y_true[i]:
                funds += (wager * df1920['B365A'][i]) - wager
            else:
                funds -= wager
        else:
            no_bets +=1

        
    
    
    print('Funds', funds)
    
print(f'Betted on favourite {favourites} times out of {len(X)} matches.')
print(f'No bet placed {no_bets} times')
         


Prediction 0
Actual 0
Favourite 0
Prediction proba [0.5801569  0.2492719  0.17057115]
Home, Draw and Away odds 1.14 10.0 19.0
Funds 100

Prediction 2
Actual 2
Favourite 2
Prediction proba [0.28566477 0.27336773 0.44096753]
Home, Draw and Away odds 12.0 6.5 1.22
Funds 100

Prediction 0
Actual 1
Favourite 0
Prediction proba [0.39058346 0.3023451  0.3070714 ]
Home, Draw and Away odds 1.95 3.6 3.6
Funds 100

Prediction 1
Actual 0
Favourite 0
Prediction proba [0.28829902 0.37467125 0.33702976]
Home, Draw and Away odds 2.62 3.2 2.75
Funds 90

Prediction 2
Actual 1
Favourite 2
Prediction proba [0.31983787 0.31477454 0.3653876 ]
Home, Draw and Away odds 3.0 3.25 2.37
Funds 90

Prediction 0
Actual 2
Favourite 0
Prediction proba [0.35912284 0.35103735 0.28983983]
Home, Draw and Away odds 1.9 3.4 4.0
Funds 90

Prediction 0
Actual 0
Favourite 0
Prediction proba [0.3952686  0.28187263 0.3228588 ]
Home, Draw and Away odds 1.3 5.25 10.0
Funds 90

Prediction 2
Actual 1
Favourite 0
Prediction proba [0

Prediction proba [0.51503617 0.3000656  0.18489824]
Home, Draw and Away odds 1.45 4.75 6.5
Funds 804.9000000000002

Prediction 0
Actual 0
Favourite 0
Prediction proba [0.7416981  0.17680036 0.08150156]
Home, Draw and Away odds 1.5 4.33 6.5
Funds 809.9000000000002

Prediction 2
Actual 2
Favourite 2
Prediction proba [0.1281475  0.29678163 0.57507086]
Home, Draw and Away odds 4.33 4.0 1.75
Funds 809.9000000000002

Prediction 0
Actual 0
Favourite 2
Prediction proba [0.6345956  0.28307435 0.08233006]
Home, Draw and Away odds 3.25 3.6 2.15
Funds 832.4000000000002

Prediction 1
Actual 1
Favourite 0
Prediction proba [0.310491   0.41042197 0.279087  ]
Home, Draw and Away odds 1.75 3.75 4.75
Funds 859.9000000000002

Prediction 0
Actual 2
Favourite 2
Prediction proba [0.5216961  0.25847855 0.21982536]
Home, Draw and Away odds 9.0 5.0 1.33
Funds 849.9000000000002

Prediction 0
Actual 0
Favourite 0
Prediction proba [0.5230571  0.31706712 0.15987583]
Home, Draw and Away odds 1.72 4.2 4.33
Funds 849.

In [50]:
headers = ['f_cornersAgainstHome', 'f_cornersForHome', 'f_freesAgainstHome',
       'f_freesForHome', 'f_goalsAgainstHome', 'f_goalsForHome',
       'f_redsAgainstHome', 'f_redsForHome', 'f_shotsAgainstHome',
       'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_cornersAgainstAway', 'f_cornersForAway',
       'f_freesAgainstAway', 'f_freesForAway', 'f_goalsAgainstAway',
       'f_goalsForAway', 'f_redsAgainstAway', 'f_redsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway', 'HTP', 'ATP', 'B365H', 'B365D',
       'B365A', 'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP']
X_testdf = pd.DataFrame(np.load('X_test.npy'), columns=headers)

In [51]:
Xdf = pd.DataFrame(X, columns=headers)

In [52]:
X_testdf.describe()

Unnamed: 0,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_redsAgainstHome,f_redsForHome,f_shotsAgainstHome,f_shotsForHome,...,HTP,ATP,B365H,B365D,B365A,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
count,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,...,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0
mean,4.059253e-16,3.504141e-16,-9.228729e-16,-1.221245e-15,2.463307e-16,-5.2041700000000006e-17,0.0,1.457168e-16,1.387779e-17,1.734723e-16,...,3.469447e-18,6.591949e-17,-1.179612e-16,3.400058e-16,1.387779e-17,-2.2551410000000003e-17,1.387779e-17,-1.0408340000000001e-17,-1.0408340000000001e-17,0.0
std,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,...,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,0.0
min,-2.876803,-1.572654,-2.294211,-1.979669,-2.462765,-1.484403,-1.346104,-1.627226,-2.68641,-1.364284,...,-1.814545,-1.698789,-0.7633707,-0.7187656,-0.7777768,-2.267221,-2.441672,-2.442455,-2.598285,0.0
25%,-0.2915254,-0.7009053,-0.8852358,-0.8386421,-0.4532835,-0.5828828,-0.687282,-0.9743698,-0.4710227,-0.6241458,...,-0.5913042,-0.5640143,-0.5487847,-0.5473759,-0.5452881,-0.6144326,-0.5686638,-0.5573801,-0.5606832,0.0
50%,0.1952006,-0.09361949,0.2327939,0.1025779,-0.02004245,-0.2062275,-0.270034,0.008755791,0.2546434,-0.2600768,...,0.02201666,-0.02103877,-0.2922797,-0.3902687,-0.3515475,-0.02906995,0.02806816,0.03790672,0.02148882,0.0
75%,0.6859132,0.5148701,0.7367741,0.5854426,0.6796946,0.3996163,0.519961,0.784545,0.7373478,0.4160942,...,0.5435094,0.5327741,0.06203654,0.2238778,0.2401467,0.430038,0.4713548,0.5339791,0.4581179,0.0
max,1.611019,3.317503,1.944179,5.857462,1.968712,3.015484,3.001665,2.819847,1.568443,3.011515,...,2.827874,2.609508,4.797898,4.937095,4.429132,2.955132,2.877768,2.419054,2.815915,0.0


In [53]:
Xdf.describe()

Unnamed: 0,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_redsAgainstHome,f_redsForHome,f_shotsAgainstHome,f_shotsForHome,...,HTP,ATP,B365H,B365D,B365A,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
count,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,118.0,...,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0
mean,-2.559158e-16,5.983914e-16,-4.986595e-16,1.091406e-15,5.268855e-17,2.182811e-16,-9.032323e-17,2.521523e-16,1.919369e-16,-4.516161e-17,...,-2.775558e-17,8.326673e-17,-1.387779e-17,-6.245005e-17,-9.194034000000001e-17,-2.0816680000000002e-17,1.0408340000000001e-17,-1.561251e-17,-1.387779e-17,0.0
std,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,1.004264,...,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,1.003929,0.0
min,-2.703692,-1.880066,-2.097742,-1.76274,-2.81105,-1.583511,-1.35152,-1.561604,-2.590614,-1.514108,...,-1.746199,-1.653854,-0.761758,-0.7111791,-0.7747397,-2.263941,-2.454305,-2.435878,-2.5893,0.0
25%,-0.5214479,-0.5678237,-0.7860244,-0.9534244,-0.4988796,-0.6606755,-0.7572809,-0.8621946,-0.402557,-0.6145519,...,-0.5918029,-0.5894526,-0.5189646,-0.5396276,-0.5331496,-0.6076481,-0.5441407,-0.5642534,-0.556075,0.0
50%,0.1869372,-0.07354033,-0.05058669,0.3406931,0.1179681,-0.3639958,-0.09982489,-0.1010463,0.2213909,-0.2420183,...,-0.01460489,-0.05725208,-0.2901593,-0.3823721,-0.3462989,-0.02104422,0.02031609,0.04075199,0.0248465,0.0
75%,0.5820257,0.3911115,0.8791974,0.6971814,0.7229534,0.7096444,0.4112031,0.8280316,0.6269609,0.3205628,...,0.5625931,0.5414735,0.04955158,0.2323539,0.09056339,0.4124941,0.4644788,0.5360781,0.4605376,0.0
max,2.588489,3.129622,1.719248,2.062294,1.904387,2.718333,2.831743,2.431001,2.176648,2.94035,...,2.799235,2.60375,4.805504,4.950019,4.459186,2.969485,2.875648,2.418317,2.81327,0.0


In [54]:
test_df = pd.read_csv('data/all_seasons_joined.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [55]:
test_df = test_df.loc[test_df['season'] == 1920]

In [56]:
len(test_df)

130

In [57]:
test_df[30:31]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
4591,4591,30,E0,31/08/2019,Southampton,Man United,1.0,1.0,D,0.0,...,1.91,0.5,1.94,1.99,1.94,1.99,1.95,2.0,1.92,1.97
