# Prepare Data

In [0]:
# import drive 
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

# copy files to current folder
!mv 'drive/My Drive/IE582Fall2019_data_files/' .

# unzip all of the files
!unzip 'IE582Fall2019_data_files/bets.zip'
!unzip 'IE582Fall2019_data_files/booking.zip'
!unzip 'IE582Fall2019_data_files/goals.zip'
!unzip 'IE582Fall2019_data_files/matches.zip'
!unzip 'IE582Fall2019_data_files/stats.zip'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/
mv: cannot remove 'drive/My Drive/IE582Fall2019_data_files/submission_schedule.xlsx': Operation not permitted
mv: cannot remove 'drive/My Drive/IE582Fall2019_data_files/matches.zip': Operation not permitted
mv: cannot remove 'drive/My Drive/IE582Fall2019_data_files/bets.zip': Operation not permitted
mv: cannot remove 'drive/My Drive/IE582Fall2019_data_files/stats.zip': Operation not permitted
mv: cannot remove 'drive/My Drive/IE582Fall2019_data_files/booking.

In [0]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
LEAGUE_ID = 148 # PREMIER LEAGUE ID

In [0]:
# read match csv data into pandas frame
matches = pd.read_csv('matches.csv')

# filter desired league matches
matches = matches[matches['league_id'] == LEAGUE_ID]

# filter finished matches to train set
train_matches = matches[matches['match_status'] == 'Finished']

# filter unfinished matches to test set
test_matches = matches[~(matches['match_status'] == 'Finished')]

# visualize data for sanity check
test_matches.head(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id
6071,2626,2627,337563,1578423600,,0,Manchester Utd,Manchester City,,,,,,,,,148
6075,2632,2611,337589,1578510000,,0,Leicester,Aston Villa,,,,,,,,,148
6085,2620,2654,273318,1578682800,,0,Sheffield Utd,West Ham,,,,,,,,,148
6086,2617,2619,273314,1578742200,,0,Crystal Palace,Arsenal,,,,,,,,,148
6089,2613,2612,273315,1578751200,,0,Everton,Brighton,,,,,,,,,148
6090,2614,2611,273316,1578751200,,0,Leicester,Southampton,,,,,,,,,148
6093,2629,2616,273313,1578751200,,0,Chelsea,Burnley,,,,,,,,,148
6094,2630,2646,273320,1578751200,,0,Wolves,Newcastle,,,,,,,,,148
6099,2641,2627,273317,1578751200,,0,Manchester Utd,Norwich,,,,,,,,,148
6106,2621,2628,273319,1578760200,,0,Tottenham,Liverpool,,,,,,,,,148


# Utilities

In [0]:
# calculates prediction rps loss
def rps(targets, preds):
    if (len(targets.shape) == 1):
        _targets = np.zeros((targets.size, preds.shape[1]))
        _targets[np.arange(targets.size), targets] = 1
        targets = _targets
    
    preds = np.cumsum(preds, axis=1)
    targets = np.cumsum(targets, axis=1)
    loss = np.square(preds - targets)
    loss = np.mean(np.sum(loss, axis=1))
    return loss / (preds.shape[1] - 1)

# calculates submission string from match ids and predictions
def submission(ids, preds):
    string = ''
    for i in range(len(ids)):
        string = string + str(ids[i])
        for j in range(3):
            string = string + ', '
            string = string + str(float(preds[i][j]))
        string = string + ', '
    return string[:-2]

In [0]:
# get round matches: 0 refers to current week, increase round to access *earlier rounds*
def get_round(round):
    if round == 0:
        return test_matches.nsmallest(10, ['epoch'])
    return train_matches.nlargest(round * 10, ['epoch'])[-10:]

# calculates targets
def get_targets(matches):
    home = (matches['match_hometeam_score'] > matches['match_awayteam_score']).values
    tie = (matches['match_hometeam_score'] == matches['match_awayteam_score']).values
    away = (matches['match_hometeam_score'] < matches['match_awayteam_score']).values
    return np.transpose(np.stack([home, tie, away])).astype('float')

In [0]:
#round_ids = [273254, 273252, 273258, 273259, 273255, 273251, 273256, 273257, 273253, 273260]

round_matches = get_round(0)
round_ids = round_matches['match_id'].values

round_matches.head(10)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id
6071,2626,2627,337563,1578423600,,0,Manchester Utd,Manchester City,,,,,,,,,148
6075,2632,2611,337589,1578510000,,0,Leicester,Aston Villa,,,,,,,,,148
6085,2620,2654,273318,1578682800,,0,Sheffield Utd,West Ham,,,,,,,,,148
6086,2617,2619,273314,1578742200,,0,Crystal Palace,Arsenal,,,,,,,,,148
6089,2613,2612,273315,1578751200,,0,Everton,Brighton,,,,,,,,,148
6090,2614,2611,273316,1578751200,,0,Leicester,Southampton,,,,,,,,,148
6093,2629,2616,273313,1578751200,,0,Chelsea,Burnley,,,,,,,,,148
6094,2630,2646,273320,1578751200,,0,Wolves,Newcastle,,,,,,,,,148
6099,2641,2627,273317,1578751200,,0,Manchester Utd,Norwich,,,,,,,,,148
6106,2621,2628,273319,1578760200,,0,Tottenham,Liverpool,,,,,,,,,148


In [0]:
test_matches.head(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id
6071,2626,2627,337563,1578423600,,0,Manchester Utd,Manchester City,,,,,,,,,148
6075,2632,2611,337589,1578510000,,0,Leicester,Aston Villa,,,,,,,,,148
6085,2620,2654,273318,1578682800,,0,Sheffield Utd,West Ham,,,,,,,,,148
6086,2617,2619,273314,1578742200,,0,Crystal Palace,Arsenal,,,,,,,,,148
6089,2613,2612,273315,1578751200,,0,Everton,Brighton,,,,,,,,,148
6090,2614,2611,273316,1578751200,,0,Leicester,Southampton,,,,,,,,,148
6093,2629,2616,273313,1578751200,,0,Chelsea,Burnley,,,,,,,,,148
6094,2630,2646,273320,1578751200,,0,Wolves,Newcastle,,,,,,,,,148
6099,2641,2627,273317,1578751200,,0,Manchester Utd,Norwich,,,,,,,,,148
6106,2621,2628,273319,1578760200,,0,Tottenham,Liverpool,,,,,,,,,148


In [0]:
prev_round_matches = get_round(1)
prev_round_ids = prev_round_matches['match_id'].values

print(get_targets(prev_round_matches))
prev_round_matches.head(10)

[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id
6061,2654,2621,273304,1577991600,Finished,0,Liverpool,Sheffield Utd,2.0,0.0,1.0,0.0,,,,,148
6058,2627,2617,273301,1577905200,Finished,0,Arsenal,Manchester Utd,2.0,0.0,2.0,0.0,,,,,148
6055,2612,2626,273305,1577896200,Finished,0,Manchester City,Everton,2.0,1.0,0.0,0.0,,,,,148
6056,2615,2620,273310,1577896200,Finished,0,West Ham,Bournemouth,4.0,0.0,3.0,0.0,,,,,148
6057,2619,2641,273307,1577896200,Finished,0,Norwich,Crystal Palace,1.0,1.0,1.0,0.0,,,,,148
6043,2611,2630,273306,1577887200,Finished,0,Newcastle,Leicester,0.0,3.0,0.0,2.0,,,,,148
6045,2628,2614,273308,1577887200,Finished,0,Southampton,Tottenham,1.0,0.0,1.0,0.0,,,,,148
6049,2646,2623,273309,1577887200,Finished,0,Watford,Wolves,2.0,1.0,1.0,0.0,,,,,148
6040,2616,2613,273302,1577878200,Finished,0,Brighton,Chelsea,1.0,1.0,0.0,1.0,,,,,148
6041,2632,2629,273303,1577878200,Finished,0,Burnley,Aston Villa,1.0,2.0,0.0,2.0,,,,,148


# Naive Forecasts

## Fixed Predictions

Predict fixed probability distribution for each match.

In [0]:
prob = [0.5, 0.25, 0.25]
def constant_preds(matches):
    preds = []
    for match in matches['match_id'].values:
        preds.append(np.asarray(prob))
    return np.asarray(preds)

train_preds = constant_preds(train_matches)
preds = constant_preds(round_matches)

print(rps(get_targets(train_matches), train_preds))
print(submission(round_ids, preds))

0.23304521276595744
337563, 0.5, 0.25, 0.25, 337589, 0.5, 0.25, 0.25, 273318, 0.5, 0.25, 0.25, 273314, 0.5, 0.25, 0.25, 273315, 0.5, 0.25, 0.25, 273316, 0.5, 0.25, 0.25, 273313, 0.5, 0.25, 0.25, 273320, 0.5, 0.25, 0.25, 273317, 0.5, 0.25, 0.25, 273319, 0.5, 0.25, 0.25


## Mimic Bookmaker

Mimic predictions of best bookmaker.

In [0]:
# prepare data by combining bets with matches information (necessary columns)
def prepare_data(league):
    # read bets csv data into pandas frame
    bets = pd.read_csv('bets.csv')
    matches = pd.read_csv('matches.csv')

    # filter unfinished matches
    matches = matches[matches['match_status'] == 'Finished']

    # filter desired league matches
    matches = matches[matches['league_id'] == league]

    # filter unrelated bets
    bets = bets[bets['variable'].isin(['odd_1', 'odd_x', 'odd_2'])]

    # append league ids and scores to data
    return pd.merge(bets, matches[['match_id', 'epoch', 'match_hometeam_score', 'match_awayteam_score']], on='match_id')

# get merged bets data
bets = prepare_data(LEAGUE_ID)

# visualize data for sanity check
bets.head()

Unnamed: 0,match_id,odd_bookmakers,odd_epoch,variable,value,epoch,match_hometeam_score,match_awayteam_score
0,150842,1xBet,1486711421,odd_1,5.1,1541942100,0.0,0.0
1,150842,Marathonbet,1486711421,odd_1,5.2,1541942100,0.0,0.0
2,150842,1xBet,1486711421,odd_x,3.7,1541942100,0.0,0.0
3,150842,Marathonbet,1486711421,odd_x,3.7,1541942100,0.0,0.0
4,150842,1xBet,1486711421,odd_2,1.77,1541942100,0.0,0.0


In [0]:
# manually mimic bookmakers
epoch = 0
bookmaker = ''
odds = {}

data = []
for match_id in round_ids:
    data.append([ match_id, bookmaker, epoch, epoch, 'odd_1', odds[match_id][0] ])
    data.append([ match_id, bookmaker, epoch, epoch, 'odd_x', odds[match_id][1] ])
    data.append([ match_id, bookmaker, epoch, epoch, 'odd_2', odds[match_id][2] ])

df = pd.DataFrame(data, columns=['match_id', 'odd_bookmakers', 'epoch', 'odd_epoch', 'variable', 'value'])
bets = bets.append(df, sort=True)

KeyError: ignored

In [0]:
round_matches.head(10)

In [0]:
# minimal number of odds we want from each bookmaker 
NUMBER_OF_ODDS = 10

# consider bets after this particular epoch
MIN_EPOCH = 1567339200 # 01 August, 2019

# filter bookmakers
def filter_bookmakers(bets, min_odds):
    # sample bets randomly
    bookmakers = bets.sample(frac=1)

    # group bookmakers
    bookmakers = bookmakers.groupby('odd_bookmakers')

    # drop bookmakers with small number of entries
    bookmakers = bookmakers.filter(lambda group: group.shape[0] > min_odds)

    # drop duplicates
    bookmakers = bookmakers.drop_duplicates(subset='odd_bookmakers')

    # extract values
    bookmakers = bookmakers['odd_bookmakers'].values

    return bookmakers

# filter bets made after the game start and before min epoch
def filter_bets(bets, min_epoch):
    # filter very old bets
    bets = bets[bets['odd_epoch'] > min_epoch]

    # filter bets made after match start
    bets = bets[bets['odd_epoch'] <= bets['epoch']]

    # drop duplicates and select bet with latest epoch
    bets = bets.sort_values('odd_epoch', ascending=False).drop_duplicates(['match_id', 'odd_bookmakers', 'variable']).sort_index()

    return bets

# sample bookmakers
bookmakers = filter_bookmakers(bets, NUMBER_OF_ODDS)

# filter bookmakers
bets = bets[bets['odd_bookmakers'].isin(bookmakers)]


# filter bets
bets = filter_bets(bets, MIN_EPOCH)

# visualize data for sanity check
#bets[(bets['match_id'] == 224496) & (bets['odd_bookmakers'] == 'Interwetten.es')]
bets.head(100)

In [0]:
# calculates probabilities from bets 
def calc_prob(bets):
    # calculate naive probabilities
    bets['value_reciprocal'] = 1 / bets['value']

    # calculate adjusted probabilities
    def adjust_group(group):
        tot = group['value_reciprocal'].sum()
        group['value_reciprocal_adjusted'] = group['value_reciprocal'] / tot
        return group

    # seperate data into groups on match, bookmaker and epoch
    groups = bets.groupby(['match_id', 'odd_bookmakers', 'odd_epoch'])

    # add adjusted reciprocals to groups
    groups = groups.apply(lambda group: adjust_group(group))

    # add new column to original data
    bets['value_reciprocal_adjusted'] = groups.transform('value_reciprocal_adjusted').values

    # visualize data for sanity check
    bets.head()

calc_prob(bets)
bets.head()

In [0]:
bets = bets[bets['match_id'].isin(round_ids)]

def get_bookmaker_preds(matches):
    def get_preds(group):
        odd_1 = group[group['variable'] == 'odd_1']['value_reciprocal_adjusted'].values[0]
        odd_x = group[group['variable'] == 'odd_x']['value_reciprocal_adjusted'].values[0]
        odd_2 = group[group['variable'] == 'odd_2']['value_reciprocal_adjusted'].values[0]
        return group['match_id'].values[0], [odd_1, odd_x, odd_2]

    values = matches.groupby('match_id').apply(get_preds).values
    string = ''
    for i in range(len(values)):
        string = string + ', ' + str(values[i][0])
        string = string + ', ' + str(float(values[i][1][0]))
        string = string + ', ' + str(float(values[i][1][1]))
        string = string + ', ' + str(float(values[i][1][2]))
    return string

print(get_bookmaker_preds(bets))
bets.head(10)

# Feature Engineering

In [0]:
features = ['last_n_home_goals', 'last_n_away_goals', 'last_n_home_points', 'last_n_away_points', 'last_n_goals', 'last_n_points']

In [0]:
start_epoch = 1559390400
end_epoch = 1578052800
only_league = True

# read match csv data into pandas frame
matches = pd.read_csv('matches.csv')

# filter desired league matches
matches = matches[matches['league_id'] == LEAGUE_ID]

# filter desired timespan
matches = matches[(matches['epoch'] > start_epoch) & (matches['epoch'] < end_epoch)]

# filter only to league matches
if only_league:
    matches = matches[matches['match_id'] < 300000] 

# print matches
matches.tail()

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id
6055,2612,2626,273305,1577896200,Finished,0,Manchester City,Everton,2.0,1.0,0.0,0.0,,,,,148
6056,2615,2620,273310,1577896200,Finished,0,West Ham,Bournemouth,4.0,0.0,3.0,0.0,,,,,148
6057,2619,2641,273307,1577896200,Finished,0,Norwich,Crystal Palace,1.0,1.0,1.0,0.0,,,,,148
6058,2627,2617,273301,1577905200,Finished,0,Arsenal,Manchester Utd,2.0,0.0,2.0,0.0,,,,,148
6061,2654,2621,273304,1577991600,Finished,0,Liverpool,Sheffield Utd,2.0,0.0,1.0,0.0,,,,,148


In [0]:
# GOALS IN LAST X HOME MATCHES
MATCH_COUNT = 3

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['last_n_home_goals'] = float('NaN')

team_ids = matches.match_hometeam_id.unique()
for team_id in team_ids:
    team_matches = matches[matches.match_hometeam_id == team_id]
    cumulative_goals = team_matches.match_hometeam_score.cumsum()
    matches.loc[matches.match_hometeam_id == team_id, 'last_n_home_goals'] = (cumulative_goals.shift(1) -
                                                                              cumulative_goals.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals
6004,2615,2613,273292,1577532600,Finished,0,Brighton,Bournemouth,2.0,0.0,1.0,0.0,,,,,148,0.666667
6009,2612,2630,273296,1577541600,Finished,0,Newcastle,Everton,1.0,2.0,0.0,1.0,,,,,148,1.666667
6010,2619,2614,273298,1577541600,Finished,0,Southampton,Crystal Palace,1.0,1.0,0.0,0.0,,,,,148,1.333333
6011,2632,2623,273299,1577541600,Finished,0,Watford,Aston Villa,3.0,0.0,1.0,0.0,,,,,148,0.666667
6014,2611,2620,273300,1577550600,Finished,0,West Ham,Leicester,1.0,2.0,1.0,1.0,,,,,148,1.666667
6015,2628,2641,273297,1577550600,Finished,0,Norwich,Tottenham,2.0,2.0,1.0,0.0,,,,,148,1.333333
6016,2627,2629,273293,1577558700,Finished,0,Burnley,Manchester Utd,0.0,2.0,0.0,1.0,,,,,148,0.666667
6023,2616,2617,273291,1577624400,Finished,0,Arsenal,Chelsea,1.0,2.0,1.0,0.0,,,,,148,1.0
6037,2646,2621,273294,1577633400,Finished,0,Liverpool,Wolves,1.0,0.0,1.0,0.0,,,,,148,3.0
6038,2654,2626,273295,1577638800,Finished,0,Manchester City,Sheffield Utd,2.0,0.0,0.0,0.0,,,,,148,2.0


In [0]:
# GOALS IN LAST X AWAY MATCHES
MATCH_COUNT = 3

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['last_n_away_goals'] = float('NaN')

team_ids = matches.match_awayteam_id.unique()
for team_id in team_ids:
    team_matches = matches[matches.match_awayteam_id == team_id]
    cumulative_goals = team_matches.match_awayteam_score.cumsum()
    matches.loc[matches.match_awayteam_id == team_id, 'last_n_away_goals'] = (cumulative_goals.shift(1) -
                                                                              cumulative_goals.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals,last_n_away_goals
6004,2615,2613,273292,1577532600,Finished,0,Brighton,Bournemouth,2.0,0.0,1.0,0.0,,,,,148,0.666667,1.0
6009,2612,2630,273296,1577541600,Finished,0,Newcastle,Everton,1.0,2.0,0.0,1.0,,,,,148,1.666667,1.333333
6010,2619,2614,273298,1577541600,Finished,0,Southampton,Crystal Palace,1.0,1.0,0.0,0.0,,,,,148,1.333333,0.666667
6011,2632,2623,273299,1577541600,Finished,0,Watford,Aston Villa,3.0,0.0,1.0,0.0,,,,,148,0.666667,1.0
6014,2611,2620,273300,1577550600,Finished,0,West Ham,Leicester,1.0,2.0,1.0,1.0,,,,,148,1.666667,2.333333
6015,2628,2641,273297,1577550600,Finished,0,Norwich,Tottenham,2.0,2.0,1.0,0.0,,,,,148,1.333333,2.0
6016,2627,2629,273293,1577558700,Finished,0,Burnley,Manchester Utd,0.0,2.0,0.0,1.0,,,,,148,0.666667,1.666667
6023,2616,2617,273291,1577624400,Finished,0,Arsenal,Chelsea,1.0,2.0,1.0,0.0,,,,,148,1.0,1.333333
6037,2646,2621,273294,1577633400,Finished,0,Liverpool,Wolves,1.0,0.0,1.0,0.0,,,,,148,3.0,2.0
6038,2654,2626,273295,1577638800,Finished,0,Manchester City,Sheffield Utd,2.0,0.0,0.0,0.0,,,,,148,2.0,1.333333


In [0]:
# POINTS IN LAST X HOME MATCHES
MATCH_COUNT = 3

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['match_hometeam_points'] = (matches['match_hometeam_score'] > matches['match_awayteam_score']) * 2 + (matches['match_hometeam_score'] >= matches['match_awayteam_score']) * 1
matches['last_n_home_points'] = float('NaN')

team_ids = matches.match_hometeam_id.unique()
for team_id in team_ids:
    team_matches = matches[matches.match_hometeam_id == team_id]
    cumulative_points = team_matches['match_hometeam_points'].cumsum()
    matches.loc[matches.match_hometeam_id == team_id, 'last_n_home_points'] = (cumulative_points.shift(1) -
                                                                              cumulative_points.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals,last_n_away_goals,match_hometeam_points,last_n_home_points
6004,2615,2613,273292,1577532600,Finished,0,Brighton,Bournemouth,2.0,0.0,1.0,0.0,,,,,148,0.666667,1.0,3,0.333333
6009,2612,2630,273296,1577541600,Finished,0,Newcastle,Everton,1.0,2.0,0.0,1.0,,,,,148,1.666667,1.333333,0,2.333333
6010,2619,2614,273298,1577541600,Finished,0,Southampton,Crystal Palace,1.0,1.0,0.0,0.0,,,,,148,1.333333,0.666667,1,2.0
6011,2632,2623,273299,1577541600,Finished,0,Watford,Aston Villa,3.0,0.0,1.0,0.0,,,,,148,0.666667,1.0,3,1.333333
6014,2611,2620,273300,1577550600,Finished,0,West Ham,Leicester,1.0,2.0,1.0,1.0,,,,,148,1.666667,2.333333,0,0.0
6015,2628,2641,273297,1577550600,Finished,0,Norwich,Tottenham,2.0,2.0,1.0,0.0,,,,,148,1.333333,2.0,1,0.333333
6016,2627,2629,273293,1577558700,Finished,0,Burnley,Manchester Utd,0.0,2.0,0.0,1.0,,,,,148,0.666667,1.666667,0,1.0
6023,2616,2617,273291,1577624400,Finished,0,Arsenal,Chelsea,1.0,2.0,1.0,0.0,,,,,148,1.0,1.333333,0,0.333333
6037,2646,2621,273294,1577633400,Finished,0,Liverpool,Wolves,1.0,0.0,1.0,0.0,,,,,148,3.0,2.0,3,3.0
6038,2654,2626,273295,1577638800,Finished,0,Manchester City,Sheffield Utd,2.0,0.0,0.0,0.0,,,,,148,2.0,1.333333,3,2.0


In [0]:
# POINTS IN LAST X AWAY MATCHES
MATCH_COUNT = 3

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['match_awayteam_points'] = (matches['match_awayteam_score'] > matches['match_hometeam_score']) * 2 + (matches['match_awayteam_score'] >= matches['match_hometeam_score']) * 1
matches['last_n_away_points'] = float('NaN')

team_ids = matches.match_awayteam_id.unique()
for team_id in team_ids:
    team_matches = matches[matches.match_awayteam_id == team_id]
    cumulative_points = team_matches['match_awayteam_points'].cumsum()
    matches.loc[matches.match_awayteam_id == team_id, 'last_n_away_points'] = (cumulative_points.shift(1) -
                                                                              cumulative_points.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals,last_n_away_goals,match_hometeam_points,last_n_home_points,match_awayteam_points,last_n_away_points
6004,2615,2613,273292,1577532600,Finished,0,Brighton,Bournemouth,2.0,0.0,1.0,0.0,,,,,148,0.666667,1.0,3,0.333333,0,1.0
6009,2612,2630,273296,1577541600,Finished,0,Newcastle,Everton,1.0,2.0,0.0,1.0,,,,,148,1.666667,1.333333,0,2.333333,3,0.333333
6010,2619,2614,273298,1577541600,Finished,0,Southampton,Crystal Palace,1.0,1.0,0.0,0.0,,,,,148,1.333333,0.666667,1,2.0,1,1.333333
6011,2632,2623,273299,1577541600,Finished,0,Watford,Aston Villa,3.0,0.0,1.0,0.0,,,,,148,0.666667,1.0,3,1.333333,0,0.333333
6014,2611,2620,273300,1577550600,Finished,0,West Ham,Leicester,1.0,2.0,1.0,1.0,,,,,148,1.666667,2.333333,0,0.0,3,2.0
6015,2628,2641,273297,1577550600,Finished,0,Norwich,Tottenham,2.0,2.0,1.0,0.0,,,,,148,1.333333,2.0,1,0.333333,1,2.0
6016,2627,2629,273293,1577558700,Finished,0,Burnley,Manchester Utd,0.0,2.0,0.0,1.0,,,,,148,0.666667,1.666667,0,1.0,3,1.333333
6023,2616,2617,273291,1577624400,Finished,0,Arsenal,Chelsea,1.0,2.0,1.0,0.0,,,,,148,1.0,1.333333,0,0.333333,3,1.0
6037,2646,2621,273294,1577633400,Finished,0,Liverpool,Wolves,1.0,0.0,1.0,0.0,,,,,148,3.0,2.0,3,3.0,0,2.333333
6038,2654,2626,273295,1577638800,Finished,0,Manchester City,Sheffield Utd,2.0,0.0,0.0,0.0,,,,,148,2.0,1.333333,3,2.0,0,2.333333


In [0]:
# GOALS IN LAST X MATCHES
MATCH_COUNT = 5

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['last_n_goals'] = float('NaN')

team_ids = matches.match_hometeam_id.unique()
for team_id in team_ids:
    team_matches = matches[(matches.match_hometeam_id == team_id) | (matches.match_awayteam_id == team_id)]
    cumulative_goals = (team_matches.match_hometeam_score * (team_matches.match_hometeam_id == team_id) +
                        (team_matches.match_awayteam_score * (team_matches.match_awayteam_id == team_id))).cumsum()
    matches.loc[(matches.match_hometeam_id == team_id) |
                (matches.match_awayteam_id == team_id), 'last_n_goals'] = (cumulative_goals.shift(1) -
                                                                           cumulative_goals.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals,last_n_away_goals,match_hometeam_points,last_n_home_points,match_awayteam_points,last_n_away_points,last_n_goals
6004,2615,2613,273292,1577532600,Finished,0,Brighton,Bournemouth,2.0,0.0,1.0,0.0,,,,,148,0.666667,1.0,3,0.333333,0,1.0,1.2
6009,2612,2630,273296,1577541600,Finished,0,Newcastle,Everton,1.0,2.0,0.0,1.0,,,,,148,1.666667,1.333333,0,2.333333,3,0.333333,1.4
6010,2619,2614,273298,1577541600,Finished,0,Southampton,Crystal Palace,1.0,1.0,0.0,0.0,,,,,148,1.333333,0.666667,1,2.0,1,1.333333,1.6
6011,2632,2623,273299,1577541600,Finished,0,Watford,Aston Villa,3.0,0.0,1.0,0.0,,,,,148,0.666667,1.0,3,1.333333,0,0.333333,0.8
6014,2611,2620,273300,1577550600,Finished,0,West Ham,Leicester,1.0,2.0,1.0,1.0,,,,,148,1.666667,2.333333,0,0.0,3,2.0,1.6
6015,2628,2641,273297,1577550600,Finished,0,Norwich,Tottenham,2.0,2.0,1.0,0.0,,,,,148,1.333333,2.0,1,0.333333,1,2.0,0.8
6016,2627,2629,273293,1577558700,Finished,0,Burnley,Manchester Utd,0.0,2.0,0.0,1.0,,,,,148,0.666667,1.666667,0,1.0,3,1.333333,1.8
6023,2616,2617,273291,1577624400,Finished,0,Arsenal,Chelsea,1.0,2.0,1.0,0.0,,,,,148,1.0,1.333333,0,0.333333,3,1.0,1.0
6037,2646,2621,273294,1577633400,Finished,0,Liverpool,Wolves,1.0,0.0,1.0,0.0,,,,,148,3.0,2.0,3,3.0,0,2.333333,2.0
6038,2654,2626,273295,1577638800,Finished,0,Manchester City,Sheffield Utd,2.0,0.0,0.0,0.0,,,,,148,2.0,1.333333,3,2.0,0,2.333333,1.2


In [0]:
# POINTS IN LAST X MATCHES
MATCH_COUNT = 5

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['last_n_points'] = float('NaN')

team_ids = matches.match_hometeam_id.unique()
for team_id in team_ids:
    team_matches = matches[(matches.match_hometeam_id == team_id) | (matches.match_awayteam_id == team_id)]
    cumulative_points = (team_matches.match_hometeam_points * (team_matches.match_hometeam_id == team_id) +
                         (team_matches.match_awayteam_points * (team_matches.match_awayteam_id == team_id))).cumsum()
    matches.loc[(matches.match_hometeam_id == team_id) |
                (matches.match_awayteam_id == team_id), 'last_n_points'] = (cumulative_points.shift(1) -
                                                                           cumulative_points.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals,last_n_away_goals,match_hometeam_points,last_n_home_points,match_awayteam_points,last_n_away_points,last_n_goals,last_n_points
6004,2615,2613,273292,1577532600,Finished,0,Brighton,Bournemouth,2.0,0.0,1.0,0.0,,,,,148,0.666667,1.0,3,0.333333,0,1.0,1.2,1.0
6009,2612,2630,273296,1577541600,Finished,0,Newcastle,Everton,1.0,2.0,0.0,1.0,,,,,148,1.666667,1.333333,0,2.333333,3,0.333333,1.4,1.6
6010,2619,2614,273298,1577541600,Finished,0,Southampton,Crystal Palace,1.0,1.0,0.0,0.0,,,,,148,1.333333,0.666667,1,2.0,1,1.333333,1.6,1.8
6011,2632,2623,273299,1577541600,Finished,0,Watford,Aston Villa,3.0,0.0,1.0,0.0,,,,,148,0.666667,1.0,3,1.333333,0,0.333333,0.8,0.6
6014,2611,2620,273300,1577550600,Finished,0,West Ham,Leicester,1.0,2.0,1.0,1.0,,,,,148,1.666667,2.333333,0,0.0,3,2.0,1.6,1.4
6015,2628,2641,273297,1577550600,Finished,0,Norwich,Tottenham,2.0,2.0,1.0,0.0,,,,,148,1.333333,2.0,1,0.333333,1,2.0,0.8,0.2
6016,2627,2629,273293,1577558700,Finished,0,Burnley,Manchester Utd,0.0,2.0,0.0,1.0,,,,,148,0.666667,1.666667,0,1.0,3,1.333333,1.8,2.0
6023,2616,2617,273291,1577624400,Finished,0,Arsenal,Chelsea,1.0,2.0,1.0,0.0,,,,,148,1.0,1.333333,0,0.333333,3,1.0,1.0,1.2
6037,2646,2621,273294,1577633400,Finished,0,Liverpool,Wolves,1.0,0.0,1.0,0.0,,,,,148,3.0,2.0,3,3.0,0,2.333333,2.0,2.0
6038,2654,2626,273295,1577638800,Finished,0,Manchester City,Sheffield Utd,2.0,0.0,0.0,0.0,,,,,148,2.0,1.333333,3,2.0,0,2.333333,1.2,2.0


In [0]:
epoch = 1575484200 # 4 December 2019 - train/test split
matches = matches.dropna(subset=features)
train_matches = matches[matches['epoch'] < epoch]
test_matches = matches[matches['epoch'] >= epoch]

In [0]:
train_x = train_matches[features]
test_x = test_matches[features]
#train_x = matches[matches['match_status'] == 'Finished']
#test_x = matches[~(matches['match_status'] == 'Finished')]

train_y = np.stack([
    train_matches['match_hometeam_score'] > train_matches['match_awayteam_score'],
    train_matches['match_hometeam_score'] == train_matches['match_awayteam_score'],
    train_matches['match_hometeam_score'] < train_matches['match_awayteam_score']
])
train_y = np.transpose(train_y)
train_y = np.argmax(train_y, axis=1)

test_y = np.stack([
    test_matches['match_hometeam_score'] > test_matches['match_awayteam_score'],
    test_matches['match_hometeam_score'] == test_matches['match_awayteam_score'],
    test_matches['match_hometeam_score'] < test_matches['match_awayteam_score']
])
test_y = np.transpose(test_y)
test_y = np.argmax(test_y, axis=1)

#fields = ['match_awayteam_id', 'match_hometeam_id']
#train_x = train_x[fields]
#test_x = test_x[fields]

In [0]:
"""
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(categories='auto')
encoder.fit(np.reshape(train_y, [-1, 1]))

def rps_loss(y_true, y_preds):
    y_true = np.reshape(y_true, [-1, 1])
    y_preds = np.reshape(y_preds, [-1, 1])
    y_true = encoder.transform(y_true).toarray()
    y_preds = encoder.transform(y_preds).toarray()
    return rps(y_true, y_preds)

rps_scorer = make_scorer(rps_loss, greater_is_better=False)

random_forest = RandomForestClassifier()
print(np.mean(cross_val_score(random_forest, train_x, train_y, cv=10, scoring=rps_scorer)))

nsamples = 10
random_forest.fit(train_x[:-nsamples], train_y[:-nsamples])
preds = random_forest.predict(train_x[-nsamples:])

print(rps_loss(train_y[-nsamples:], preds))
print(train_y[-nsamples:])
print(preds)
"""

"\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.metrics.scorer import make_scorer\nfrom sklearn.preprocessing import OneHotEncoder\n\nencoder = OneHotEncoder(categories='auto')\nencoder.fit(np.reshape(train_y, [-1, 1]))\n\ndef rps_loss(y_true, y_preds):\n    y_true = np.reshape(y_true, [-1, 1])\n    y_preds = np.reshape(y_preds, [-1, 1])\n    y_true = encoder.transform(y_true).toarray()\n    y_preds = encoder.transform(y_preds).toarray()\n    return rps(y_true, y_preds)\n\nrps_scorer = make_scorer(rps_loss, greater_is_better=False)\n\nrandom_forest = RandomForestClassifier()\nprint(np.mean(cross_val_score(random_forest, train_x, train_y, cv=10, scoring=rps_scorer)))\n\nnsamples = 10\nrandom_forest.fit(train_x[:-nsamples], train_y[:-nsamples])\npreds = random_forest.predict(train_x[-nsamples:])\n\nprint(rps_loss(train_y[-nsamples:], preds))\nprint(train_y[-nsamples:])\nprint(preds)\n"

# Cross Validation

In [0]:
from sklearn.metrics import make_scorer

def cv_results(cv):
    print('--- PARAMS ---')
    print(cv.cv_results_['params'])
    print()

    print('--- BEST PARAMS ---')
    print(cv.best_params_)
    print()

    print('--- TRAIN SCORE (LOSS/ACC) ---')
    print(cv.best_estimator_.score(train_x, train_y))
    print()

    print('--- TEST SCORE (LOSS/ACC) ---')
    print(cv.best_estimator_.score(test_x, test_y))
    print()

    print('--- TRAIN RPS ---')
    print(rps(train_y, cv.best_estimator_.predict_proba(train_x)))
    print()

    print('--- TEST RPS ---')
    print(rps(test_y, cv.best_estimator_.predict_proba(test_x)))

scoring = make_scorer(rps, greater_is_better=False, needs_proba=True)

# Decision Tree

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dt_cls_cv = GridSearchCV(DecisionTreeClassifier(),
                         scoring=scoring,
                         param_grid={
                             'min_samples_leaf': np.arange(1, 5, 1),
                         },
                         cv=10,
                         n_jobs=1)
dt_cls_cv.fit(train_x, train_y)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=1,
             param_grid={'min_samples_leaf': array([1, 2, 3, 4])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=Fa

In [0]:
cv_results(dt_cls_cv)

--- PARAMS ---
[{'min_samples_leaf': 1}, {'min_samples_leaf': 2}, {'min_samples_leaf': 3}, {'min_samples_leaf': 4}]

--- BEST PARAMS ---
{'min_samples_leaf': 4}

--- TRAIN SCORE (LOSS/ACC) ---
0.75

--- TEST SCORE (LOSS/ACC) ---
0.31343283582089554

--- TRAIN RPS ---
0.12044270833333334

--- TEST RPS ---
0.3264572968490879


# Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

rf_cls_cv = GridSearchCV(RandomForestClassifier(),
                         scoring=scoring,
                         param_grid={
                             'n_estimators': np.arange(10, 50, 10),
                             'max_features': np.arange(1, 6, 1),
                             'min_samples_leaf': np.arange(1, 5, 1)
                         },
                         cv=10,
                         n_jobs=-1)
rf_cls_cv.fit(train_x, train_y)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [0]:
cv_results(rf_cls_cv)

--- PARAMS ---
[{'max_features': 1, 'min_samples_leaf': 1, 'n_estimators': 10}, {'max_features': 1, 'min_samples_leaf': 1, 'n_estimators': 20}, {'max_features': 1, 'min_samples_leaf': 1, 'n_estimators': 30}, {'max_features': 1, 'min_samples_leaf': 1, 'n_estimators': 40}, {'max_features': 1, 'min_samples_leaf': 2, 'n_estimators': 10}, {'max_features': 1, 'min_samples_leaf': 2, 'n_estimators': 20}, {'max_features': 1, 'min_samples_leaf': 2, 'n_estimators': 30}, {'max_features': 1, 'min_samples_leaf': 2, 'n_estimators': 40}, {'max_features': 1, 'min_samples_leaf': 3, 'n_estimators': 10}, {'max_features': 1, 'min_samples_leaf': 3, 'n_estimators': 20}, {'max_features': 1, 'min_samples_leaf': 3, 'n_estimators': 30}, {'max_features': 1, 'min_samples_leaf': 3, 'n_estimators': 40}, {'max_features': 1, 'min_samples_leaf': 4, 'n_estimators': 10}, {'max_features': 1, 'min_samples_leaf': 4, 'n_estimators': 20}, {'max_features': 1, 'min_samples_leaf': 4, 'n_estimators': 30}, {'max_features': 1, 'min

# Stochastic Gradient Boosting

In [0]:
from sklearn.ensemble import GradientBoostingClassifier

gb_cls_cv = GridSearchCV(GradientBoostingClassifier(),
                         scoring=scoring,
                         param_grid={
                             'learning_rate': np.arange(0.01, 0.1, 0.02),
                             'n_estimators': np.arange(10, 50, 10),
                             'max_depth': np.arange(1, 3, 1),
                             'min_samples_leaf': np.arange(1, 5, 1)
                         },
                         cv=10,
                         n_jobs=-1)
gb_cls_cv.fit(train_x, train_y)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_n...
                                                  validation_fractio

In [0]:
cv_results(gb_cls_cv)

--- PARAMS ---
[{'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 10}, {'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 20}, {'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 30}, {'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 40}, {'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 2, 'n_estimators': 10}, {'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 2, 'n_estimators': 20}, {'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 2, 'n_estimators': 30}, {'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 2, 'n_estimators': 40}, {'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 3, 'n_estimators': 10}, {'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 3, 'n_estimators': 20}, {'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 3, 'n_estimators': 30}, {'learning_rate': 0.01, 'max_depth': 1, 'min_samples_leaf': 