In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font_scale=5)
import matplotlib.cm as cm



# Font sizes used by the matplotlib defaults configured below.
# NOTE(review): BIGGER_SIZE (12) is smaller than MEDIUM_SIZE (25) — confirm this is intended.
SMALL_SIZE = 8
MEDIUM_SIZE = 25
BIGGER_SIZE = 12

# Set every default in a single update of the global rc parameters.
plt.rcParams.update({
    'font.size': SMALL_SIZE,           # default text size
    'axes.titlesize': SMALL_SIZE,      # axes title
    'axes.labelsize': MEDIUM_SIZE,     # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,     # x tick labels
    'ytick.labelsize': SMALL_SIZE,     # y tick labels
    'legend.fontsize': SMALL_SIZE,     # legend entries
    'figure.titlesize': BIGGER_SIZE,   # figure title
})

In [None]:
# Constants
# Full dataset file
fname = 'prepared.csv'

# Per-tournament test file, and the date before which training rows are kept
fname_2017, date_2017 = "rg_2017.csv", '2017-05-28'
fname_2018, date_2018 = "rg_2018.csv", '2018-05-27'

# Bookmaker odds columns; they are excluded from the model features
odds_cols = [
    # individual bookmakers, player 1
    'B365_1', 'EX_1', 'LB_1', 'CB_1', 'GB_1', 'IW_1', 'SB_1',
    # individual bookmakers, player 2
    'SB_2', 'IW_2', 'GB_2', 'CB_2', 'LB_2', 'EX_2', 'B365_2',
    # aggregated odds
    'Max_1', 'Max_2',
    'Avg_1', 'Avg_2',
    'PS_1', 'PS_2',
]

In [None]:
def get_training_data(df, date=None):
    """
    Builds the feature matrix and the target vector from a match table.
    :df: DataFrame with an "outcome" column (and a "date" column when
         `date` is given),
    :date: optional cut-off; only rows with `df.date < date` are kept

    Returns (X, y): X holds every column that is neither an
    identifier/label column nor listed in `odds_cols`, y is the
    "outcome" column.
    """
    rows = df if date is None else df[df.date < date]

    # Identifier/label columns and bookmaker odds are not model inputs
    excluded = set(['outcome', 'name_1', 'name_2', 'date', 'level'] + odds_cols)
    feature_names = [name for name in rows.columns if name not in excluded]

    return rows[feature_names], rows["outcome"]

def get_test_data(df):
    """
    Builds the feature matrix and the target vector of a test set.
    Same column selection as `get_training_data`, without any date cut-off.
    """
    features, target = get_training_data(df, date=None)
    return features, target

In [None]:
# Strategies
def kelly_criterion(b, p):
    """
    Kelly fraction for an event with two mutually exclusive outcomes.
    :b: winning odd,
    :p: winning probability

    Returns (b*p - q)/b, with q = 1 - p the losing probability.

    NOTE(review): this form of the formula takes b as the net odd (profit
    per unit staked), while the strategies in this file compute winnings as
    stake*b - stake, i.e. they use b as a decimal odd — confirm which one
    is intended.
    """
    q = 1 - p
    edge = b*p - q
    return edge/b

def clairvoyant_strategy(matches, starting_stake=1):
    """
    Clairvoyant Kelly Criterion strategy.
    Always bets the whole stake on the winner.
    :matches: DataFrame of matches; the "outcome" column (1 = player 1,
              0 = player 2) selects which player's odd is collected,
    :starting_stake: bankroll before the first match

    Returns a 1-D array with the bankroll after each match.
    """
    odds = get_best_odds(matches)  # column 0 = player 1, column 1 = player 2

    if 'outcome' in matches.columns:
        # Collect the winner's odd: column 0 when player 1 won (outcome 1),
        # column 1 otherwise. Using column 0 unconditionally would multiply
        # the stake by the loser's odd whenever player 2 won.
        outcomes = np.asarray(matches['outcome'])
        winner_col = np.where(outcomes == 1, 0, 1)
        winning_odds = odds[np.arange(len(odds)), winner_col]
    else:
        # No outcome available: keep the previous behaviour, which treats
        # player 1 as the winner of every match.
        winning_odds = odds[:, 0]

    return starting_stake * np.cumprod(winning_odds)

def kelly_strategy(matches, starting_stake=100):
    """
    Kelly strategy driven by the bookmakers' implied probabilities.
    For every match, bets on the player with the highest implied
    probability, staking the Kelly fraction of the current bankroll.
    :matches: DataFrame of matches with an "outcome" column,
    :starting_stake: bankroll before the first match

    Returns the list of bankroll values after each match.
    """
    best_odds = get_best_odds(matches)
    market_probas = get_implied_probas(best_odds)
    results = matches['outcome'] # 1 = player 1, 0 = player 2

    bankroll = starting_stake
    trajectory = []

    for match_odds, match_probas, result in zip(best_odds, market_probas, results):
        pick = np.argmax(match_probas) # Best event
        payout = match_odds[pick]
        wager = kelly_criterion(payout, match_probas[pick])*bankroll

        # Index 0 is player 1, whose win is encoded as outcome 1: an index
        # equal to the outcome therefore means the picked player lost.
        if pick != result: # win
            bankroll += (wager*payout - wager)
        else: # lose
            bankroll -= wager

        trajectory.append(bankroll)

    return trajectory


def naive_strategy(matches, probas, delta=0, constant_stake=1, starting_stake=0):
    """
    Constant-stake strategy.
    Bets `constant_stake` on a player when the model probability for that
    player exceeds both the bookmakers' implied probability and
    0.5 + delta. Player 1 is checked first, then player 2.
    :matches: DataFrame of matches with an "outcome" column,
    :probas: per-match model probabilities, index 0 = player 1,
             index 1 = player 2,
    :delta: extra margin required above 0.5,
    :constant_stake: amount staked on every played match,
    :starting_stake: bankroll before the first match

    Returns (history, successes, losses): the bankroll after each match,
    the net gains of the winning bets and the stakes of the losing bets.
    """
    best_odds = get_best_odds(matches)
    results = matches['outcome'] # 1 = player 1, 0 = player 2
    market_probas = get_implied_probas(best_odds)

    balance = starting_stake
    trajectory = []
    gains, stakes_lost = [], []
    threshold = 0.5 + delta

    for match_odds, market, model, result in zip(best_odds, market_probas, probas, results):
        pick = None # None = no bet on this match
        if model[0] > market[0] and model[0] > threshold: # Player 1
            pick = 0
        elif model[1] > market[1] and model[1] > threshold: # Player 2
            pick = 1

        if pick is not None:
            # Index 0 is player 1, whose win is encoded as outcome 1: an
            # index equal to the outcome means the picked player lost.
            if pick == result: # lose
                balance -= constant_stake
                stakes_lost.append(constant_stake)
            else:
                won = (constant_stake*match_odds[pick] - constant_stake)
                balance += won
                gains.append(won)

        trajectory.append(balance)

    return trajectory, gains, stakes_lost




def advanced_strategy(matches, probas, delta=0, starting_stake=0, max_stake=1):
    """
    Variable-stake strategy.
    For every match, considers the player the model favours and bets when
    the model probability is at least the bookmakers' implied probability
    and at least 0.5 + delta. The amount staked is the Kelly fraction of
    `max_stake`.
    :matches: DataFrame of matches with an "outcome" column,
    :probas: per-match model probabilities, index 0 = player 1,
             index 1 = player 2,
    :delta: extra margin required above 0.5,
    :starting_stake: bankroll before the first match,
    :max_stake: amount scaled by the Kelly fraction

    Returns (history, successes, losses): the bankroll after each match,
    the net gains of the winning bets and the stakes of the losing bets.
    """
    best_odds = get_best_odds(matches)
    results = matches['outcome'] # 1 = player 1, 0 = player 2
    market_probas = get_implied_probas(best_odds)

    bankroll = starting_stake
    trajectory = []
    gains, stakes_lost = [], []
    threshold = 0.5 + delta

    for match_odds, market, model, result in zip(best_odds, market_probas, probas, results):
        pick = np.argmax(model) # Best event
        payout = match_odds[pick]
        confidence = model[pick]

        worth_playing = confidence >= market[pick] and confidence >= threshold
        if worth_playing:
            wager = kelly_criterion(payout, confidence)*max_stake

            # Index 0 is player 1, whose win is encoded as outcome 1: an
            # index equal to the outcome means the picked player lost.
            if pick != result: # win
                won = (wager*payout - wager)
                bankroll += won
                gains.append(won)
            else: # lose
                bankroll -= wager
                stakes_lost.append(wager)

        trajectory.append(bankroll)

    return trajectory, gains, stakes_lost

In [None]:
def get_best_odds(matches):
    """
    Returns the most advantageous odd of each player for every match.
    :matches: DataFrame holding either the "Max_1"/"Max_2" columns or the
              individual bookmaker columns

    For each player the "Max_<player>" column is used when present,
    otherwise the row-wise maximum over the bookmaker columns.
    Returns an array of shape (n_matches, 2): column 0 = player 1,
    column 1 = player 2.
    """
    bookmakers = ['B365', 'EX', 'LB', 'CB', 'GB', 'IW', 'SB']
    per_player = []

    for player in ('1', '2'):
        max_col = 'Max_' + player
        if max_col in matches.columns:
            best = matches[max_col]
        else:
            candidates = [bookmaker + '_' + player for bookmaker in bookmakers]
            best = matches[candidates].max(axis=1)
        # One column vector per player
        per_player.append(best.values.reshape(-1, 1))

    return np.hstack(per_player)

def get_implied_probas(odds):
    """
        Returns the normalised probabilities implied by the odds.
        :odds: 2-D array-like, one row per match and one column per outcome

        Each odd is inverted, then every row is divided by its own sum so
        that the probabilities of a match add up to 1. The result has the
        same shape as `odds`.
    """
    raw = np.divide(1, odds)
    # keepdims keeps the row sums as a column vector so that the division
    # broadcasts over any number of outcome columns, not only two.
    norm = np.sum(raw, axis=1, keepdims=True)

    return np.divide(raw, norm)

In [None]:
# Whole dataset: parse the dates, replace infinite values by 0, drop the 'ht'
# column and every row with a missing value, then order the rows by date.
df = (
    pd.read_csv(fname, index_col=None, header=0, low_memory=False)
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format="%Y-%m-%d"))
    .replace([np.inf, -np.inf], 0)
    .drop(columns=['ht'])
    .dropna()
    .sort_values(by='date')
)

print(df.shape)

In [None]:
# Roland-Garros 2017
from sklearn.linear_model import LogisticRegression

# Test data: the tournament's matches, without the 'ht' column
test_df = pd.read_csv(fname_2017, index_col=None, header=0, low_memory=False).drop(['ht'], axis=1)


# Train data: every match dated before the cut-off date
X_train, y_train = get_training_data(df, date=date_2017)
print("Matches (training): " + str(X_train.shape)) # Each match appears twice


# The L1 penalty needs a solver that supports it: recent scikit-learn
# versions default to 'lbfgs', which rejects penalty='l1' with a ValueError.
# 'liblinear' supports L1; random_state pins its data shuffling so that
# re-running the cell gives the same model.
lr = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000, random_state=0).fit(X_train, y_train)

# Betting Setup
X_test, y_test = get_test_data(test_df)
print("Matches (RG 2017): {}".format(X_test.shape))
X_test = X_test.fillna(0)
probas = lr.predict_proba(X_test) # col 0 = player 2 wins,   col 1 = player 1 wins
# Flip the columns so that index 0 = player 1 and index 1 = player 2,
# matching the column order returned by get_best_odds.
probas = np.flip(probas, 1)

# Naive: one filled bankroll curve per confidence threshold (0.5 + delta)
plt.figure(figsize=(60, 20))

deltas = np.arange(0, 0.5, .05)
colors = iter(cm.rainbow(np.linspace(0, 1, len(deltas))))

constant_stake = 5 # Naive constant stake
starting_stake = 0 # Bankroll before the first bet

print("===Naive (constant stake: {} )===".format(constant_stake))
print("Deltas | Sucesses | Losses | Return ")
for delta in deltas:
    bankroll, successes, losses = naive_strategy(test_df, probas, delta=delta, constant_stake=constant_stake, starting_stake=starting_stake)
    plt.fill_between(x=range(len(bankroll)), y1=0, y2=bankroll, color=next(colors), alpha=0.6)

    # The return is measured against the starting bankroll: bankroll[0] is
    # the bankroll after the first match, so subtracting it would leave the
    # first bet out of the total (and fail on an empty history).
    final_bankroll = bankroll[-1] if len(bankroll) > 0 else starting_stake
    print("{0:.4f} | {1: <8} | {2: <6} | {3:.2f}".format(delta, len(successes), len(losses), final_bankroll - starting_stake))


plt.title("Roland Garros 2017 (Naive)", fontsize=50)
plt.xlabel("Matches", fontsize=30)
plt.xticks(fontsize=20)
plt.ylabel("Bankroll", fontsize=30)
plt.yticks(fontsize=20)
# Legend entries show the probability threshold used for each curve
plt.legend(['{0:0.2f}'.format(d+0.5) for d in deltas], loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0., fontsize=30)
plt.savefig("naive_rg2017.png")


# Advanced: one filled bankroll curve per confidence threshold (0.5 + delta)
plt.figure(figsize=(60, 20))

deltas = np.arange(0, 0.5, .05)
colors = iter(cm.rainbow(np.linspace(0, 1, len(deltas))))

max_stake = 10 # Maximum variable stake
starting_stake = 0 # Bankroll before the first bet

print("===Advanced (max stake: {} )===".format(max_stake))
print("Deltas | Sucesses | Losses | Return ")
for delta in deltas:
    bankroll, successes, losses = advanced_strategy(test_df, probas, delta=delta, starting_stake=starting_stake, max_stake=max_stake)
    plt.fill_between(x=range(len(bankroll)), y1=0, y2=bankroll, color=next(colors), alpha=0.6)

    # The return is measured against the starting bankroll: bankroll[0] is
    # the bankroll after the first match, so subtracting it would leave the
    # first bet out of the total (and fail on an empty history).
    final_bankroll = bankroll[-1] if len(bankroll) > 0 else starting_stake
    print("{0:.4f} | {1: <8} | {2: <6} | {3:.2f}".format(delta, len(successes), len(losses), final_bankroll - starting_stake))

plt.title("Roland Garros 2017 (Advanced)", fontsize=50)
plt.xlabel("Matches", fontsize=30)
plt.xticks(fontsize=20)
plt.ylabel("Bankroll", fontsize=30)
plt.yticks(fontsize=20)
# Legend entries show the probability threshold used for each curve
plt.legend(['{0:0.2f}'.format(d+0.5) for d in deltas], loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0., fontsize=30)
plt.savefig("advanced_rg2017.png")

In [None]:
# Roland-Garros 2018
from sklearn.linear_model import LogisticRegression

# Test data: the tournament's matches, without the 'ht' column
test_df = pd.read_csv(fname_2018, index_col=None, header=0, low_memory=False).drop(['ht'], axis=1)


# Train data: every match dated before the cut-off date
X_train, y_train = get_training_data(df, date=date_2018)
print("Matches (training): " + str(X_train.shape)) # Each match appears twice


# The L1 penalty needs a solver that supports it: recent scikit-learn
# versions default to 'lbfgs', which rejects penalty='l1' with a ValueError.
# 'liblinear' supports L1; random_state pins its data shuffling so that
# re-running the cell gives the same model.
lr = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000, random_state=0).fit(X_train, y_train)

# Betting Setup
X_test, y_test = get_test_data(test_df)
print("Matches (RG 2018): {}".format(X_test.shape))
X_test = X_test.fillna(0)
probas = lr.predict_proba(X_test) # col 0 = player 2 wins,   col 1 = player 1 wins
# Flip the columns so that index 0 = player 1 and index 1 = player 2,
# matching the column order returned by get_best_odds.
probas = np.flip(probas, 1)

# Naive: one filled bankroll curve per confidence threshold (0.5 + delta)
plt.figure(figsize=(60, 20))

deltas = np.arange(0, 0.5, .05)
colors = iter(cm.rainbow(np.linspace(0, 1, len(deltas))))
constant_stake = 5 # Naive constant stake
starting_stake = 0 # Bankroll before the first bet

print("===Naive (constant stake: {} )===".format(constant_stake))
print("Deltas | Sucesses | Losses | Return ")
for delta in deltas:
    bankroll, successes, losses = naive_strategy(test_df, probas, delta=delta, constant_stake=constant_stake, starting_stake=starting_stake)

    plt.fill_between(x=range(len(bankroll)), y1=0, y2=bankroll, color=next(colors), alpha=0.6)

    # The return is measured against the starting bankroll: bankroll[0] is
    # the bankroll after the first match, so subtracting it would leave the
    # first bet out of the total (and fail on an empty history).
    final_bankroll = bankroll[-1] if len(bankroll) > 0 else starting_stake
    print("{0:.4f} | {1: <8} | {2: <6} | {3:.2f}".format(delta, len(successes), len(losses), final_bankroll - starting_stake))

plt.title("Roland Garros 2018 (Naive)", fontsize=50)
plt.xlabel("Matches", fontsize=30)
plt.xticks(fontsize=20)
plt.ylabel("Bankroll", fontsize=30)
plt.yticks(fontsize=20)
# Legend entries show the probability threshold used for each curve
plt.legend(['{0:0.2f}'.format(d+0.5) for d in deltas], loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0., fontsize=30)
plt.savefig("naive_rg2018.png")


# Advanced: one filled bankroll curve per confidence threshold (0.5 + delta)
plt.figure(figsize=(60, 20))

deltas = np.arange(0, 0.5, .05)
colors = iter(cm.rainbow(np.linspace(0, 1, len(deltas))))
max_stake = 10 # Maximum variable stake
starting_stake = 0 # Bankroll before the first bet

print("===Advanced (max stake: {} )===".format(max_stake))
print("Deltas | Sucesses | Losses | Return")
for delta in deltas:
    bankroll, successes, losses = advanced_strategy(test_df, probas, delta=delta, starting_stake=starting_stake, max_stake=max_stake)
    plt.fill_between(x=range(len(bankroll)), y1=0, y2=bankroll, color=next(colors), alpha=0.6)

    # The return is measured against the starting bankroll: bankroll[0] is
    # the bankroll after the first match, so subtracting it would leave the
    # first bet out of the total (and fail on an empty history).
    final_bankroll = bankroll[-1] if len(bankroll) > 0 else starting_stake
    print("{0:.4f} | {1: <8} | {2: <6} | {3:.2f}".format(delta, len(successes), len(losses), final_bankroll - starting_stake))

plt.title("Roland Garros 2018 (Advanced)", fontsize=50)
plt.xlabel("Matches", fontsize=30)
plt.xticks(fontsize=20)
plt.ylabel("Bankroll", fontsize=30)
plt.yticks(fontsize=20)
# Legend entries show the probability threshold used for each curve
plt.legend(['{0:0.2f}'.format(d+0.5) for d in deltas], loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0., fontsize=30)
plt.savefig("advanced_rg2018.png")