In [2]:
import pandas as pd
import seaborn as sns
import pickle
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
import econtools.metrics as mt
import numpy as np
from econtools import read, outreg, table_statrow, write_notes
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
import statistics as s 
plt.style.use('seaborn-v0_8-deep')

In [16]:
original_df = pd.read_csv(
    r'C:\Users\faxul\Google Drive\RamirezRepStudy\thesisminusdata\TradeCode\paper_repstudy\notFeelingTheBuzz\Datafiles\clegg2_matches_cleaned.csv')

# Removing Best odds utliers
uncleaned_rows = len(original_df)
original_df['difference'] = original_df['inverse_avg'] - original_df['inverse_best']
nanrows = len(original_df[original_df['difference'].isnull()]) + len(original_df[original_df["inverse_b365"].isnull()])
original_df = original_df[~original_df["inverse_b365"].isnull()]
print("Nan Rows: "+str(nanrows))

std_difference = original_df['difference'].std()
threshold = 9999 * std_difference
original_df = original_df[np.abs(original_df['difference']) <= threshold]
cleaned_rows = len(original_df)

fitset = original_df[~(original_df['date'] > '2020-01-00')].copy()
testset = original_df[(original_df['date'] > '2020-01-00')].copy()

print('Removed rows: ' + str(uncleaned_rows - cleaned_rows - nanrows))
print("Total Rows: "+ str(len(original_df)))
print('Fit set rows: ' + str(len(fitset)))
print('Test set rows: ' + str(len(testset)))
print("Total Matches: " + str(len(original_df["match_id"].value_counts())))

Nan Rows: 169
Removed rows: -30
Total Rows: 27067
Fit set rows: 16161
Test set rows: 10906
Total Matches: 13544


In [17]:
original_df

Unnamed: 0.1,Unnamed: 0,match_id,WTA,player,date,rankdist,wikibuzz,outcome,inverse_best,inverse_b365,inverse_avg,difference
0,0,1540,34,Sloane_Stephens,2016-07-02,-0.037518,-1.613658,1,0.826446,0.862069,0.854701,0.028255
1,1,1540,34,Mandy_Minella,2016-07-02,0.037518,-1.613658,0,0.178571,0.200000,0.202020,0.023449
2,2,1541,34,Timea_Bacsinszky,2016-07-02,-0.069632,-0.179462,1,0.746269,0.781250,0.775194,0.028925
3,3,1541,34,Monica_Niculescu,2016-07-02,0.069632,-0.179462,0,0.259740,0.266667,0.283286,0.023546
4,4,1542,34,Ekaterina_Makarova,2016-07-02,0.071429,1.091749,1,0.319489,0.363636,0.347222,0.027733
...,...,...,...,...,...,...,...,...,...,...,...,...
27201,27221,15831,55,Caroline_Garcia,2022-11-06,0.033333,1.278589,1,0.404858,0.454545,0.431034,0.026176
27202,27222,15832,55,Iga_Swiatek,2022-11-07,-0.857143,0.501991,0,0.833333,0.862069,0.862069,0.028736
27203,27223,15832,55,Aryna_Sabalenka,2022-11-07,0.857143,0.501991,1,0.169492,0.222222,0.190476,0.020985
27204,27224,15833,55,Caroline_Garcia,2022-11-08,-0.023810,0.249051,1,0.478469,0.500000,0.502513,0.024044


In [9]:
def fit_regression(rankdist: bool = True, wikibuzz: bool = True):
    if wikibuzz and rankdist:
        results = mt.reg(fitset,'outcome',                      
        ['inverse_avg', 'rankdist', 'wikibuzz'],
        #fe_name= "year",
        cluster='match_id',addcons=True)
        params = [results.beta[-1], results.beta[0], results.beta[1], results.beta[2]]
        #params= [constant        , inverse_avg    , rankdist       , wikibuzz       ]
    elif wikibuzz and not rankdist:
        results = mt.reg(fitset,'outcome',
        ['inverse_avg', 'wikibuzz'],
        #fe_name= "year",
        cluster='match_id',addcons=True)
        params = [results.beta[-1], results.beta[0], results.beta[1]]
        #params= [constant        , inverse_avg    , wikibuzz       ]
    elif rankdist and not wikibuzz:
        results = mt.reg(fitset,'outcome',
        ['inverse_avg', 'rankdist'],
        #fe_name= "year",
        cluster='match_id',addcons=True)
        params = [results.beta[-1], results.beta[0], results.beta[1]]
        #params= [constant        , inverse_avg    , rankdist       ]
    elif not rankdist and not wikibuzz:
        results = mt.reg(fitset,'outcome',                      
        ['inverse_avg'],
        #fe_name= "year",
        cluster='match_id',addcons=True)
        params = [results.beta[-1], results.beta[0]]
        #params= [constant        , inverse_avg    ]
    print(results)
    return params

def simple_probability(params: pd.Series,
                         inverse_odds: float):
    y_hat = params[0] + params[1]*inverse_odds
    return y_hat

def estimate_probability(params: pd.Series,
                         inverse_odds: float,
                         rankdist: float = None,
                         wikibuzz: float = None) -> float:
    if rankdist and wikibuzz:
        y_hat = params[0] + params[-3]*inverse_odds + params[-2]*rankdist + params[-1]*wikibuzz
    if wikibuzz and not rankdist:
        y_hat = params[0] + params[-2]*inverse_odds + params[-1]*wikibuzz
    if rankdist and not wikibuzz:
        y_hat = params[0] + params[-2] * inverse_odds + params[-1] * rankdist
    if not rankdist and not wikibuzz:
        y_hat = params[0] + params[1]*inverse_odds
    return y_hat

def kelly_criterion(odds_implied_prob, y_hat):
    odds = 1/odds_implied_prob
    k = max(y_hat - ((1 - y_hat)/(odds - 1)), 0)
    return k

def profit_loss(kelly, odds_implied_prob, outcome):
    odds = 1 / odds_implied_prob
    if outcome == 1:
        PnL = kelly*odds - kelly
    if outcome == 0:
        PnL = -kelly
    return PnL

def show_results(testset, params, odds, rankdist: bool = True, wikibuzz: bool = True): # Odds = "avg", "best", "b365"

    # Estimate Probability.
    if rankdist and wikibuzz:
        testset['model_est_prob'] = [estimate_probability(params, row[0], row[1], row[2]) for row in zip(testset['inverse_avg'],
                                                                                                         testset['rankdist'],
                                                                                                         testset['wikibuzz'])]
    elif wikibuzz and not rankdist:
        testset['model_est_prob'] = [estimate_probability(params, inverse_odds=row[0], wikibuzz=row[1]) for row in zip(testset['inverse_avg'],
                                                                                                                       testset['wikibuzz'])]
    elif rankdist and not wikibuzz:
        testset['model_est_prob'] = [estimate_probability(params, inverse_odds=row[0], rankdist=row[1]) for row in zip(testset['inverse_avg'],
                                                                                                                       testset['rankdist'])]
    elif not rankdist and not wikibuzz:
        testset['model_est_prob'] = [simple_probability(params, inverse_odds=x) for x in testset['inverse_avg']]          
    # Determine kelly bet size.
    testset['kelly_betsize'] = [kelly_criterion(row[0], row[1]) for row in zip(testset['inverse_{}'.format(odds)],
                                                                               testset['model_est_prob'])]
    # Calculate return for the bet.
    testset['PnL'] = [profit_loss(row[0], row[1], row[2]) for row in zip(testset['kelly_betsize'],testset['inverse_{}'.format(odds)],
                                                                         testset['outcome'])]

    print('Bets considered (2 * Matches): '+str(len(testset[testset['inverse_{}'.format(odds)].notna()])))
    print('Number of bets placed: '+str(len(testset.loc[(testset["PnL"].notna()) & testset["PnL"] != 0])))
    print("Mean overround (%): "+str((testset.sort_values(by="match_id")["inverse_{}".format(odds)].rolling(2).sum()[1::2].mean()-1)*100))
    print('Absolute amount bet: '+str(testset['kelly_betsize'].sum()))
    print('Absolute return: '+str(testset['PnL'].sum()))
    print('ROI (%): '+str((((testset['PnL'].sum() + testset['kelly_betsize'].sum())/testset['kelly_betsize'].sum())-1)*100))

def plot_results():
    testset['Cum_PnL'] = testset['PnL'].cumsum() * 100
    plt.rcParams.update({'font.size': 16})
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    testset.plot('date', 'Cum_PnL', ax=ax, linewidth=2.5)
    
    ax.set_ylabel('Absolute Return (%)')
    ax.set_xlabel('Date')
    ax.get_legend().remove()
    fig.autofmt_xdate()
    
    ax.set_ylim(-100, 500)
    plt.axhline(y=0, color='grey').set_linewidth(2.5)
    
    plt.show()
    
def correct_kelly_results(odds, rankdist: bool = True, wikibuzz: bool = True): # Odds = "avg", "best", "b365"
    
    params = fit_regression(rankdist=rankdist,wikibuzz=wikibuzz)
    show_results(testset, params, odds, rankdist=rankdist, wikibuzz=wikibuzz)
    
    testset['bankroll_after'] = testset['PnL'] + 1
    testset['bankroll_after'] = testset['bankroll_after'].cumprod()
    testset['bankroll_before'] = testset.bankroll_after.shift(1)
    testset['betsize_of_inital_bankroll'] = testset['kelly_betsize'] * testset['bankroll_before']
    testset['pnl_of_initial_bankroll'] = testset['PnL'] * testset['bankroll_before']
    print("---------------")
    print("*Correct Kelly*")
    print('Bets considered (2 * Matches): '+str(len(testset[testset['inverse_{}'.format(odds)].notna()])))
    print('Number of bets placed: '+str(len(testset.loc[(testset["PnL"].notna()) & testset["PnL"] != 0])))
    print("Mean overround (%): "+str((testset.sort_values(by="match_id")["inverse_{}".format(odds)].rolling(2).sum()[1::2].mean()-1)*100))
    print('Absolute amount bet: ' + str(testset['betsize_of_inital_bankroll'].sum()))
    print('Absolute return: ' + str(testset['pnl_of_initial_bankroll'].sum()))
    print('ROI (%): ' + str((((testset['pnl_of_initial_bankroll'].sum() + testset['betsize_of_inital_bankroll'].sum()) / testset[
    'betsize_of_inital_bankroll'].sum()) - 1)*100))
    
    
def correct_kelly_plot_results():
    testset["Cum_PnL"] = (testset["bankroll_after"] -1) * 100
    plt.rcParams.update({'font.size': 16})
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    testset.plot('date', 'Cum_PnL', ax=ax, linewidth=2.5)
    ax.set_ylabel('Absolute Return (%)')
    ax.set_xlabel('Date')
    ax.get_legend().remove()
    fig.autofmt_xdate()
    ax.set_ylim(-100, 200)
    plt.axhline(y=0, color='grey').set_linewidth(2.5)
    
    plt.show()

In [11]:
params = fit_regression(rankdist=True,wikibuzz=False)
correct_kelly_results(odds="best", rankdist=True, wikibuzz=False)
#plot_results()

Dependent variable:	outcome
N:			15863
R-squared:		0.1622
Estimation method:	OLS
VCE method:		Cluster
  Cluster variable:	  match_id
  No. of clusters:	  8078
             coeff    se      t   p>t CI_low CI_high
inverse_avg  1.030 0.025 40.431 0.000  0.980   1.080
rankdist     0.054 0.031  1.752 0.080 -0.006   0.115
_cons       -0.042 0.013 -3.146 0.002 -0.069  -0.016

Dependent variable:	outcome
N:			15863
R-squared:		0.1622
Estimation method:	OLS
VCE method:		Cluster
  Cluster variable:	  match_id
  No. of clusters:	  8078
             coeff    se      t   p>t CI_low CI_high
inverse_avg  1.030 0.025 40.431 0.000  0.980   1.080
rankdist     0.054 0.031  1.752 0.080 -0.006   0.115
_cons       -0.042 0.013 -3.146 0.002 -0.069  -0.016

Bets considered (2 * Matches): 10426
Number of bets placed: 5830
Mean overround (%): -0.6420970586615837
Absolute amount bet: 174.69433861822836
Absolute return: 5.229679988407628
ROI (%): 2.9936173259949905
---------------
*Correct Kelly*
Bets considered 

In [25]:
params

[-0.03117612050374522, 1.0091558426475247, -9.970396734074412e-05]

In [110]:
%%time
rankdist = True
wikibuzz = False
odds = 'b365'
fractional_kelly = 0.5
iterations = 1000


params = fit_regression(rankdist=rankdist,wikibuzz=wikibuzz)
show_results(testset, params=params, odds=odds, rankdist=rankdist,wikibuzz=wikibuzz)
# Set initial bankroll
initial_bankroll = 1000

# Calculate cumulative returns with changing bankroll
def calculate_cumulative_returns(data, initial_bankroll):
    data['cumulative_pnl'] = 0.0
    current_bankroll_wrong = initial_bankroll
    current_bankroll_proper = initial_bankroll
    amount_bet_wrong = 0.0
    amount_bet_proper = 0.0
    
    for i, row in data.iterrows():
        adjusted_pnl = row['PnL'] * fractional_kelly
        betsize = row['kelly_betsize'] * fractional_kelly
                
        current_bankroll_wrong += adjusted_pnl * initial_bankroll
        data.at[i, 'cumulative_pnl_wrong'] = current_bankroll_wrong
        amount_bet_wrong += betsize * initial_bankroll
        data.at[i, 'cumulative_amount_bet_wrong'] = amount_bet_wrong

        current_bankroll_proper += adjusted_pnl * current_bankroll_proper
        data.at[i, 'cumulative_pnl_proper'] = current_bankroll_proper
        amount_bet_proper += betsize * current_bankroll_proper
        data.at[i, 'cumulative_amount_bet_proper'] = amount_bet_proper

    return data

absolutereturn_wrong = []
absolutereturn_proper = []

amount_bet_wrong = []
amount_bet_proper = []
strategy_stops_counter = 0

for i in range(0, iterations):
    
    testset_it = testset.sample(frac=0.75).reset_index(drop=True)
    # Call the function with your data
    result = calculate_cumulative_returns(testset_it, initial_bankroll)
    min_index = result["cumulative_pnl_wrong"].idxmin()
    if result.loc[min_index, 'cumulative_pnl_wrong'] < 0:
        absolutereturn_wrong.append(0)
        amount_bet_wrong.append(result.loc[min_index, 'cumulative_amount_bet_wrong'])
    elif (result.loc[min_index, 'cumulative_pnl_wrong'] > 0) & ((result.loc[min_index, 'kelly_betsize'] * 1000 *fractional_kelly) > result.loc[min_index, 'cumulative_pnl_wrong']):
        absolutereturn_wrong.append(result.loc[min_index, 'cumulative_pnl_wrong'])
        amount_bet_wrong.append(result.loc[min_index, 'cumulative_amount_bet_wrong'])
        strategy_stops_counter += 1
    else:
        absolutereturn_wrong.append(result["cumulative_pnl_wrong"].iloc[-1])
        amount_bet_wrong.append(result["cumulative_amount_bet_wrong"].iloc[-1])

    absolutereturn_proper.append(result["cumulative_pnl_proper"].iloc[-1])
    amount_bet_wrong.append(result["cumulative_amount_bet_proper"].iloc[-1])

cumpnls_wrong_returns = [((x- 1000)) for x in absolutereturn_wrong]
cumpnls_proper_returns = [((x- 1000)) for x in absolutereturn_proper]

Dependent variable:	outcome
N:			15863
R-squared:		0.1622
Estimation method:	OLS
VCE method:		Cluster
  Cluster variable:	  match_id
  No. of clusters:	  8078
             coeff    se      t   p>t CI_low CI_high
inverse_avg  1.030 0.025 40.431 0.000  0.980   1.080
rankdist     0.054 0.031  1.752 0.080 -0.006   0.115
_cons       -0.042 0.013 -3.146 0.002 -0.069  -0.016

Bets considered (2 * Matches): 10426
Number of bets placed: 406
Mean overround (%): 5.2346350898522465
Absolute amount bet: 8.457169908728918
Absolute return: -0.9544393094848083
ROI (%): -11.285563844468827
CPU times: total: 14min 5s
Wall time: 14min 14s


In [111]:
proper_75loss = sum(1 for x in cumpnls_proper_returns if x < -750)
improper_75loss = sum(1 for x in cumpnls_wrong_returns if x < -750)
proper_bustrate = sum(1 for x in cumpnls_proper_returns if x == -1000)
improper_bustrate = sum(1 for x in cumpnls_wrong_returns if x == -1000)

print("Proper Bust Rate: "+str(proper_bustrate))
print("Proper >75% Loss Rate: "+str(proper_75loss))
print("Proper Kelly Mean Returns: "+str(s.mean(cumpnls_proper_returns)/1000))
print("Proper Standard Deviation: "+str(np.std(cumpnls_proper_returns)/1000))
print("Proper Sharpe Ratio: "+str((s.mean(cumpnls_proper_returns) / np.std(cumpnls_proper_returns))))
print("-----")
print("Improper Bust Rate: "+str(improper_bustrate))
print("Improper >75% Loss Rate: "+str(improper_75loss))
print("Improper Kelly Mean Returns: "+str(s.mean(cumpnls_wrong_returns)/1000))
print("Improper Standard Deviation: "+str(np.std(cumpnls_wrong_returns)/1000))
print("Improper Sharpe Ratio: "+str((s.mean(cumpnls_wrong_returns) / np.std(cumpnls_wrong_returns))))
#strategystopscounter = 152

Proper Bust Rate: 0
Proper >75% Loss Rate: 0
Proper Kelly Mean Returns: -0.3149963961587429
Proper Standard Deviation: 0.07511189897098768
Proper Sharpe Ratio: -4.193695013361488
-----
Improper Bust Rate: 0
Improper >75% Loss Rate: 0
Improper Kelly Mean Returns: -0.3607648246999024
Improper Standard Deviation: 0.11011692506750582
Improper Sharpe Ratio: -3.2761977732191485


In [None]:
# Saving the lists to a file using pickle
# with open('std5_half_kelly_lists.pickle', 'wb') as file:
#     pickle.dump(cumpnls_wrong_returns, file)
#     pickle.dump(cumpnls_proper_returns, file)
    
#Loading the lists from the file using pickle
# with open('std3_half_kelly_lists.pickle', 'rb') as file:
#     cumpnls_wrong_returns = pickle.load(file)
#     cumpnls_proper_returns = pickle.load(file)    