In [18]:
import pandas as pd
import numpy as np
import altair as alt
import requests
from bs4 import BeautifulSoup
import json
from sklearn.model_selection import train_test_split
from scipy.integrate import quad
from scipy.stats import norm
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, roc_auc_score

In [4]:
from true_skill_through_time import *

In [5]:
games_ge_40_train_df = pd.read_csv("data/oos_eval/games_ge_40_train.csv")
games_ge_40_test_df = pd.read_csv("data/oos_eval/games_ge_40_test.csv")

In [6]:
games_ge_40_train_df.timestamp = pd.to_datetime(games_ge_40_train_df.timestamp)
games_ge_40_test_df.timestamp = pd.to_datetime(games_ge_40_test_df.timestamp)

In [7]:
len(games_ge_40_train_df)

16946

In [8]:
self = TrueSkillThroughTimeApplied(games_ge_40_train_df)

In [9]:
print(games_ge_40_train_df.head().to_string())

   game_index            winner                     loser  timestamp  time_0_to_999_int
0           3  John L. Sullivan               Jack Curley 1879-03-13                  0
1           4  John L. Sullivan        Johnny Cocky Woods 1879-03-14                  0
2           6  John L. Sullivan              George Rooke 1879-06-28                  2
3           7  John L. Sullivan                 Dan Dwyer 1879-06-28                  2
4           9  John L. Sullivan  Professor John Donaldson 1880-12-24                 12


In [8]:
self.learn_optimal_parameters()


gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 6257.7669

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 6257.7669

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 6257.7669

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 6257.7669

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 11375.0479

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 11375.0480

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 11375.0479

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 11375.0479

gamma: 0.0281, sigma: 0.0281, beta: 0.0281, 
NLE: 5976.7386

gamma: 0.0281, sigma: 0.0281, beta: 0.0281, 
NLE: 5976.7386

gamma: 0.0281, sigma: 0.0281, beta: 0.0281, 
NLE: 5976.7386

gamma: 0.0281, sigma: 0.0281, beta: 0.0281, 
NLE: 5976.7386

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 5902.5022

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 5902.5022

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 5902.5022

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 5902.5022

gamma: 0.1000, sigm

In [10]:
#NLE: 5787.2494
optimal_gamma = 0.0740
optimal_sigma = 0.6008
optimal_beta = 0.2849

In [11]:
self.set_optimal_parameters(gamma = optimal_gamma, sigma = optimal_sigma, beta = optimal_beta)

In [12]:
skill_curves = self.set_skill_curves()

In [13]:
# https://sports.betmgm.com/en/blog/ranking-the-worlds-top-10-boxers/
# supringly Mike Tyson is not on this list
top_boxers_online = ['Muhammad Ali', 'Joe Louis', 'Sugar Ray Robinson', 'Rocky Marciano', 'Floyd Mayweather Jr', 'Manny Pacquiao', 'Jack Dempsey', 'Roberto Durán', 'Henry Armstrong', 'Willie Pep']

In [13]:
self.plot_calibration()

  bucket_means = df.groupby('win_prob_bucket').agg(


### remove games that appear in both train and test set(to avoid data leak)

In [14]:
games_ge_40_test_unique_df = games_ge_40_test_df[~games_ge_40_test_df.game_index.isin(games_ge_40_train_df.game_index.tolist())] 

### for those not on the skill_curve(because they are not in the training set, drop them)

In [15]:
len(games_ge_40_test_unique_df), len(games_ge_40_test_df)

(4129, 4387)

In [17]:
self.plot_calibration_oos(oos_data=games_ge_40_test_df)

  bucket_means = df.groupby('win_prob_bucket').agg(


In [16]:
self.plot_calibration_oos(oos_data=games_ge_40_test_unique_df)

  bucket_means = df.groupby('win_prob_bucket').agg(


### roc_auc_score on the unique test set, use the last available mu and sigma to compute win_prob

In [17]:
games_ge_40_test_unique_df.head()

Unnamed: 0,game_index,winner,loser,timestamp
0,67,John L. Sullivan,Alf Greenfield,1885-01-12
1,74,John L. Sullivan,Jack Burke,1885-06-13
2,77,John L. Sullivan,Dominick McCaffrey,1885-08-29
3,104,John L. Sullivan,Frank Herald,1886-09-18
4,108,John L. Sullivan,Paddy Ryan,1886-11-13


In [26]:
games_ge_40_test_unique_df['roc_label'] = games_ge_40_test_unique_df.apply(lambda row: row.winner < row.loser, axis=1).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_ge_40_test_unique_df['roc_label'] = games_ge_40_test_unique_df.apply(lambda row: row.winner < row.loser, axis=1).astype(int)


In [36]:
def assign_players(row):
    p1, p2 = sorted([row['winner'], row['loser']])
    return pd.Series([p1, p2], index=['player1', 'player2'])

In [37]:
games_ge_40_test_unique_df[['player1', 'player2']] = games_ge_40_test_unique_df.apply(assign_players, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_ge_40_test_unique_df[['player1', 'player2']] = games_ge_40_test_unique_df.apply(assign_players, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_ge_40_test_unique_df[['player1', 'player2']] = games_ge_40_test_unique_df.apply(assign_players, axis=1)


In [22]:
curves_map = {k: {t: n for t, n in v} for k, v in skill_curves.items()}
last_curves_map = {k: v[-1][1] for k, v in self.skill_curves.items()}

In [42]:
df = []
for _, row in games_ge_40_test_unique_df.iterrows():
    c1, c2 = row['player1'], row['player2']
    if c1 in last_curves_map and c2 in last_curves_map:
        normal_1, normal_2 = last_curves_map[c1], last_curves_map[c2]
        mu_diff = normal_1.mu - normal_2.mu
        sigma2_diff = normal_1.sigma ** 2 + normal_2.sigma ** 2 + 2 * (self.beta_optimal ** 2)
        #use norm.cdf to speed up the prob calculation, P(X > 0) = 1 - P(X ≤ 0)
        c1_win_prob = 1 - norm.cdf(0, mu_diff, sigma2_diff ** .5)
        df.append([c1, c2, c1_win_prob])
df = pd.DataFrame(df, columns=['player1', 'player2', 'player1_win_prob']).dropna()

In [43]:
merged_df = pd.merge(games_ge_40_test_unique_df, df, on=['player1', 'player2'])

In [47]:
merged_df['player1_mu'] = merged_df.apply(lambda row: last_curves_map[row.player1].mu, axis=1)
merged_df['player2_mu'] = merged_df.apply(lambda row: last_curves_map[row.player2].mu, axis=1)
merged_df['player1_mu_greater'] = merged_df.apply(lambda row: row.player1_mu > row.player2_mu, axis=1).astype(int)

In [48]:
merged_df.head()

Unnamed: 0,game_index,winner,loser,timestamp,roc_label,player1,player2,player1_win_prob,player1_mu,player2_mu,player1_mu_greater
0,67,John L. Sullivan,Alf Greenfield,1885-01-12,0,Alf Greenfield,John L. Sullivan,0.568026,-0.030131,-0.258278,1
1,74,John L. Sullivan,Jack Burke,1885-06-13,0,Jack Burke,John L. Sullivan,0.563801,-0.044322,-0.258278,1
2,108,John L. Sullivan,Paddy Ryan,1886-11-13,1,John L. Sullivan,Paddy Ryan,0.445887,-0.258278,-0.07845,0
3,147,"Jack ""Nonpareil"" Dempsey",Billy Baker,1888-02-18,0,Billy Baker,"Jack ""Nonpareil"" Dempsey",0.07733,-0.142781,0.93754,0
4,163,John L. Sullivan,Jake Kilrain,1889-07-08,0,Jake Kilrain,John L. Sullivan,0.54751,-0.101239,-0.258278,1


In [46]:
merged_df.roc_label.value_counts()

roc_label
1    1449
0    1249
Name: count, dtype: int64

In [45]:
roc_auc_score(merged_df.roc_label, merged_df.player1_win_prob)

np.float64(0.7421368426694428)

In [49]:
roc_auc_score(merged_df.roc_label, merged_df.player1_mu_greater)

np.float64(0.6872559469245513)

In [50]:
#merged_df.to_csv('data/boxing_oos_analysis.csv', index=False)