In [124]:
import pandas as pd
import numpy as np
import altair as alt
import requests
from bs4 import BeautifulSoup
import json
from sklearn.model_selection import train_test_split
from scipy.integrate import quad
from scipy.stats import norm
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, roc_auc_score
from tqdm import tqdm

In [2]:
from true_skill_through_time import *

In [150]:
games = pd.read_parquet("data/tennis_matches_refined_tstt.parquet")

In [151]:
total_matches_df = games.winner.value_counts().add(games.loser.value_counts(), fill_value=0).sort_values()

In [152]:
total_matches_df.describe()

count    16784.000000
mean        38.882984
std        115.208091
min          1.000000
25%          1.000000
50%          3.000000
75%         13.000000
max       1425.000000
Name: count, dtype: float64

In [153]:
players_ge_40_matches_df = total_matches_df[total_matches_df>=40.0]

In [154]:
len(players_ge_40_matches_df)

2502

In [155]:
players_ge_40_matches_df = pd.DataFrame(players_ge_40_matches_df, columns=['count']).reset_index()

In [156]:
players_ge_40_matches_df = players_ge_40_matches_df.rename(columns={'index': 'player'})

In [157]:
players_ge_40_matches_df.tail()

Unnamed: 0,player,count
2497,r419,1081.0
2498,s424,1084.0
2499,f401,1134.0
2500,n409,1175.0
2501,f324,1425.0


In [10]:
#players_ge_40_matches_df.to_csv("data/tennis_players_ge_40_matches.csv", index=False)

In [158]:
players_ge_40_matches_lst = players_ge_40_matches_df.player.unique().tolist()

In [16]:
with open("data/tennis_players_ge_40_matches_lst.json", "w") as f:
    json.dump(players_ge_40_matches_lst, f, indent=4)

In [159]:
games_ge_40 = games[games.winner.isin(players_ge_40_matches_lst) | games.loser.isin(players_ge_40_matches_lst)]

In [19]:
#games_ge_40 = games_ge_40.rename(columns={'index': 'game_index'})

In [160]:
len(games_ge_40), len(games)

(310965, 326306)

### train test split (80:20) based on time for each one of 2502 players who has played at least 40 matches

In [161]:
winners = games_ge_40[['winner', 'timestamp', 'match_id']].copy()
winners['result'] = 1
winners = winners.rename(columns={'winner': 'player'})

# Create a dataframe for losers
losers = games_ge_40[['loser', 'timestamp', 'match_id']].copy()
losers['result'] = 0
losers = losers.rename(columns={'loser': 'player'})

# Concatenate winners and losers dataframes
result_df = pd.concat([winners, losers], ignore_index=True)

# Sort the resulting dataframe by timestamp
result_df = result_df.sort_values(['player', 'timestamp']).reset_index(drop=True)

In [162]:
players_ge_40_matches_df = result_df[result_df.player.isin(players_ge_40_matches_lst)].reset_index().iloc[:, 1:]

In [163]:
players_ge_40_matches_df.tail()

Unnamed: 0,player,timestamp,match_id,result
576717,z419,2020-02-16,398202003bc65z419,0
576718,z419,2020-02-23,496202009i305z419,0
576719,z419,2020-03-01,9162202005v490z419,0
576720,z419,2020-03-15,3618202003z419r09x,1
576721,z419,2020-03-15,3618202004z419dc76,1


In [164]:
players_ge_40_matches_df['player'].nunique()

2502

In [165]:
len(games_ge_40), len(winners), len(losers)

(310965, 310965, 310965)

In [187]:
len(result_df), len(players_ge_40_matches_df), len(players_ge_40_matches_df.drop_duplicates())

(621930, 576722, 576722)

In [167]:
test_size = 0.2
train_data = []
test_data = []

for player in tqdm(players_ge_40_matches_df['player'].unique()):
    player_data = players_ge_40_matches_df[players_ge_40_matches_df['player'] == player].sort_values('timestamp')
    
    # Ensure we have enough data to split
    if len(player_data) > 1:
        player_train, player_test = train_test_split(player_data, test_size=test_size, shuffle=False)
        train_data.append(player_train)
        test_data.append(player_test)
    else:
        print(f'player %s has only one match'%(player))
        # If only one match, add it to training data
        train_data.append(player_data)



100%|██████████| 2502/2502 [00:40<00:00, 61.60it/s]


In [168]:
train_df = pd.concat(train_data, ignore_index=True)
test_df = pd.concat(test_data, ignore_index=True)

In [170]:
len(train_df), len(test_df), len(games_ge_40), len(players_ge_40_matches_df)

(460385, 116337, 310965, 576722)

In [33]:
# train_df.to_csv("data/oos_eval_tennis/train.csv", index=False)
# test_df.to_csv("data/oos_eval_tennis/test.csv", index=False)

In [188]:
games_ge_40.head()

Unnamed: 0,winner,loser,match_id,timestamp
100,j665,pk14,560191505j665pk14,1915-09-07
106,w835,cl07,560191505w835cl07,1915-09-07
96,he41,af01,560191505he41af01,1915-09-07
86,hf06,j664,560191505hf06j664,1915-09-07
83,w830,i384,560191505w830i384,1915-09-07


In [189]:
train_df.head()

Unnamed: 0,player,timestamp,match_id,result
0,a002,1980-12-21,353198004a002c073,1
1,a002,1980-12-21,359198004a002v294,1
2,a002,1980-12-21,361198004k009a002,0
3,a002,1980-12-21,353198003h026a002,0
4,a002,1980-12-21,359198003m141a002,0


In [233]:
games_ge_40_train_df_w.head()

Unnamed: 0,winner,loser,match_id,timestamp_x,player,timestamp_y
0,j665,pk14,560191505j665pk14,1915-09-07,j665,1915-09-07
1,w835,cl07,560191505w835cl07,1915-09-07,w835,1915-09-07
2,he41,af01,560191505he41af01,1915-09-07,he41,1915-09-07
3,w830,i384,560191505w830i384,1915-09-07,w830,1915-09-07
4,j665,mt38,560191500j665mt38,1915-09-07,j665,1915-09-07


In [234]:
games_ge_40_train_df_w = pd.merge(games_ge_40[['winner', 'loser', 'match_id', 'timestamp']], train_df[['player', 'match_id', 'timestamp']], left_on=['winner', 'match_id', 'timestamp'], right_on=['player', 'match_id', 'timestamp'], how='inner')
games_ge_40_train_df_l = pd.merge(games_ge_40[['winner', 'loser', 'match_id', 'timestamp']], train_df[['player', 'match_id', 'timestamp']], left_on=['loser', 'match_id', 'timestamp'], right_on=['player', 'match_id', 'timestamp'], how='inner')
games_ge_40_train_df = pd.concat([games_ge_40_train_df_w[['winner', 'loser', 'match_id', 'timestamp']], games_ge_40_train_df_l[['winner', 'loser', 'match_id', 'timestamp']]]).drop_duplicates()

In [235]:
len(games_ge_40_train_df_w), len(games_ge_40_train_df_l), len(games_ge_40_train_df)

(247286, 213099, 284664)

In [236]:
games_ge_40_test_df_w = pd.merge(games_ge_40[['winner', 'loser', 'match_id', 'timestamp']], test_df[['player', 'match_id', 'timestamp']], left_on=['winner', 'match_id', 'timestamp'], right_on=['player', 'match_id', 'timestamp'], how='inner')
games_ge_40_test_df_l = pd.merge(games_ge_40[['winner', 'loser', 'match_id', 'timestamp']], test_df[['player', 'match_id', 'timestamp']], left_on=['loser', 'match_id', 'timestamp'], right_on=['player', 'match_id', 'timestamp'], how='inner')
games_ge_40_test_df = pd.concat([games_ge_40_test_df_w[['winner', 'loser', 'match_id', 'timestamp']], games_ge_40_test_df_l[['winner', 'loser', 'match_id', 'timestamp']]]).drop_duplicates()

In [237]:
len(games_ge_40_test_df_w), len(games_ge_40_test_df_l), len(games_ge_40_test_df)

(55864, 60473, 99218)

In [238]:
len(games_ge_40_train_df), len(games_ge_40_test_df), len(games_ge_40)

(284664, 99218, 310965)

In [239]:
games_ge_40_train_df.head()

Unnamed: 0,winner,loser,match_id,timestamp
0,j665,pk14,560191505j665pk14,1915-09-07
1,w835,cl07,560191505w835cl07,1915-09-07
2,he41,af01,560191505he41af01,1915-09-07
3,w830,i384,560191505w830i384,1915-09-07
4,j665,mt38,560191500j665mt38,1915-09-07


In [240]:
#del games_ge_40_train_df['player']
games_ge_40_train_df = games_ge_40_train_df.sort_values('timestamp')
#del games_ge_40_test_df['player']
games_ge_40_test_df = games_ge_40_test_df.sort_values('timestamp')

In [241]:
games_ge_40_test_unique_df = games_ge_40_test_df[~games_ge_40_test_df.match_id.isin(games_ge_40_train_df.match_id.tolist())] 

In [242]:
len(games_ge_40_test_unique_df)

26301

In [243]:
# games_ge_40_train_df.to_csv("data/oos_eval_tennis/games_ge_40_train.csv", index=False)
# games_ge_40_test_unique_df.to_csv("data/oos_eval_tennis/games_ge_40_test_unique.csv", index=False)

In [244]:
self = TrueSkillThroughTimeApplied(games_ge_40_train_df)

In [245]:
self.learn_optimal_parameters()


gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 183102.0220

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 183102.0212

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 183102.0220

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 183102.0220

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 189074.3190

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 189074.3196

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 189074.3189

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 189074.3190

gamma: 0.0260, sigma: 0.0260, beta: 0.0260, 
NLE: 182564.0185

gamma: 0.0260, sigma: 0.0260, beta: 0.0260, 
NLE: 182564.0179

gamma: 0.0260, sigma: 0.0260, beta: 0.0260, 
NLE: 182564.0185

gamma: 0.0260, sigma: 0.0260, beta: 0.0260, 
NLE: 182564.0185

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 180774.8338

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 180774.8344

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 180774.8338

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 1807

In [246]:
#NLE: 177618.9780
optimal_gamma = 0.0500
optimal_sigma = 0.5000
optimal_beta = 0.5000
self.set_optimal_parameters(gamma = optimal_gamma, sigma = optimal_sigma, beta = optimal_beta)

In [247]:
skill_curves = self.set_skill_curves()

In [248]:
self.plot_calibration()

  bucket_means = df.groupby('win_prob_bucket').agg(


In [249]:
self.plot_calibration_oos(oos_data=games_ge_40_test_unique_df)

  bucket_means = df.groupby('win_prob_bucket').agg(


In [250]:
games_ge_40_test_unique_df['roc_label'] = games_ge_40_test_unique_df.apply(lambda row: row.winner < row.loser, axis=1).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_ge_40_test_unique_df['roc_label'] = games_ge_40_test_unique_df.apply(lambda row: row.winner < row.loser, axis=1).astype(int)


In [251]:
def assign_players(row):
    p1, p2 = sorted([row['winner'], row['loser']])
    return pd.Series([p1, p2], index=['player1', 'player2'])

In [252]:
games_ge_40_test_unique_df[['player1', 'player2']] = games_ge_40_test_unique_df.apply(assign_players, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_ge_40_test_unique_df[['player1', 'player2']] = games_ge_40_test_unique_df.apply(assign_players, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_ge_40_test_unique_df[['player1', 'player2']] = games_ge_40_test_unique_df.apply(assign_players, axis=1)


In [253]:
curves_map = {k: {t: n for t, n in v} for k, v in skill_curves.items()}
last_curves_map = {k: v[-1][1] for k, v in self.skill_curves.items()}

In [254]:
df = []
for _, row in games_ge_40_test_unique_df.iterrows():
    c1, c2 = row['player1'], row['player2']
    if c1 in last_curves_map and c2 in last_curves_map:
        normal_1, normal_2 = last_curves_map[c1], last_curves_map[c2]
        mu_diff = normal_1.mu - normal_2.mu
        sigma2_diff = normal_1.sigma ** 2 + normal_2.sigma ** 2 + 2 * (self.beta_optimal ** 2)
        #use norm.cdf to speed up the prob calculation, P(X > 0) = 1 - P(X ≤ 0)
        c1_win_prob = 1 - norm.cdf(0, mu_diff, sigma2_diff ** .5)
        df.append([c1, c2, c1_win_prob])
df = pd.DataFrame(df, columns=['player1', 'player2', 'player1_win_prob']).dropna()

In [260]:
merged_df = pd.merge(games_ge_40_test_unique_df, df, on=['player1', 'player2']).drop_duplicates()

In [261]:
len(games_ge_40_test_unique_df), len(df), len(merged_df)

(26301, 25062, 25062)

In [262]:
merged_df.roc_label.value_counts()

roc_label
1    12616
0    12446
Name: count, dtype: int64

In [263]:
roc_auc_score(merged_df.roc_label, merged_df.player1_win_prob)

np.float64(0.713066331141527)