In [5]:
import pandas as pd
import numpy as np
import altair as alt
import requests
from bs4 import BeautifulSoup
import json
from sklearn.metrics import roc_auc_score

In [6]:
from true_skill_through_time import *

In [3]:
tennis_data_raw = pd.read_csv("data/tennis_history.csv", low_memory=False)

In [6]:
tennis_data_raw.head()

Unnamed: 0,match_id,double,round_number,w1_id,w1_name,w2_id,w2_name,l1_id,l1_name,l2_id,l2_name,time_start,time_end,ground,tour_id,tour_name
0,580191504sv86bq82,f,4,sv86,bert-st.-john,,,bq82,l.-bonnington,,,1915-08-16,1915-08-21,Hard,580,australasian-championships
1,580191503sv86pj71,f,3,sv86,bert-st.-john,,,pj71,f.-peach,,,1915-08-16,1915-08-21,Hard,580,australasian-championships
2,580191503gh75gi10,f,3,gh75,r.-goodman,,,gi10,m.-graham,,,1915-08-16,1915-08-21,Hard,580,australasian-championships
3,580191503lg52br41,f,3,lg52,f.-lendrum,,,br41,unknown-briggs,,,1915-08-16,1915-08-21,Hard,580,australasian-championships
4,580191503sv16hf23,f,3,sv16,w.h.-smith,,,hf23,g.-highett,,,1915-08-16,1915-08-21,Hard,580,australasian-championships


In [9]:
tennis_data_raw.columns

Index(['match_id', 'double', 'round_number', 'w1_id', 'w1_name', 'w2_id',
       'w2_name', 'l1_id', 'l1_name', 'l2_id', 'l2_name', 'time_start',
       'time_end', 'ground', 'tour_id', 'tour_name'],
      dtype='object')

In [8]:
len(tennis_data_raw)

447028

In [23]:
unique_id_lst = list(set(list(tennis_data_raw.w1_id.unique()) + list(tennis_data_raw.l1_id.unique())))

In [25]:
tennis_data_raw.w1_id.nunique(), tennis_data_raw.l1_id.nunique(), len(unique_id_lst)

(9863, 18559, 18567)

### use single games only, use time_end as game time

In [4]:
tennis_data_df = tennis_data_raw[tennis_data_raw.double=='f'][['match_id', 'w1_id', 'l1_id', 'time_end']].dropna()

In [66]:
tennis_data_df.shape

(326306, 4)

In [49]:
tennis_data_df.head()

Unnamed: 0,match_id,w1_id,l1_id,time_end
0,580191504sv86bq82,sv86,bq82,1915-08-21
1,580191503sv86pj71,sv86,pj71,1915-08-21
2,580191503gh75gi10,gh75,gi10,1915-08-21
3,580191503lg52br41,lg52,br41,1915-08-21
4,580191503sv16hf23,sv16,hf23,1915-08-21


In [5]:
tennis_player_id_map = {**dict(zip(tennis_data_raw.w1_id, tennis_data_raw.w1_name)),
                       **dict(zip(tennis_data_raw.w2_id, tennis_data_raw.w2_name)),
                       **dict(zip(tennis_data_raw.l1_id, tennis_data_raw.l1_name)),
                       **dict(zip(tennis_data_raw.l1_id, tennis_data_raw.l1_name))}

In [11]:
tennis_player_id_inv_map = {v: k for k, v in tennis_player_id_map.items()}

In [6]:
tennis_player_id_map[np.nan]

nan

In [52]:
tennis_player_id_map['sv86']

'bert-st.-john'

In [53]:
len(tennis_player_id_map), len(list(set(list(tennis_player_id_map.keys()))))

(18655, 18655)

In [54]:
verbosed_id_lst = [ele for ele in list(tennis_player_id_map.keys()) if ele not in unique_id_lst]

In [58]:
len(verbosed_id_lst)

87

In [55]:
verbosed_id_lst.remove(np.nan)

In [56]:
tennis_data_raw[tennis_data_raw.w1_id.isin(verbosed_id_lst)]

Unnamed: 0,match_id,double,round_number,w1_id,w1_name,w2_id,w2_name,l1_id,l1_name,l2_id,l2_name,time_start,time_end,ground,tour_id,tour_name


In [57]:
tennis_data_raw[tennis_data_raw.l1_id.isin(verbosed_id_lst)]

Unnamed: 0,match_id,double,round_number,w1_id,w1_name,w2_id,w2_name,l1_id,l1_name,l2_id,l2_name,time_start,time_end,ground,tour_id,tour_name


In [67]:
tennis_data_df.time_end = pd.to_datetime(tennis_data_df.time_end)

In [74]:
tennis_data_df.head()

Unnamed: 0,match_id,winner,loser,timestamp
0,580191504sv86bq82,sv86,bq82,1915-08-21
15,580191500lf73rf87,lf73,rf87,1915-08-21
13,580191501rf87tf06,rf87,tf06,1915-08-21
12,580191502sv86gh75,sv86,gh75,1915-08-21
11,580191502lf73hf22,lf73,hf22,1915-08-21


In [69]:
tennis_data_df = tennis_data_df.rename(columns={'w1_id': 'winner', 'l1_id': 'loser', 'time_end': 'timestamp'})

In [73]:
tennis_data_df = tennis_data_df.sort_values('timestamp')

In [21]:
tennis_data_df.head()

Unnamed: 0,match_id,winner,loser,timestamp,time_0_to_999_int,player1,player2,roc_label
0,580191504sv86bq82,sv86,bq82,1915-08-21,0,bq82,sv86,0
15,580191500lf73rf87,lf73,rf87,1915-08-21,0,lf73,rf87,1
13,580191501rf87tf06,rf87,tf06,1915-08-21,0,rf87,tf06,1
12,580191502sv86gh75,sv86,gh75,1915-08-21,0,gh75,sv86,0
11,580191502lf73hf22,lf73,hf22,1915-08-21,0,hf22,lf73,0


In [22]:
#tennis_data_df[['winner', 'loser', 'match_id', 'timestamp']].to_parquet("data/tennis_matches_refined_tstt.parquet")

In [7]:
#tennis_data_df.to_parquet("data/tennis_matches_refined_tstt.parquet")
tennis_data_df = pd.read_parquet("data/tennis_matches_refined_tstt.parquet")

In [8]:
self = TrueSkillThroughTimeApplied(tennis_data_df)

In [9]:
print(tennis_data_df.head().to_string())

             match_id winner loser  timestamp  time_0_to_999_int
0   580191504sv86bq82   sv86  bq82 1915-08-21                  0
15  580191500lf73rf87   lf73  rf87 1915-08-21                  0
13  580191501rf87tf06   rf87  tf06 1915-08-21                  0
12  580191502sv86gh75   sv86  gh75 1915-08-21                  0
11  580191502lf73hf22   lf73  hf22 1915-08-21                  0


In [10]:
self.learn_optimal_parameters()


gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 211533.7151

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 211533.7142

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 211533.7151

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 211533.7151

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 217065.1003

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 217065.1009

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 217065.1002

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 217065.1002

gamma: 0.0270, sigma: 0.0270, beta: 0.0270, 
NLE: 210587.3144

gamma: 0.0270, sigma: 0.0270, beta: 0.0270, 
NLE: 210587.3137

gamma: 0.0270, sigma: 0.0270, beta: 0.0270, 
NLE: 210587.3144

gamma: 0.0270, sigma: 0.0270, beta: 0.0270, 
NLE: 210587.3144

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 208831.8066

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 208831.8073

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 208831.8066

gamma: 0.0500, sigma: 0.0500, beta: 0.0500, 
NLE: 2088

In [10]:
#NLE: 205119.0011
optimal_gamma = 0.0500
optimal_sigma = 0.5000
optimal_beta = 0.5060
self.set_optimal_parameters(gamma = optimal_gamma, sigma = optimal_sigma, beta = optimal_beta)

In [11]:
skill_curves = self.set_skill_curves()

In [15]:
top_players_names_lst = [
    "Novak Djokovic",
	"Roger Federer",
	"Pete Sampras",
	"Ivan Lendl",
	"Jimmy Connors",
	"Rafael Nadal",
	"John McEnroe",
	"Bj\"orn Borg",
	"Andre Agassi",
	"Lleyton Hewitt",
	"Stefan Edberg",
	"Jim Courier",
	"Gustavo Kuerten",
	"Andy Murray",
    "Ilie N\u{a}stase",
    "Mats Wilander"
]

In [19]:
top_players_lst = [
    "novak-djokovic",
    "roger-federer",
    "pete-sampras",
    "ivan-lendl",
    "jimmy-connors",
    "rafael-nadal",
    "john-mcenroe",
    "bjorn-borg",
    "andre-agassi",
    "lleyton-hewitt",
    "stefan-edberg",
    "jim-courier",
    "gustavo-kuerten",
    "andy-murray",
    "ilie-nastase",
    "mats-wilander"
]

In [20]:
top_players_id_lst = [
    tennis_player_id_inv_map[player] for player in top_players_lst
]

In [28]:
tennis_player_id_map['c243'], tennis_player_id_map['n409'], tennis_player_id_map['s402'], tennis_player_id_map['d643']

('jim-courier', 'rafael-nadal', 'pete-sampras', 'novak-djokovic')

In [23]:
top_players_map = {
   'd643': "novak-djokovic",
   'f324': "roger-federer",
   's402': "pete-sampras",
   'l018': "ivan-lendl",
   'c044': "jimmy-connors",
   'n409': "rafael-nadal",
   'm047': "john-mcenroe",
   'b058': "bjorn-borg",
   'a092': "andre-agassi",
   'h432': "lleyton-hewitt",
   'e004': "stefan-edberg",
   'c243': "jim-courier",
   'k293': "gustavo-kuerten",
   'mc10': "andy-murray",
   'n008': "ilie-nastase",
   'w023': "mats-wilander"
}


In [24]:
top_players_id_lst

['d643',
 'f324',
 's402',
 'l018',
 'c044',
 'n409',
 'm047',
 'b058',
 'a092',
 'h432',
 'e004',
 'c243',
 'k293',
 'mc10',
 'n008',
 'w023']

In [22]:
self.plot_player_skills(players = top_players_id_lst, width=1500, burnin=0)

### GOAT is d643 novak-djokovic; f324 roger-federer, n409 rafael-nadal share the runner-up in the recent days 

In [23]:
self.plot_calibration()

  bucket_means = df.groupby('win_prob_bucket').agg(


In [12]:
def assign_players(row):
    p1, p2 = sorted([row['winner'], row['loser']])
    return pd.Series([p1, p2], index=['player1', 'player2'])

In [13]:
tennis_data_df[['player1', 'player2']] = tennis_data_df.apply(assign_players, axis=1)

In [14]:
tennis_data_df['roc_label'] = tennis_data_df.apply(lambda row: row.winner < row.loser, axis=1).astype(int)

In [15]:
curves_map = {k: {t: n for t, n in v} for k, v in skill_curves.items()}

In [16]:
df = []
for _, row in tennis_data_df.iterrows():
    c1, c2, t_int = row['player1'], row['player2'], row['time_0_to_999_int']
    if c1 in curves_map and c2 in curves_map:
        if t_int in curves_map[c1] and t_int in curves_map[c2]:
            normal_1, normal_2 = curves_map[c1][t_int], curves_map[c2][t_int]
            mu_diff = normal_1.mu - normal_2.mu
            sigma2_diff = normal_1.sigma ** 2 + normal_2.sigma ** 2 + 2 * (self.beta_optimal ** 2)
            c1_win_prob = 1 - norm.cdf(0, mu_diff, sigma2_diff ** .5)
            df.append([c1, c2, t_int, c1_win_prob])
df = pd.DataFrame(df, columns=['player1', 'player2', 'time_0_to_999_int', 'player1_win_prob']).dropna()

In [17]:
merged_df = pd.merge(tennis_data_df, df, on=['player1', 'player2', 'time_0_to_999_int'])

In [18]:
merged_df.roc_label.value_counts()

roc_label
1    170728
0    166768
Name: count, dtype: int64

In [19]:
roc_auc_score(merged_df.roc_label, merged_df.player1_win_prob)

np.float64(0.8135042644716313)