In [1]:
import pandas as pd
import numpy as np

In [7]:
DATA_PATH = "data/atp_matches_"

In [4]:
atp_2022_df = pd.read_csv(DATA_PATH + '2022.csv')

## CLEANING DATAFRAME

! For BTL model !
Removing players with less than N games played

In [58]:
def clean_df(df):

    # SORT BY "TOURNEY DATE"
    
    pruned_df = prune(df)

    cleaned_df = pruned_df
    return cleaned_df

def prune(df):
    v_w = atp_2022_df['winner_id'].value_counts()
    v_l = atp_2022_df['loser_id'].value_counts()

    v_total = pd.merge(v_w, v_l, left_index=True, right_index=True, how="outer").fillna(0).sum(axis=1)

    pruned_df = df.loc[ (df['winner_id'].isin(v_total.index[v_total.gt(4)])) 
                        & (df['loser_id'].isin(v_total.index[v_total.gt(4)]))]

    return pruned_df

In [46]:
v_w = atp_2022_df['winner_id'].value_counts()# .to_frame()
v_l = atp_2022_df['loser_id'].value_counts()# .to_frame()

# merged_df = v_w.join(v_l, lsuffix="left", rsuffix="right", how="outer")
v_total = pd.merge(v_w, v_l, left_index=True, right_index=True, how="outer").fillna(0).sum(axis=1)

In [59]:
clean_2022_df = clean_df(atp_2022_df)

## Compiling complete player index

In [146]:
def get_players(df: pd.DataFrame, base_rating: np.float32) -> pd.DataFrame:
    winner_ids = df[['winner_id', 'winner_name']].drop_duplicates().rename(columns={'winner_id': 'id', 'winner_name': 'name'})
    loser_ids = df[['loser_id', 'loser_name']].drop_duplicates().rename(columns={'loser_id': 'id', 'loser_name': 'name'})
    
    player_ids = pd.concat([winner_ids, loser_ids]).set_index('id')

    # For some reason concat doesn't merge correctly so dupes need to be dropped manually
    player_ids.drop_duplicates(inplace=True)

    # Initializing rating column with base rating
    player_ids['rating'] = base_rating
    # Initializing games-played column with 0
    player_ids['games'] = 0

    return player_ids

In [131]:
# FOR BTL MODEL
# 1200 OR 1500?
# player_info = get_players(clean_2022_df, 1200)
# player_info

## ELO MODEL

In [161]:

def K_factor(rating: np.float32) -> np.float32:
    K = 32

    if rating > 2100 and rating < 2400:
        K = 24
    elif rating > 2400:
        K = 16
    
    return K


def build_elo(player_df: pd.DataFrame, training_data: pd.DataFrame) -> pd.DataFrame:

    def get_score_info(match_df: pd.DataFrame) -> dict:
        score = match_df['score']
        bestof = match_df['best_of']

        if "RET" in score:
            # print("RET found")
            # THROW ERROR ?
            return None
        
        rounds_played = len(score.split())
        
        winner_score = np.ceil(bestof / 2)
        loser_score = rounds_played - winner_score

        score_info = {
            'Winner': winner_score,
            'Loser': loser_score,
            'Total': bestof
        }

        return score_info

    def calc_dElo(r_a: np.float32, r_b: np.float32, score: dict) -> tuple[np.float32, np.float32]:

        E_a = get_EV(r_a, r_b)
        E_b = 1 - E_a

        S_a = score['Winner'] / score['Total']
        S_b = 1 - S_a

        d_a = K_factor(r_a) * (S_a - E_a)
        d_b = K_factor(r_b) * (S_b - E_b)

        # r_a += d_a
        # r_b += d_b

        return (d_a, d_b)

    def get_EV(r_a: np.float32, r_b: np.float32, base=10, D=400) -> np.float32:
        # Implements Elo formula and returns expected score for winner
        Q_a = base ** (r_a / D)
        Q_b = base ** (r_b / D)

        E_a = Q_a / (Q_a + Q_b)

        return E_a
    

    RET_COUNT = 0
    # SORT ROWS BY "TOURNEY_DATE"
    for idx, row in training_data.iterrows():
        winner_id, loser_id = row['winner_id'], row['loser_id']

        match_df = row[['score', 'best_of']]
        score_info = get_score_info(match_df)

        # IF RET -> SKIP ROW
        if not score_info:
            RET_COUNT += 1
            continue

        # Getting current ratings
        r_winner, r_loser = player_df.at[winner_id, 'rating'], player_df.at[loser_id, 'rating']

        # Calculating change in Elo rating
        d_winner, d_loser = calc_dElo(r_winner, r_loser, score_info)


        # Updating Elo rating
        player_df.at[winner_id, 'rating'] += d_winner
        player_df.at[loser_id, 'rating'] += d_loser

        # Updating games played
        player_df.at[winner_id, 'games'] += 1
        player_df.at[winner_id, 'games'] += 1

    return player_df

In [162]:
# Initializing player-rating df
# 1200 OR 1500 ?
player_info = get_players(atp_2022_df, 1200.0)

In [163]:
player_info

Unnamed: 0_level_0,name,rating,games
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200000,Felix Auger Aliassime,1200.0,0
133430,Denis Shapovalov,1200.0,0
105138,Roberto Bautista Agut,1200.0,0
105807,Pablo Carreno Busta,1200.0,0
106421,Daniil Medvedev,1200.0,0
...,...,...,...
105747,Karim Mohamed Maamoun,1200.0,0
210308,Stylianos Christodoulou,1200.0,0
105278,Alexis Klegou,1200.0,0
200583,Delmas Ntcha,1200.0,0


In [164]:
updated_player_df = build_elo(player_info, atp_2022_df)

In [166]:
updated_player_df.sort_values(by=['rating'], ascending=False)

Unnamed: 0_level_0,name,rating,games
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
104925,Novak Djokovic,1308.371850,84
200000,Felix Auger Aliassime,1282.372158,120
207989,Carlos Alcaraz,1276.588289,112
208029,Holger Rune,1273.344850,72
106401,Nick Kyrgios,1268.121729,78
...,...,...,...
127157,Daniel Altmaier,1158.695641,10
200615,Alexei Popyrin,1157.315238,8
103852,Feliciano Lopez,1157.041454,0
105332,Benoit Paire,1154.906361,8


## Using elo_model Class

In [1]:
import pandas as pd
import numpy as np

from elo_model import EloModel
from data_forms import TrainingData
from heuristics import probabilistic_elo_init as init_heur1

DATA_PATH = "data/atp_matches_"
atp_2022_df = pd.read_csv(DATA_PATH + '2022.csv')
atp_2023_df = pd.read_csv(DATA_PATH + '2023.csv')

matches_22to23 = TrainingData([atp_2022_df, atp_2023_df])
matches_22to23.partition_data(1)

In [2]:
vanilla_elo = EloModel(matches_22to23, 1500.0)
heurInit_elo = EloModel(matches_22to23, 1500.0, init_heuristics = init_heur1)

In [3]:
heurInit_elo.ratings.sort_values('rating', ascending=False)

Unnamed: 0_level_0,name,rating,games
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
104925,Novak Djokovic,1801.331067,0
207989,Carlos Alcaraz,1801.331067,0
104745,Rafael Nadal,1713.441044,0
106421,Daniil Medvedev,1704.291942,0
106401,Nick Kyrgios,1695.246656,0
...,...,...,...
103852,Feliciano Lopez,1220.411998,0
106378,Kyle Edmund,1220.411998,0
126340,Viktor Durasovic,1198.668933,0
105332,Benoit Paire,1198.668933,0


In [4]:
vanilla_elo.update_elo(matches_22to23.partitioned_data['Training'])
vanilla_elo.ratings.sort_values(by=['rating'], ascending=False)

Unnamed: 0_level_0,name,rating,games
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
207989,Carlos Alcaraz,1610.197169,121
104925,Novak Djokovic,1599.859224,87
106421,Daniil Medvedev,1589.004201,120
206173,Jannik Sinner,1587.216994,114
208029,Holger Rune,1575.125314,111
...,...,...,...
202358,Chun Hsin Tseng,1463.560861,17
124079,Pedro Martinez,1458.055640,59
104269,Fernando Verdasco,1456.018112,21
105967,Henri Laaksonen,1453.819137,17


In [5]:
heurInit_elo.update_elo(matches_22to23.data)
heurInit_elo.ratings.sort_values(by=['rating'], ascending=False)

Unnamed: 0_level_0,name,rating,games
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
104925,Novak Djokovic,1614.869077,91
207989,Carlos Alcaraz,1602.491120,126
106401,Nick Kyrgios,1588.335154,52
106421,Daniil Medvedev,1580.100703,122
104745,Rafael Nadal,1575.081183,52
...,...,...,...
105613,Norbert Gombos,1311.001424,11
106378,Kyle Edmund,1298.861091,12
105967,Henri Laaksonen,1298.350876,17
105062,Mikhail Kukushkin,1273.690609,11


In [5]:
def get_players(matches: pd.DataFrame):
    winner_ids = matches[['winner_id', 'winner_name']].drop_duplicates().rename(columns={'winner_id': 'id', 'winner_name': 'name'})
    loser_ids = matches[['loser_id', 'loser_name']].drop_duplicates().rename(columns={'loser_id': 'id', 'loser_name': 'name'})
    
    players = pd.concat([winner_ids, loser_ids]).drop_duplicates().set_index('id')

    return players



training_df = matches_22to23.partitioned_data['Training']
testing_df = matches_22to23.partitioned_data['Testing']
training_players = get_players(training_df)
testing_players = get_players(testing_df)

In [2]:
from elo_model import evaluate_EloModel

In [3]:
heurInit_eval = evaluate_EloModel(matches_22to23, init_heuristics = init_heur1)
heurInit_eval

{'N': 55,
 'CE': 0.6596622237202288,
 'Accuracy': 0.6545454545454545,
 'BS': 0.23349345317196832}

In [4]:
vanilla_eval = evaluate_EloModel(matches_22to23)
vanilla_eval

{'N': 55,
 'CE': 0.6599451564243387,
 'Accuracy': 0.6727272727272727,
 'BS': 0.2336053129324581}

In [9]:
from sklearn.metrics import log_loss

winners_OHE = np.tile([1, 0], (55, 1))
random_preds = np.tile([0.5, 0.5], (55, 1))

random_CE = log_loss(winners_OHE, random_preds)
print(random_CE)

0.6931471805599455


In [10]:
def atp_points_winner(matches: pd.DataFrame):
    N = matches.shape[0]
    predicted_wins = matches[matches['winner_rank_points'] > matches['loser_rank_points']].shape[0]

    return predicted_wins / N

atp_points_winner(matches_22to23.partitioned_data['Testing'])

0.6909090909090909

In [3]:
elo_rankings = EloModel(matches_22to23.data, 1500.0)

In [23]:
master_df = matches_22to23.data

In [4]:
elo_rankings.update_elo(matches_22to23.data)
elo_rankings.ratings.sort_values(by=['rating'], ascending=False)

Unnamed: 0_level_0,name,rating,games
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
104925,Novak Djokovic,1615.337511,158
207989,Carlos Alcaraz,1610.483807,216
106421,Daniil Medvedev,1586.839918,186
100644,Alexander Zverev,1581.374282,130
206173,Jannik Sinner,1578.832783,176
...,...,...,...
202358,Chun Hsin Tseng,1463.560861,4
124079,Pedro Martinez,1458.055640,44
104269,Fernando Verdasco,1456.018112,10
105967,Henri Laaksonen,1453.819137,4


In [8]:
master_df = matches_22to23.data
# master_df.groupby(['tourney_id'])['tourney_id']
master_df['tourney_id'].unique()[-3: -1]

array(['2023-0421', '2023-0422'], dtype=object)