In [4]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd

# https://www.kaggle.com/datasets/dissfya/atp-tennis-2000-2023daily-pull
dataset_path = "./../data/atp_tennis.csv"
data_raw = pd.read_csv(dataset_path)
print(data_raw.columns)
print(data_raw)

Index(['Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of',
       'Player_1', 'Player_2', 'Winner', 'Rank_1', 'Rank_2', 'Pts_1', 'Pts_2',
       'Odd_1', 'Odd_2', 'Score'],
      dtype='object')
                               Tournament        Date         Series    Court  \
0      Australian Hardcourt Championships  2000-01-03  International  Outdoor   
1      Australian Hardcourt Championships  2000-01-03  International  Outdoor   
2      Australian Hardcourt Championships  2000-01-03  International  Outdoor   
3      Australian Hardcourt Championships  2000-01-03  International  Outdoor   
4      Australian Hardcourt Championships  2000-01-03  International  Outdoor   
...                                   ...         ...            ...      ...   
64685                          Chile Open  2025-02-28         ATP250   Indoor   
64686                          Chile Open  2025-03-01         ATP250   Indoor   
64687                          Chile Open  2025-03-01   

In [5]:
# all players

data_players = pd.DataFrame(pd.concat([data_raw['Player_1'], data_raw['Player_2']]).unique(), columns=['PlayerName'])
print(data_players.head())
data_players.to_csv('./../data/data_players.csv', index=True)

       PlayerName
0      Dosedel S.
1      Clement A.
2       Escude N.
3  Knippschild J.
4     Fromberg R.


In [6]:
# ELO calculation
def_elo = 1500


def expected_score(rating1, rating2):
        return 1 / (1 + 10 ** ((rating2 - rating1) / 400))
    
def calculate_elo(matches: pd.DataFrame, k = 32) -> pd.DataFrame:
    players = pd.concat([matches['Player_1'], matches['Player_2']]).unique()
    elo_ratings = {player: def_elo for player in players}
    for index, row in matches.iterrows():
        player1 = row['Player_1']
        player2 = row['Player_2']
        winner = row['Winner']
        
        rating1 = elo_ratings[player1]
        rating2 = elo_ratings[player2]
        
        expected1 = expected_score(rating1, rating2)
        expected2 = expected_score(rating2, rating1)
        
        if winner == player1:
            elo_ratings[player1] = rating1 + k * (1 - expected1)
            elo_ratings[player2] = rating2 + k * (0 - expected2)
        else:
            elo_ratings[player1] = rating1 + k * (0 - expected1)
            elo_ratings[player2] = rating2 + k * (1 - expected2)

    data = pd.DataFrame(list(elo_ratings.items()), columns=['PlayerName', 'Elo'])
    data = data.sort_values(by='PlayerName')
    data.reset_index(inplace=True, drop=True)
    return data


elos = []
surfaces = data_raw['Surface'].unique()
courts = data_raw['Court'].unique()
for court in courts:
    for surface in surfaces:
        data = data_raw[(data_raw['Court'] == court) & (data_raw['Surface'] == surface)]
        elo = calculate_elo(data)
        elos.append([court + surface, elo])
        
elos.append(['All', calculate_elo(data_raw)])
        
data_elo = elos[0][1]['PlayerName']

for row in elos:
    col_name = row[0]
    elo = row[1]
    data_elo = pd.merge(data_elo, elo, on='PlayerName', how='left')
    data_elo.rename(columns={'Elo': 'Elo' + col_name}, inplace=True)

data_elo.fillna(def_elo, inplace=True)

print(data_elo.head())
data_elo.to_csv('./../data/data_elo.csv', index=True)


      PlayerName  EloOutdoorHard  EloOutdoorClay  EloOutdoorGrass  \
0     Abdulla M.     1482.525924     1500.000000      1500.000000   
1        Abel M.     1448.144202     1486.483151      1489.389761   
2     Acasuso J.     1566.229343     1677.978440      1395.062325   
3  Adaktusson J.     1485.669962     1500.000000      1500.000000   
4       Agaev E.     1485.477824     1500.000000      1500.000000   

   EloOutdoorCarpet  EloIndoorHard  EloIndoorClay  EloIndoorGrass  \
0              1500    1500.000000         1500.0            1500   
1              1500    1500.000000         1500.0            1500   
2              1500    1425.177723         1500.0            1500   
3              1500    1500.000000         1500.0            1500   
4              1500    1500.000000         1500.0            1500   

   EloIndoorCarpet       EloAll  
0      1500.000000  1489.157560  
1      1500.000000  1444.498166  
2      1444.174528  1659.432095  
3      1500.000000  1488.944743  


  data_elo.fillna(def_elo, inplace=True)


In [13]:
# # HEAD to HEAD

head_to_head_dict = {}

for index, row in data_raw.iterrows():
    player1 = row['Player_1']
    player2 = row['Player_2']
    winner = row['Winner']
    if player1 not in head_to_head_dict:
        head_to_head_dict[player1] = {}
    if player2 not in head_to_head_dict:
        head_to_head_dict[player2] = {}
    if player2 not in head_to_head_dict[player1]:
        head_to_head_dict[player1][player2] = 0
    if player1 not in head_to_head_dict[player2]:
        head_to_head_dict[player2][player1] = 0
    if winner == player1:
        head_to_head_dict[player1][player2] += 1
    else:
        head_to_head_dict[player2][player1] += 1
        


data_head_to_head = pd.DataFrame(0, index=data_players['PlayerName'], columns=data_players['PlayerName'])

for index, row in data_head_to_head.iterrows():
    for column in data_head_to_head.columns:
        if index == column:
            continue
        else:
            if index in head_to_head_dict and column in head_to_head_dict[index]:
                data_head_to_head.at[index, column] = head_to_head_dict[index][column]
            
print(data_head_to_head.head())
data_head_to_head.to_csv('./../data/data_head_to_head.csv', index=True)

PlayerName      Dosedel S.  Clement A.  Escude N.  Knippschild J.  \
PlayerName                                                          
Dosedel S.               0           0          1               0   
Clement A.               0           0          1               1   
Escude N.                2           1          0               0   
Knippschild J.           0           0          0               0   
Fromberg R.              0           1          0               0   

PlayerName      Fromberg R.  Arthurs W.  Grosjean S.  Balcells J.  Hewitt L.  \
PlayerName                                                                     
Dosedel S.                0           0            0            0          0   
Clement A.                0           2            3            1          1   
Escude N.                 0           2            2            1          1   
Knippschild J.            0           0            0            0          0   
Fromberg R.               0         