In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import shutil
import random

# Fetch the current snapshot of the Kaggle dataset and mirror it into the project data folder.
destination_path = "./../data"
source_path = kagglehub.dataset_download("dissfya/atp-tennis-2000-2023daily-pull")
shutil.copytree(source_path, destination_path, dirs_exist_ok=True)

# Load the raw match table from the mirrored copy.
data_raw = pd.read_csv(f"{destination_path}/atp_tennis.csv")

# read_csv is not asked to parse dates here, so max() works on the column as stored in the CSV.
last_day = data_raw['Date'].max()
print(f'Last day in the dataset: {last_day}')

# Keep an unmodified copy of the raw table alongside the derived tables, then preview it.
data_raw.to_csv("./../data/data_raw.csv", index=False)
for preview in (data_raw.head(), data_raw.dtypes):
    print(preview)


Last day in the dataset: 2025-03-16
Processing row 0 of 64781
Processing row 1000 of 64781
Processing row 2000 of 64781
Processing row 3000 of 64781
Processing row 4000 of 64781
Processing row 5000 of 64781
Processing row 6000 of 64781
Processing row 7000 of 64781
Processing row 8000 of 64781
Processing row 9000 of 64781
Processing row 10000 of 64781
Processing row 11000 of 64781
Processing row 12000 of 64781
Processing row 13000 of 64781
Processing row 14000 of 64781
Processing row 15000 of 64781
Processing row 16000 of 64781
Processing row 17000 of 64781
Processing row 18000 of 64781
Processing row 19000 of 64781
Processing row 20000 of 64781
Processing row 21000 of 64781
Processing row 22000 of 64781
Processing row 23000 of 64781
Processing row 24000 of 64781
Processing row 25000 of 64781
Processing row 26000 of 64781
Processing row 27000 of 64781
Processing row 28000 of 64781
Processing row 29000 of 64781
Processing row 30000 of 64781
Processing row 31000 of 64781
Processing row 32

In [5]:
# all players

# Distinct names found in either player column; unique() keeps first-appearance order
# over the concatenated Player_1-then-Player_2 sequence.
all_player_names = pd.concat([data_raw['Player_1'], data_raw['Player_2']]).unique()
data_players = pd.DataFrame(all_player_names, columns=['PlayerName'])
print(data_players.head())
data_players.to_csv('./../data/data_players.csv', index=True)

       PlayerName
0      Dosedel S.
1      Clement A.
2       Escude N.
3  Knippschild J.
4     Fromberg R.


In [None]:
# ELO calculation
def_elo = 1500  # rating assigned to every player before any match is replayed


def expected_score(rating1, rating2):
    """Return the Elo expected score of a player rated ``rating1`` facing ``rating2``.

    Logistic Elo curve on a 400-point scale: 0.5 for equal ratings, tending
    towards 1 as ``rating1`` grows relative to ``rating2``.
    """
    rating_gap = rating2 - rating1
    return 1 / (1 + 10 ** (rating_gap / 400))
    
def calculate_elo(matches: pd.DataFrame, k = 32) -> pd.DataFrame:
    players = pd.concat([matches['Player_1'], matches['Player_2']]).unique()
    elo_ratings = {player: def_elo for player in players}
    for index, row in matches.iterrows():
        player1 = row['Player_1']
        player2 = row['Player_2']
        winner = row['Winner']
        
        rating1 = elo_ratings[player1]
        rating2 = elo_ratings[player2]
        
        expected1 = expected_score(rating1, rating2)
        expected2 = expected_score(rating2, rating1)
        
        if winner == player1:
            elo_ratings[player1] = rating1 + k * (1 - expected1)
            elo_ratings[player2] = rating2 + k * (0 - expected2)
        else:
            elo_ratings[player1] = rating1 + k * (0 - expected1)
            elo_ratings[player2] = rating2 + k * (1 - expected2)

    data = pd.DataFrame(list(elo_ratings.items()), columns=['PlayerName', 'Elo'])
    data = data.sort_values(by='PlayerName')
    data.reset_index(inplace=True, drop=True)
    return data


# Per-condition Elo tables: one per (court, surface) pair, plus an overall 'All' table.
# Each entry is [column label, DataFrame returned by calculate_elo].
elos = []
surfaces = data_raw['Surface'].unique()
courts = data_raw['Court'].unique()
for court in courts:
    for surface in surfaces:
        data = data_raw[(data_raw['Court'] == court) & (data_raw['Surface'] == surface)]
        elo = calculate_elo(data)
        elos.append([court + surface, elo])

elos.append(['All', calculate_elo(data_raw)])

# Use the overall table as the merge base: it is computed from every match, so it
# lists every player. Basing the merge on elos[0] would keep only the players of the
# first court/surface subset, because the left merges below never add rows.
data_elo = elos[-1][1][['PlayerName']]

for col_name, elo in elos:
    # Rename before merging so each table contributes its own 'Elo<label>' column.
    labelled_elo = elo.rename(columns={'Elo': 'Elo' + col_name})
    data_elo = pd.merge(data_elo, labelled_elo, on='PlayerName', how='left')

# A player with no matches under a given condition keeps the starting rating.
data_elo = data_elo.fillna(def_elo)

print(data_elo.head())
data_elo.to_csv('./../data/data_elo.csv', index=True)


In [7]:
# # HEAD to HEAD

# head_to_head_dict[a][b] = number of matches credited to `a` against `b`.
head_to_head_dict = {}

# Iterate over the columns directly; DataFrame.iterrows() builds a Series per row.
for player1, player2, winner in zip(data_raw['Player_1'], data_raw['Player_2'], data_raw['Winner']):
    # Register the pairing in both directions so the winless side still has an explicit 0.
    head_to_head_dict.setdefault(player1, {}).setdefault(player2, 0)
    head_to_head_dict.setdefault(player2, {}).setdefault(player1, 0)
    # Any row whose Winner is not Player_1 is credited to Player_2.
    if winner == player1:
        head_to_head_dict[player1][player2] += 1
    else:
        head_to_head_dict[player2][player1] += 1

# Square matrix of wins: row = credited player, column = opponent, 0 where they never met.
data_head_to_head = pd.DataFrame(0, index=data_players['PlayerName'], columns=data_players['PlayerName'])

# Write only the pairings that actually occurred instead of probing every
# player x player cell; the rest of the matrix is already 0.
known_players = set(data_players['PlayerName'])
for player, opponents in head_to_head_dict.items():
    if player not in known_players:
        continue
    for opponent, wins in opponents.items():
        # The diagonal is left at 0, and unknown labels are skipped so .at never enlarges the frame.
        if wins and opponent != player and opponent in known_players:
            data_head_to_head.at[player, opponent] = wins

print(data_head_to_head.head())
data_head_to_head.to_csv('./../data/data_head_to_head.csv', index=True)

PlayerName      Dosedel S.  Clement A.  Escude N.  Knippschild J.  \
PlayerName                                                          
Dosedel S.               0           0          1               0   
Clement A.               0           0          1               1   
Escude N.                2           1          0               0   
Knippschild J.           0           0          0               0   
Fromberg R.              0           1          0               0   

PlayerName      Fromberg R.  Arthurs W.  Grosjean S.  Balcells J.  Hewitt L.  \
PlayerName                                                                     
Dosedel S.                0           0            0            0          0   
Clement A.                0           2            3            1          1   
Escude N.                 0           2            2            1          1   
Knippschild J.            0           0            0            0          0   
Fromberg R.               0         

In [8]:
# matches in last 7 and 30 days
# Parse the dates in place; unparseable values become NaT and are excluded below.
data_raw['Date'] = pd.to_datetime(data_raw['Date'], errors='coerce')

# NOTE(review): both windows are measured from the wall-clock time of the run, not
# from the newest date in the dataset, so the counts depend on when this cell is executed.
today = pd.Timestamp.today()
data_tmp = data_raw.dropna(subset=['Date'])
data_tmp30 = data_tmp[today - data_tmp['Date'] <= pd.Timedelta(days=30)]
data_tmp7 = data_tmp30[today - data_tmp30['Date'] <= pd.Timedelta(days=7)]


def _appearances_per_player(matches):
    """Return {player name: number of times the name occurs in Player_1 or Player_2}."""
    names = pd.concat([matches['Player_1'], matches['Player_2']])
    return names.value_counts().to_dict()


# One shared helper replaces the two copy-pasted row-by-row counting loops.
last_30 = _appearances_per_player(data_tmp30)
last_7 = _appearances_per_player(data_tmp7)

# Players without a match in the window get 0.
data_last_matches = pd.DataFrame({
    'PlayerName': data_players['PlayerName'],
    'Last30': [last_30.get(player, 0) for player in data_players['PlayerName']],
    'Last7': [last_7.get(player, 0) for player in data_players['PlayerName']]
})

print(data_last_matches.head())
data_last_matches.to_csv('./../data/data_last_matches.csv', index=True)

       PlayerName  Last30  Last7
0      Dosedel S.       0      0
1      Clement A.       0      0
2       Escude N.       0      0
3  Knippschild J.       0      0
4     Fromberg R.       0      0


In [9]:
# odds

# Matches dated within 365 days of `today`; rows with an unparsed (NaT) date compare False.
recent_mask = today - data_raw['Date'] <= pd.Timedelta(days=365)
df = data_raw.loc[recent_mask, ['Player_1', 'Player_2', 'Winner', 'Odd_1', 'Odd_2']].copy()

df['Odd_1'] = pd.to_numeric(df['Odd_1'], errors='coerce')
df['Odd_2'] = pd.to_numeric(df['Odd_2'], errors='coerce')
df = df.dropna(subset=['Odd_1', 'Odd_2'])
# Drop non-positive odds as well: 1 / odd would be infinite or negative there,
# which is not a probability and would skew the per-player average.
df = df[(df['Odd_1'] > 0) & (df['Odd_2'] > 0)].copy()

# Implied probability of each side winning, taken as the reciprocal of its odd.
df["Imp_Prob_1"] = 1 / df["Odd_1"]
df["Imp_Prob_2"] = 1 / df["Odd_2"]

# Reshape to one row per (match, player); 'Role' records which column the name came from.
df_long = pd.melt(df, id_vars=["Winner", "Imp_Prob_1", "Imp_Prob_2"],
                  value_vars=["Player_1", "Player_2"],
                  var_name="Role", value_name="PlayerName")

df_long["Win"] = df_long["Winner"] == df_long["PlayerName"]
# Pick the probability that belongs to this row's player (vectorised, no per-row loop).
df_long["Prob"] = df_long["Imp_Prob_1"].where(df_long["Role"] == "Player_1", df_long["Imp_Prob_2"])

df_long = df_long[['PlayerName', 'Prob', 'Win']]

# Per player: mean implied win probability and number of matches with usable odds.
data_player_odds = df_long.groupby('PlayerName').agg(
    AvgProb=('Prob', 'mean'),
    MatchesPlayed=('Win', 'count'),
).reset_index()

print(data_player_odds)
data_player_odds.to_csv('./../data/data_player_odds.csv', index=True)


       PlayerName   AvgProb  MatchesPlayed
0    Ajdukovic D.  0.364719              8
1        Albot R.  0.347389              7
2      Alcaraz C.  0.875601             58
3     Altmaier D.  0.408571             34
4    Andreozzi G.  0.285714              1
..            ...       ...            ...
287   Zeppieri G.  0.586486              4
288     Zhang Zh.  0.484629             43
289       Zhou Y.  0.151282              3
290   Zhukayev B.  0.348148              3
291     Zverev A.  0.798387             79

[292 rows x 3 columns]
