In [48]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import shutil
import random

# Fetch the ATP match archive from Kaggle and mirror it into the local data folder.
source_path = kagglehub.dataset_download("dissfya/atp-tennis-2000-2023daily-pull")
destination_path = "./../data"
shutil.copytree(source_path, destination_path, dirs_exist_ok=True)
data_raw = pd.read_csv(destination_path + '/atp_tennis.csv')

# Alternative data source (WTA); swap with the block above to use it.
# source_path = kagglehub.dataset_download("dissfya/wta-tennis-2007-2023-daily-update")
# destination_path = "./../data"
# shutil.copytree(source_path, destination_path, dirs_exist_ok=True)
# data_raw = pd.read_csv(destination_path + '/wta.csv')

# 'Date' is still text at this point; max() is therefore a string maximum.
last_day = data_raw['Date'].max()
print(f'Last day in the dataset: {last_day}')
data_raw.to_csv("./../data/data_raw.csv", index=False)

#!
# Cut-off day: rows on/after it are held out for testing, rows before it are used for training.
today_date = '2025-01-01'
# today_date = last_day
on_or_after_cutoff = data_raw['Date'] >= today_date
data_for_prediction = data_raw.loc[on_or_after_cutoff]
data_for_prediction.to_csv("./../data/data_for_testing.csv", index=False)

#!
# From here on data_raw only holds matches strictly before the cut-off.
before_cutoff = data_raw['Date'] < today_date
data_raw = data_raw.loc[before_cutoff]
data_raw.to_csv("./../data/data_for_training.csv", index=False)

print(data_raw.head())
print(data_raw.shape[0])


Last day in the dataset: 2025-03-16
                           Tournament        Date         Series    Court  \
0  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
1  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
2  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
3  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
4  Australian Hardcourt Championships  2000-01-03  International  Outdoor   

  Surface      Round  Best of        Player_1       Player_2       Winner  \
0    Hard  1st Round        3      Dosedel S.    Ljubicic I.   Dosedel S.   
1    Hard  1st Round        3      Clement A.     Enqvist T.   Enqvist T.   
2    Hard  1st Round        3       Escude N.  Baccanello P.    Escude N.   
3    Hard  1st Round        3  Knippschild J.     Federer R.   Federer R.   
4    Hard  1st Round        3     Fromberg R.  Woodbridge T.  Fromberg R.   

   Rank_1  Rank_2  Pts_1  Pts_2  Odd_1

In [49]:
# all players

# Distinct names taken from the Player_1 column followed by the Player_2 column.
player_names = pd.concat([data_raw['Player_1'], data_raw['Player_2']]).unique()
data_players = pd.DataFrame({'PlayerName': player_names})
print(data_players.head())
data_players.to_csv('./../data/data_players.csv', index=True)

       PlayerName
0      Dosedel S.
1      Clement A.
2       Escude N.
3  Knippschild J.
4     Fromberg R.


In [50]:
# ELO calculation
# Rating every player starts from in calculate_elo; the same value is used
# further down to fill in players that have no rating for a given court/surface.
def_elo = 1500


def expected_score(rating1, rating2):
    """Return the expected score of a player rated ``rating1`` against ``rating2``.

    Uses the logistic Elo curve ``1 / (1 + 10 ** ((rating2 - rating1) / 400))``,
    which yields 0.5 for equal ratings and grows towards 1 as ``rating1``
    exceeds ``rating2``.
    """
    scaled_gap = (rating2 - rating1) / 400
    return 1 / (1 + 10 ** scaled_gap)
    
def calculate_elo(matches: pd.DataFrame, k = 32) -> pd.DataFrame:
    """Compute one Elo rating per player from a table of match results.

    Every player appearing in ``Player_1`` or ``Player_2`` starts at ``def_elo``.
    Matches are replayed in the row order of ``matches`` (no sorting is done
    here, so the caller decides the order), and after each match both ratings
    move by ``k * (actual_score - expected_score)``.

    Rows whose ``Winner`` is neither ``Player_1`` nor ``Player_2`` are skipped:
    the result cannot be attributed, so no rating is changed.

    Args:
        matches: Frame with ``Player_1``, ``Player_2`` and ``Winner`` columns.
        k: Elo K-factor, i.e. the largest possible rating change per match.

    Returns:
        DataFrame with columns ``PlayerName`` and ``Elo``, sorted by
        ``PlayerName`` and re-indexed from 0.
    """
    players = pd.concat([matches['Player_1'], matches['Player_2']]).unique()
    elo_ratings = {player: def_elo for player in players}

    # Walking the three columns in lockstep avoids materialising a Series per
    # row, which is the main cost of DataFrame.iterrows().
    for player1, player2, winner in zip(matches['Player_1'], matches['Player_2'], matches['Winner']):
        if winner == player1:
            score1 = 1
        elif winner == player2:
            score1 = 0
        else:
            # Unattributable result: do not silently award the win to Player_2.
            continue

        rating1 = elo_ratings[player1]
        rating2 = elo_ratings[player2]

        expected1 = expected_score(rating1, rating2)
        expected2 = expected_score(rating2, rating1)

        elo_ratings[player1] = rating1 + k * (score1 - expected1)
        elo_ratings[player2] = rating2 + k * ((1 - score1) - expected2)

    data = pd.DataFrame(list(elo_ratings.items()), columns=['PlayerName', 'Elo'])
    return data.sort_values(by='PlayerName').reset_index(drop=True)


surfaces = data_raw['Surface'].unique()
courts = data_raw['Court'].unique()

# One Elo table per court type, then one per surface; each entry is [category, table].
elos = []
for court in courts:
    elos.append([court, calculate_elo(data_raw[data_raw['Court'] == court])])

for surface in surfaces:
    elos.append([surface, calculate_elo(data_raw[data_raw['Surface'] == surface])])

# Start from every player in data_raw rather than from the players of a single
# category table: with left merges, anyone missing from the base frame would be
# dropped from data_elo even though they have ratings in other categories.
all_player_names = (
    pd.concat([data_raw['Player_1'], data_raw['Player_2']])
    .drop_duplicates()
    .sort_values()
)
data_elo = pd.DataFrame({'PlayerName': all_player_names.to_numpy()})

for col_name, elo in elos:
    # Rename before merging so each category lands in its own 'Elo<category>' column.
    elo_named = elo.rename(columns={'Elo': 'Elo' + col_name})
    data_elo = data_elo.merge(elo_named, on='PlayerName', how='left')

# Players with no match in a category keep the starting rating there.
data_elo = data_elo.fillna(def_elo)

print(data_elo.head())
data_elo.to_csv('./../data/data_elo.csv', index=True)


      PlayerName   EloOutdoor    EloIndoor      EloHard      EloClay  \
0       Hajek J.  1507.157068  1500.000000  1500.000000  1504.890330   
1     Abdulla M.  1488.300389  1500.000000  1484.729199  1500.000000   
2        Abel M.  1439.261836  1500.000000  1453.861729  1486.483151   
3     Acasuso J.  1678.150091  1408.922091  1551.317914  1677.978440   
4  Adaktusson J.  1489.039221  1500.000000  1487.391930  1500.000000   

      EloGrass    EloCarpet  
0  1500.000000  1500.000000  
1  1500.000000  1500.000000  
2  1489.389761  1500.000000  
3  1395.062325  1444.174528  
4  1500.000000  1500.000000  


In [51]:
# # HEAD to HEAD

# head_to_head_dict[a][b] = number of matches in data_raw that a won against b.
head_to_head_dict = {}

for player1, player2, winner in zip(data_raw['Player_1'], data_raw['Player_2'], data_raw['Winner']):
    wins_of_player1 = head_to_head_dict.setdefault(player1, {})
    wins_of_player2 = head_to_head_dict.setdefault(player2, {})
    # Register the pairing in both directions so a 0 is recorded for the loser.
    wins_of_player1.setdefault(player2, 0)
    wins_of_player2.setdefault(player1, 0)
    if winner == player1:
        wins_of_player1[player2] += 1
    elif winner == player2:
        wins_of_player2[player1] += 1
    # A Winner that matches neither participant is not attributed to anyone.


# Square matrix (row = winner, column = loser), initialised to 0.
data_head_to_head = pd.DataFrame(0, index=data_players['PlayerName'], columns=data_players['PlayerName'])

# Only pairs that actually met can be non-zero, so fill from the dictionary
# instead of visiting every (player, player) combination.
known_players = set(data_players['PlayerName'])
for row_player, opponents in head_to_head_dict.items():
    if row_player not in known_players:
        continue
    for col_player, wins in opponents.items():
        # Skip the diagonal and any name outside the matrix (.at would otherwise add it).
        if col_player == row_player or col_player not in known_players:
            continue
        data_head_to_head.at[row_player, col_player] = wins

print(data_head_to_head.head())
data_head_to_head.to_csv('./../data/data_head_to_head.csv', index=True)

PlayerName      Dosedel S.  Clement A.  Escude N.  Knippschild J.  \
PlayerName                                                          
Dosedel S.               0           0          1               0   
Clement A.               0           0          1               1   
Escude N.                2           1          0               0   
Knippschild J.           0           0          0               0   
Fromberg R.              0           1          0               0   

PlayerName      Fromberg R.  Arthurs W.  Grosjean S.  Balcells J.  Hewitt L.  \
PlayerName                                                                     
Dosedel S.                0           0            0            0          0   
Clement A.                0           2            3            1          1   
Escude N.                 0           2            2            1          1   
Knippschild J.            0           0            0            0          0   
Fromberg R.               0         

In [52]:
# matches in last 7 and 30 days
import datetime


# Parse the dates into a new frame instead of assigning a column into data_raw,
# which at this point is a boolean-filtered selection of the loaded data and can
# trigger pandas' SettingWithCopyWarning. Unparseable dates become NaT.
data_raw = data_raw.assign(Date=pd.to_datetime(data_raw['Date'], errors='coerce'))

today = datetime.datetime.strptime(today_date, '%Y-%m-%d')
data_tmp = data_raw.dropna(subset=['Date'])
# Matches at most 30 (resp. 7) days before `today`; the 7-day set is a subset of the 30-day set.
data_tmp30 = data_tmp[today - data_tmp['Date'] <= pd.Timedelta(days=30)]
data_tmp7 = data_tmp30[today - data_tmp30['Date'] <= pd.Timedelta(days=7)]


def count_matches_per_player(matches: pd.DataFrame) -> dict:
    """Return ``{player name: number of rows of `matches` the player appears in}``.

    A player is counted once per row for appearing in ``Player_1`` and once
    per row for appearing in ``Player_2``.
    """
    appearances = pd.concat([matches['Player_1'], matches['Player_2']])
    return appearances.value_counts().to_dict()


last_30 = count_matches_per_player(data_tmp30)
last_7 = count_matches_per_player(data_tmp7)

# Players without a recent match are absent from the dictionaries and get 0.
data_last_matches = pd.DataFrame({
    'PlayerName': data_players['PlayerName'],
    'Last30': [last_30.get(player, 0) for player in data_players['PlayerName']],
    'Last7': [last_7.get(player, 0) for player in data_players['PlayerName']]
})

print(data_last_matches.head())
data_last_matches.to_csv('./../data/data_last_matches.csv', index=True)

       PlayerName  Last30  Last7
0      Dosedel S.       0      0
1      Clement A.       0      0
2       Escude N.       0      0
3  Knippschild J.       0      0
4     Fromberg R.       0      0


In [53]:
# odds

# Average bookmaker-implied win probability per player over the 365 days before `today`.
# Relies on data_raw['Date'] already being datetime and on `today` being defined.
within_last_year = today - data_raw['Date'] <= pd.Timedelta(days=365)
df = data_raw.loc[within_last_year, ['Player_1', 'Player_2', 'Winner', 'Odd_1', 'Odd_2']].copy()

df['Odd_1'] = pd.to_numeric(df['Odd_1'], errors='coerce')
df['Odd_2'] = pd.to_numeric(df['Odd_2'], errors='coerce')
df = df.dropna(subset=['Odd_1', 'Odd_2'])
# 1 / odd is only a probability for positive odds; zero or negative values
# would inject inf or negative numbers into the per-player mean.
df = df[(df['Odd_1'] > 0) & (df['Odd_2'] > 0)].copy()

df["Imp_Prob_1"] = 1 / df["Odd_1"]
df["Imp_Prob_2"] = 1 / df["Odd_2"]

# One row per (match, participant); 'Role' records which column the name came from.
df_long = pd.melt(df, id_vars=["Winner", "Imp_Prob_1", "Imp_Prob_2"],
                  value_vars=["Player_1", "Player_2"],
                  var_name="Role", value_name="PlayerName")

df_long["Win"] = df_long["Winner"] == df_long["PlayerName"]
# Pick the implied probability that belongs to the participant on this row.
came_from_player_1 = df_long["Role"] == "Player_1"
df_long["Prob"] = df_long["Imp_Prob_1"].where(came_from_player_1, df_long["Imp_Prob_2"])

df_long = df_long[['PlayerName', 'Prob', 'Win']]

data_player_odds = df_long.groupby('PlayerName').agg(
    AvgOddsProb=('Prob', 'mean'),
).reset_index()



print(data_player_odds)
data_player_odds.to_csv('./../data/data_player_odds.csv', index=True)


       PlayerName  AvgOddsProb
0    Ajdukovic D.     0.364719
1        Albot R.     0.365077
2      Alcaraz C.     0.865758
3     Altmaier D.     0.441941
4    Andreozzi G.     0.285714
..            ...          ...
284   Zeppieri G.     0.284427
285     Zhang Zh.     0.468727
286       Zhou Y.     0.151282
287   Zhukayev B.     0.348148
288     Zverev A.     0.780127

[289 rows x 2 columns]


In [54]:
# Aggregated data

import numpy as np

# Left-join each per-player feature table onto the full player list.
data = (
    data_players
    .merge(data_elo, on='PlayerName', how='left')
    .merge(data_last_matches, on='PlayerName', how='left')
    .merge(data_player_odds, on='PlayerName', how='left')
)

all_elo_types = np.concatenate((data_raw['Surface'].unique(), data_raw['Court'].unique()))

# Default for each feature when a player has no row in the corresponding table:
# the starting rating for Elo columns, 0 for match counts and average odds probability.
fill_values = {'Elo' + elo_type: def_elo for elo_type in all_elo_types}
fill_values.update({'Last30': 0, 'Last7': 0, 'AvgOddsProb': 0})
for column, fill_value in fill_values.items():
    data[column] = data[column].fillna(fill_value)


print(data)

data.to_csv('./../data/data_combined.csv', index=False)


          PlayerName   EloOutdoor    EloIndoor      EloHard      EloClay  \
0         Dosedel S.  1424.417920  1441.871260  1437.329452  1428.730470   
1         Clement A.  1520.985499  1478.616951  1516.515622  1464.921890   
2          Escude N.  1714.102239  1660.715871  1730.433793  1479.273914   
3     Knippschild J.  1445.548859  1448.510527  1427.102951  1441.495186   
4        Fromberg R.  1398.104593  1486.806130  1484.510330  1395.090719   
...              ...          ...          ...          ...          ...   
1718  Kasnikowski M.  1485.776376  1500.000000  1483.188056  1500.000000   
1719      Jacquet K.  1496.441550  1500.000000  1496.949376  1500.000000   
1720          Sun F.  1487.114432  1500.000000  1484.089900  1500.000000   
1721    Collignon R.  1500.000000  1500.000000  1500.000000  1500.000000   
1722  Papamalamis T.  1500.000000  1500.000000  1500.000000  1500.000000   

         EloGrass    EloCarpet  Last30  Last7  AvgOddsProb  
0     1507.382300  1484.00