In [48]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import shutil
import random

# Fetch the ATP match archive through kagglehub and mirror the downloaded folder
# into the project's data directory before reading the match table.
source_path = kagglehub.dataset_download("dissfya/atp-tennis-2000-2023daily-pull")
destination_path = "./../data"
shutil.copytree(source_path, destination_path, dirs_exist_ok=True)
data_raw = pd.read_csv(destination_path + '/atp_tennis.csv')

# Alternative source (WTA). Swap it with the four statements above to switch tours.
# source_path = kagglehub.dataset_download("dissfya/wta-tennis-2007-2023-daily-update")
# destination_path = "./../data"
# shutil.copytree(source_path, destination_path, dirs_exist_ok=True)
# data_raw = pd.read_csv(destination_path + '/wta.csv')

# Keep an untouched snapshot of everything that was downloaded.
last_day = data_raw['Date'].max()
print(f'Last day in the dataset: {last_day}')
data_raw.to_csv("./../data/data_raw.csv", index=False)

# Cut-off between the training and testing periods. 'Date' is read without date
# parsing, so both comparisons below are made against this 'YYYY-MM-DD' value.
today_date = '2025-01-01'
# today_date = last_day
on_or_after_cutoff = data_raw['Date'] >= today_date
before_cutoff = data_raw['Date'] < today_date

# Matches on or after the cut-off are set aside for testing.
data_for_prediction = data_raw.loc[on_or_after_cutoff]
data_for_prediction.to_csv("./../data/data_for_testing.csv", index=False)

# From here on `data_raw` holds only the matches strictly before the cut-off.
data_raw = data_raw.loc[before_cutoff]
data_raw.to_csv("./../data/data_for_training.csv", index=False)

print(data_raw.head())
print(len(data_raw))


Last day in the dataset: 2025-03-16
                           Tournament        Date         Series    Court  \
0  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
1  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
2  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
3  Australian Hardcourt Championships  2000-01-03  International  Outdoor   
4  Australian Hardcourt Championships  2000-01-03  International  Outdoor   

  Surface      Round  Best of        Player_1       Player_2       Winner  \
0    Hard  1st Round        3      Dosedel S.    Ljubicic I.   Dosedel S.   
1    Hard  1st Round        3      Clement A.     Enqvist T.   Enqvist T.   
2    Hard  1st Round        3       Escude N.  Baccanello P.    Escude N.   
3    Hard  1st Round        3  Knippschild J.     Federer R.   Federer R.   
4    Hard  1st Round        3     Fromberg R.  Woodbridge T.  Fromberg R.   

   Rank_1  Rank_2  Pts_1  Pts_2  Odd_1

In [49]:
# all players

# Stack both player columns and keep each name once, in order of first appearance.
stacked_player_names = pd.concat([data_raw['Player_1'], data_raw['Player_2']])
distinct_player_names = stacked_player_names.unique()
data_players = pd.DataFrame(distinct_player_names, columns=['PlayerName'])

print(data_players.head())
data_players.to_csv('./../data/data_players.csv', index=True)

       PlayerName
0      Dosedel S.
1      Clement A.
2       Escude N.
3  Knippschild J.
4     Fromberg R.


In [50]:
# ELO calculation
# Rating every player starts from in calculate_elo; the same value is used as the
# fill value wherever a player has no rating for a given court or surface.
def_elo = 1500


def expected_score(rating1, rating2):
    """Return the expected score of the player rated ``rating1`` against ``rating2``.

    Uses the logistic Elo curve with base 10 and a 400-point scale, so equal
    ratings give 0.5 and the two players' expectations add up to 1.
    """
    rating_gap = rating2 - rating1
    return 1 / (1 + 10 ** (rating_gap / 400))
    
def calculate_elo(matches: pd.DataFrame, k = 32) -> pd.DataFrame:
    """Replay ``matches`` in row order and return each player's final Elo rating.

    Parameters
    ----------
    matches : pd.DataFrame
        Must provide the columns 'Player_1', 'Player_2' and 'Winner'.
    k : int, default 32
        Update factor applied to the gap between actual and expected score.

    Returns
    -------
    pd.DataFrame
        Columns 'PlayerName' and 'Elo', sorted by 'PlayerName' with a fresh
        0..n-1 index. Every player starts from ``def_elo``.
    """
    roster = pd.concat([matches['Player_1'], matches['Player_2']]).unique()
    ratings = dict.fromkeys(roster, def_elo)

    for first, second, victor in zip(matches['Player_1'], matches['Player_2'], matches['Winner']):
        first_rating = ratings[first]
        second_rating = ratings[second]

        first_expected = expected_score(first_rating, second_rating)
        second_expected = expected_score(second_rating, first_rating)

        # Any winner value other than the first player is credited to the second.
        first_score = 1 if victor == first else 0
        second_score = 1 - first_score

        ratings[first] = first_rating + k * (first_score - first_expected)
        ratings[second] = second_rating + k * (second_score - second_expected)

    return (
        pd.DataFrame(list(ratings.items()), columns=['PlayerName', 'Elo'])
        .sort_values(by='PlayerName')
        .reset_index(drop=True)
    )


# One Elo table per court type, followed by one per surface. Each entry of `elos`
# is [category label, DataFrame returned by calculate_elo for that category].
elos = []
surfaces = data_raw['Surface'].unique()
courts = data_raw['Court'].unique()
for column, categories in (('Court', courts), ('Surface', surfaces)):
    for category in categories:
        category_matches = data_raw[data_raw[column] == category]
        elos.append([category, calculate_elo(category_matches)])

# Seed the combined table with EVERY player in data_raw. Seeding it with the
# names of the first category only would make the left-merges below silently
# drop players who never played in that category, losing their other ratings.
all_player_names = pd.concat([data_raw['Player_1'], data_raw['Player_2']]).unique()
data_elo = (
    pd.DataFrame({'PlayerName': all_player_names})
    .sort_values(by='PlayerName')
    .reset_index(drop=True)
)

# Attach one 'Elo<category>' column per table; players without matches in a
# category get NaN here and the default rating just below.
for col_name, elo in elos:
    data_elo = pd.merge(
        data_elo,
        elo.rename(columns={'Elo': 'Elo' + col_name}),
        on='PlayerName',
        how='left',
    )

# Fill only the rating columns so the name column is never overwritten.
elo_columns = [name for name in data_elo.columns if name != 'PlayerName']
data_elo[elo_columns] = data_elo[elo_columns].fillna(def_elo)

print(data_elo.head())
data_elo.to_csv('./../data/data_elo.csv', index=True)


      PlayerName   EloOutdoor    EloIndoor      EloHard      EloClay  \
0       Hajek J.  1507.157068  1500.000000  1500.000000  1504.890330   
1     Abdulla M.  1488.300389  1500.000000  1484.729199  1500.000000   
2        Abel M.  1439.261836  1500.000000  1453.861729  1486.483151   
3     Acasuso J.  1678.150091  1408.922091  1551.317914  1677.978440   
4  Adaktusson J.  1489.039221  1500.000000  1487.391930  1500.000000   

      EloGrass    EloCarpet  
0  1500.000000  1500.000000  
1  1500.000000  1500.000000  
2  1489.389761  1500.000000  
3  1395.062325  1444.174528  
4  1500.000000  1500.000000  


In [None]:
# # HEAD to HEAD

# head_to_head_dict[a][b] counts the matches in data_raw that `a` won against `b`.
# Both directions of a pairing are created as soon as the pairing is seen, so a
# player with no wins against an opponent still has an explicit 0 entry.
head_to_head_dict = {}

for player1, player2, winner in zip(data_raw['Player_1'], data_raw['Player_2'], data_raw['Winner']):
    wins_of_player1 = head_to_head_dict.setdefault(player1, {})
    wins_of_player2 = head_to_head_dict.setdefault(player2, {})
    wins_of_player1.setdefault(player2, 0)
    wins_of_player2.setdefault(player1, 0)
    # Any winner value other than player1 is credited to player2.
    if winner == player1:
        wins_of_player1[player2] += 1
    else:
        wins_of_player2[player1] += 1

# Square matrix: rows are the winning player, columns the beaten opponent.
data_head_to_head = pd.DataFrame(0, index=data_players['PlayerName'], columns=data_players['PlayerName'])

# Only recorded pairings can be non-zero, so write those directly instead of
# visiting every cell of the players x players matrix.
known_players = set(data_players['PlayerName'])
for player, opponents in head_to_head_dict.items():
    if player not in known_players:
        continue
    for opponent, wins in opponents.items():
        # The diagonal stays 0, and names missing from data_players are skipped
        # so that `.at` never enlarges the matrix.
        if opponent == player or opponent not in known_players:
            continue
        data_head_to_head.at[player, opponent] = wins

print(data_head_to_head.head())
data_head_to_head.to_csv('./../data/data_head_to_head.csv', index=True)

In [None]:
# matches in last 7 and 30 days
import datetime


def _count_appearances(frame):
    """Return {player name: number of rows of ``frame`` the player appears in}.

    A row counts once for its 'Player_1' and once for its 'Player_2'.
    """
    tally = {}
    for first, second in zip(frame['Player_1'], frame['Player_2']):
        for name in (first, second):
            tally[name] = tally.get(name, 0) + 1
    return tally


# Parse the dates in place; values that cannot be parsed become NaT and are
# dropped before the windows are cut.
data_raw['Date'] = pd.to_datetime(data_raw['Date'], errors='coerce')

today = datetime.datetime.strptime(today_date, '%Y-%m-%d')
data_tmp = data_raw.dropna(subset=['Date'])

# The 7-day window is cut out of the 30-day window.
within_30_days = (today - data_tmp['Date']) <= pd.Timedelta(days=30)
data_tmp30 = data_tmp[within_30_days]
within_7_days = (today - data_tmp30['Date']) <= pd.Timedelta(days=7)
data_tmp7 = data_tmp30[within_7_days]

last_30 = _count_appearances(data_tmp30)
last_7 = _count_appearances(data_tmp7)

# One row per known player; players absent from a window get 0.
player_names = data_players['PlayerName']
data_last_matches = pd.DataFrame({
    'PlayerName': player_names,
    'Last30': [last_30.get(name, 0) for name in player_names],
    'Last7': [last_7.get(name, 0) for name in player_names],
})

print(data_last_matches.head())
data_last_matches.to_csv('./../data/data_last_matches.csv', index=True)

    PlayerName  Last30  Last7
0     Sun T.T.       0      0
1   Myskina A.       0      0
2      Loit E.       0      0
3  Nakamura A.       0      0
4   Bartoli M.       0      0


In [None]:
# odds

# Average bookmaker-implied win probability per player over the 365 days before
# `today`. Relies on `today` and on data_raw['Date'] already being datetime
# (both are set up by the "matches in last 7 and 30 days" cell).
recent_mask = (today - data_raw['Date']) <= pd.Timedelta(days=365)
df = data_raw.loc[recent_mask, ['Player_1', 'Player_2', 'Winner', 'Odd_1', 'Odd_2']].copy()

df['Odd_1'] = pd.to_numeric(df['Odd_1'], errors='coerce')
df['Odd_2'] = pd.to_numeric(df['Odd_2'], errors='coerce')
df = df.dropna(subset=['Odd_1', 'Odd_2'])
# Non-positive odds (for example a placeholder standing in for a missing quote)
# would turn into negative or infinite probabilities below, so drop those rows.
df = df[(df['Odd_1'] > 0) & (df['Odd_2'] > 0)].copy()

# Implied probability of a decimal odd is its reciprocal.
df["Imp_Prob_1"] = 1 / df["Odd_1"]
df["Imp_Prob_2"] = 1 / df["Odd_2"]

# Long format: one row per (match, player), with 'Role' naming the source column.
df_long = pd.melt(df, id_vars=["Winner", "Imp_Prob_1", "Imp_Prob_2"],
                  value_vars=["Player_1", "Player_2"],
                  var_name="Role", value_name="PlayerName")

df_long["Win"] = df_long["Winner"] == df_long["PlayerName"]
# Take the probability that belongs to the row's own role (vectorised).
df_long["Prob"] = df_long["Imp_Prob_1"].where(df_long["Role"] == "Player_1", df_long["Imp_Prob_2"])

df_long = df_long[['PlayerName', 'Prob', 'Win']]

data_player_odds = df_long.groupby('PlayerName').agg(
    AvgOddsProb=('Prob', 'mean'),
).reset_index()

print(data_player_odds)
data_player_odds.to_csv('./../data/data_player_odds.csv', index=True)


         PlayerName  AvgOddsProb
0          Aiava D.     0.066667
1    Alexandrova E.     0.582423
2      Andreescu B.     0.577515
3       Andreeva E.     0.461536
4       Andreeva M.     0.715804
..              ...          ...
297        Zheng Q.     0.704958
298        Zheng S.     0.162085
299        Zheng W.     0.380228
300          Zhu L.     0.488628
301     Zidansek T.     0.396299

[302 rows x 2 columns]


In [None]:
# Aggregated data

import numpy as np

# One row per player: left-join the Elo ratings, the recent-activity counts and
# the average implied probability onto the full player list.
data = (
    data_players
    .merge(data_elo, on='PlayerName', how='left')
    .merge(data_last_matches, on='PlayerName', how='left')
    .merge(data_player_odds, on='PlayerName', how='left')
)

# Players missing from the Elo table fall back to the default rating.
all_elo_types = np.concatenate((data_raw['Surface'].unique(), data_raw['Court'].unique()))
for elo_type in all_elo_types:
    rating_column = 'Elo' + elo_type
    data[rating_column] = data[rating_column].fillna(def_elo)

# Missing activity counts and missing odds are both stored as 0.
for zero_filled_column in ('Last30', 'Last7', 'AvgOddsProb'):
    data[zero_filled_column] = data[zero_filled_column].fillna(0)

print(data)

data.to_csv('./../data/data_combined.csv', index=False)


            PlayerName   EloOutdoor    EloIndoor  EloClay      EloHard  \
0             Sun T.T.  1352.750720  1500.000000   1500.0  1411.617147   
1           Myskina A.  1466.900611  1500.000000   1500.0  1484.000000   
2              Loit E.  1438.998973  1488.482855   1500.0  1444.645362   
3          Nakamura A.  1332.782209  1484.000000   1500.0  1325.313621   
4           Bartoli M.  1775.167621  1590.712580   1500.0  1709.439519   
...                ...          ...          ...      ...          ...   
1222     Jamrichova R.  1486.222192  1500.000000   1484.0  1467.449583   
1223           Noel A.  1489.693059  1500.000000   1500.0  1487.190036   
1224          Falei A.  1486.799579  1500.000000   1500.0  1484.072037   
1225            Ren Y.  1483.635952  1500.000000   1500.0  1482.606995   
1226  Stojsavljevic M.  1481.319995  1500.000000   1500.0  1480.503658   

        EloCarpet      EloClay     EloGrass  EloGreenset  Last30  Last7  \
0     1500.000000  1427.790344  1468