In [9]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import shutil


source_path = kagglehub.dataset_download("dissfya/wta-tennis-2007-2023-daily-update")


destination_path = "./../data"


shutil.copytree(source_path, destination_path, dirs_exist_ok=True)


data_raw = pd.read_csv(destination_path + '/wta.csv')
print(data_raw.columns)
print(data_raw)

Index(['Tournament', 'Date', 'Court', 'Surface', 'Round', 'Best of',
       'Player_1', 'Player_2', 'Winner', 'Rank_1', 'Rank_2', 'Pts_1', 'Pts_2',
       'Odd_1', 'Odd_2', 'Score'],
      dtype='object')
             Tournament                 Date    Court Surface          Round  \
0           ASB Classic  2007-01-01 00:00:00  Outdoor    Hard      1st Round   
1           ASB Classic  2007-01-01 00:00:00  Outdoor    Hard      1st Round   
2           ASB Classic  2007-01-01 00:00:00  Outdoor    Hard      1st Round   
3           ASB Classic  2007-01-01 00:00:00  Outdoor    Hard      1st Round   
4           ASB Classic  2007-01-01 00:00:00  Outdoor    Hard      1st Round   
...                 ...                  ...      ...     ...            ...   
41726  BNP Paribas Open  2025-03-13 00:00:00  Outdoor    Hard  Quarterfinals   
41727  BNP Paribas Open  2025-03-14 00:00:00  Outdoor    Hard  Quarterfinals   
41728  BNP Paribas Open  2025-03-14 00:00:00  Outdoor    Hard     Semifinal

  data_raw = pd.read_csv(destination_path + '/wta.csv')


In [10]:
# all players

data_players = pd.DataFrame(pd.concat([data_raw['Player_1'], data_raw['Player_2']]).unique(), columns=['PlayerName'])
print(data_players.head())
data_players.to_csv('./../data/data_players.csv', index=True)

    PlayerName
0     Sun T.T.
1   Myskina A.
2      Loit E.
3  Nakamura A.
4   Bartoli M.


In [11]:
# ELO calculation
def_elo = 1500


def expected_score(rating1, rating2):
        return 1 / (1 + 10 ** ((rating2 - rating1) / 400))
    
def calculate_elo(matches: pd.DataFrame, k = 32) -> pd.DataFrame:
    players = pd.concat([matches['Player_1'], matches['Player_2']]).unique()
    elo_ratings = {player: def_elo for player in players}
    for index, row in matches.iterrows():
        player1 = row['Player_1']
        player2 = row['Player_2']
        winner = row['Winner']
        
        rating1 = elo_ratings[player1]
        rating2 = elo_ratings[player2]
        
        expected1 = expected_score(rating1, rating2)
        expected2 = expected_score(rating2, rating1)
        
        if winner == player1:
            elo_ratings[player1] = rating1 + k * (1 - expected1)
            elo_ratings[player2] = rating2 + k * (0 - expected2)
        else:
            elo_ratings[player1] = rating1 + k * (0 - expected1)
            elo_ratings[player2] = rating2 + k * (1 - expected2)

    data = pd.DataFrame(list(elo_ratings.items()), columns=['PlayerName', 'Elo'])
    data = data.sort_values(by='PlayerName')
    data.reset_index(inplace=True, drop=True)
    return data


elos = []
surfaces = data_raw['Surface'].unique()
courts = data_raw['Court'].unique()
for court in courts:
    for surface in surfaces:
        data = data_raw[(data_raw['Court'] == court) & (data_raw['Surface'] == surface)]
        elo = calculate_elo(data)
        elos.append([court + surface, elo])
        
elos.append(['All', calculate_elo(data_raw)])
        
data_elo = elos[0][1]['PlayerName']

for row in elos:
    col_name = row[0]
    elo = row[1]
    data_elo = pd.merge(data_elo, elo, on='PlayerName', how='left')
    data_elo.rename(columns={'Elo': 'Elo' + col_name}, inplace=True)

data_elo.fillna(def_elo, inplace=True)

print(data_elo.head())
data_elo.to_csv('./../data/data_elo.csv', index=True)


       PlayerName  EloOutdoorHard  EloOutdoorCarpet  EloOutdoorClay  \
0       Abanda F.     1475.098186              1500     1506.014819   
1  Abduraimova N.     1422.805703              1500     1481.022004   
2    Abramovic M.     1499.735257              1500     1500.000000   
3     Adamczak M.     1433.445692              1500     1501.595078   
4          Ahn K.     1528.219816              1500     1483.168280   

   EloOutdoorGrass  EloOutdoorGreenset  EloIndoorHard  EloIndoorCarpet  \
0      1498.918468                1500    1512.437816           1500.0   
1      1500.000000                1500    1500.000000           1500.0   
2      1500.000000                1500    1500.000000           1500.0   
3      1500.000000                1500    1484.141129           1500.0   
4      1504.843360                1500    1466.928197           1500.0   

   EloIndoorClay  EloIndoorGrass  EloIndoorGreenset  EloClayHard  \
0         1500.0            1500             1500.0       15

  data_elo.fillna(def_elo, inplace=True)


In [12]:
# # HEAD to HEAD

head_to_head_dict = {}

for index, row in data_raw.iterrows():
    player1 = row['Player_1']
    player2 = row['Player_2']
    winner = row['Winner']
    if player1 not in head_to_head_dict:
        head_to_head_dict[player1] = {}
    if player2 not in head_to_head_dict:
        head_to_head_dict[player2] = {}
    if player2 not in head_to_head_dict[player1]:
        head_to_head_dict[player1][player2] = 0
    if player1 not in head_to_head_dict[player2]:
        head_to_head_dict[player2][player1] = 0
    if winner == player1:
        head_to_head_dict[player1][player2] += 1
    else:
        head_to_head_dict[player2][player1] += 1
        


data_head_to_head = pd.DataFrame(0, index=data_players['PlayerName'], columns=data_players['PlayerName'])

for index, row in data_head_to_head.iterrows():
    for column in data_head_to_head.columns:
        if index == column:
            continue
        else:
            if index in head_to_head_dict and column in head_to_head_dict[index]:
                data_head_to_head.at[index, column] = head_to_head_dict[index][column]
            
print(data_head_to_head.head())
data_head_to_head.to_csv('./../data/data_head_to_head.csv', index=True)

PlayerName   Sun T.T.  Myskina A.  Loit E.  Nakamura A.  Bartoli M.  Perry S.  \
PlayerName                                                                      
Sun T.T.            0           0        0            0           0         0   
Myskina A.          0           0        0            0           0         0   
Loit E.             2           0        0            0           1         0   
Nakamura A.         0           0        0            0           0         0   
Bartoli M.          0           0        0            1           0         1   

PlayerName   Daniilidou E.  Tu M.  Razzano V.  Johansson M.  ...  Avdeeva J.  \
PlayerName                                                   ...               
Sun T.T.                 0      0           0             0  ...           0   
Myskina A.               0      0           0             0  ...           0   
Loit E.                  1      0           0             0  ...           0   
Nakamura A.              1      