In [47]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import shutil


source_path = kagglehub.dataset_download("dissfya/wta-tennis-2007-2023-daily-update")


destination_path = "./../data"


shutil.copytree(source_path, destination_path, dirs_exist_ok=True)


data_raw = pd.read_csv(destination_path + '/wta.csv')
print(data_raw.head())
print(data_raw.dtypes)

    Tournament                 Date    Court Surface      Round  Best of  \
0  ASB Classic  2007-01-01 00:00:00  Outdoor    Hard  1st Round        3   
1  ASB Classic  2007-01-01 00:00:00  Outdoor    Hard  1st Round        3   
2  ASB Classic  2007-01-01 00:00:00  Outdoor    Hard  1st Round        3   
3  ASB Classic  2007-01-01 00:00:00  Outdoor    Hard  1st Round        3   
4  ASB Classic  2007-01-01 00:00:00  Outdoor    Hard  1st Round        3   

      Player_1      Player_2      Winner  Rank_1  Rank_2  Pts_1  Pts_2 Odd_1  \
0     Sun T.T.      Baker L.    Sun T.T.      81     272    332     90  1.33   
1   Myskina A.      Dulko G.    Dulko G.      16      59   1000    401  1.22   
2      Loit E.  Birnerova E.     Loit E.      56      84    418    324  1.72   
3  Nakamura A.    Craybas J.  Craybas J.      57      70    405    365  1.83   
4   Bartoli M.     Morita A.  Bartoli M.      18     180    951    152  1.16   

   Odd_2        Score  
0   3.00     6-1 6-1   
1   3.75  1-6 

  data_raw = pd.read_csv(destination_path + '/wta.csv')


In [48]:
# all players

data_players = pd.DataFrame(pd.concat([data_raw['Player_1'], data_raw['Player_2']]).unique(), columns=['PlayerName'])
print(data_players.head())
data_players.to_csv('./../data/data_players.csv', index=True)

    PlayerName
0     Sun T.T.
1   Myskina A.
2      Loit E.
3  Nakamura A.
4   Bartoli M.


In [49]:
# ELO calculation
def_elo = 1500


def expected_score(rating1, rating2):
        return 1 / (1 + 10 ** ((rating2 - rating1) / 400))
    
def calculate_elo(matches: pd.DataFrame, k = 32) -> pd.DataFrame:
    players = pd.concat([matches['Player_1'], matches['Player_2']]).unique()
    elo_ratings = {player: def_elo for player in players}
    for index, row in matches.iterrows():
        player1 = row['Player_1']
        player2 = row['Player_2']
        winner = row['Winner']
        
        rating1 = elo_ratings[player1]
        rating2 = elo_ratings[player2]
        
        expected1 = expected_score(rating1, rating2)
        expected2 = expected_score(rating2, rating1)
        
        if winner == player1:
            elo_ratings[player1] = rating1 + k * (1 - expected1)
            elo_ratings[player2] = rating2 + k * (0 - expected2)
        else:
            elo_ratings[player1] = rating1 + k * (0 - expected1)
            elo_ratings[player2] = rating2 + k * (1 - expected2)

    data = pd.DataFrame(list(elo_ratings.items()), columns=['PlayerName', 'Elo'])
    data = data.sort_values(by='PlayerName')
    data.reset_index(inplace=True, drop=True)
    return data


elos = []
surfaces = data_raw['Surface'].unique()
courts = data_raw['Court'].unique()
for court in courts:
    for surface in surfaces:
        data = data_raw[(data_raw['Court'] == court) & (data_raw['Surface'] == surface)]
        elo = calculate_elo(data)
        elos.append([court + surface, elo])
        
elos.append(['All', calculate_elo(data_raw)])
        
data_elo = elos[0][1]['PlayerName']

for row in elos:
    col_name = row[0]
    elo = row[1]
    data_elo = pd.merge(data_elo, elo, on='PlayerName', how='left')
    data_elo.rename(columns={'Elo': 'Elo' + col_name}, inplace=True)

data_elo.fillna(def_elo, inplace=True)

print(data_elo.head())
data_elo.to_csv('./../data/data_elo.csv', index=True)


       PlayerName  EloOutdoorHard  EloOutdoorCarpet  EloOutdoorClay  \
0       Abanda F.     1475.098186              1500     1506.014819   
1  Abduraimova N.     1422.805703              1500     1481.022004   
2    Abramovic M.     1499.735257              1500     1500.000000   
3     Adamczak M.     1433.445692              1500     1501.595078   
4          Ahn K.     1528.219816              1500     1483.168280   

   EloOutdoorGrass  EloOutdoorGreenset  EloIndoorHard  EloIndoorCarpet  \
0      1498.918468                1500    1512.437816           1500.0   
1      1500.000000                1500    1500.000000           1500.0   
2      1500.000000                1500    1500.000000           1500.0   
3      1500.000000                1500    1484.141129           1500.0   
4      1504.843360                1500    1466.928197           1500.0   

   EloIndoorClay  EloIndoorGrass  EloIndoorGreenset  EloClayHard  \
0         1500.0            1500             1500.0       15

  data_elo.fillna(def_elo, inplace=True)


In [50]:
# # HEAD to HEAD

head_to_head_dict = {}

for index, row in data_raw.iterrows():
    player1 = row['Player_1']
    player2 = row['Player_2']
    winner = row['Winner']
    if player1 not in head_to_head_dict:
        head_to_head_dict[player1] = {}
    if player2 not in head_to_head_dict:
        head_to_head_dict[player2] = {}
    if player2 not in head_to_head_dict[player1]:
        head_to_head_dict[player1][player2] = 0
    if player1 not in head_to_head_dict[player2]:
        head_to_head_dict[player2][player1] = 0
    if winner == player1:
        head_to_head_dict[player1][player2] += 1
    else:
        head_to_head_dict[player2][player1] += 1
        


data_head_to_head = pd.DataFrame(0, index=data_players['PlayerName'], columns=data_players['PlayerName'])

for index, row in data_head_to_head.iterrows():
    for column in data_head_to_head.columns:
        if index == column:
            continue
        else:
            if index in head_to_head_dict and column in head_to_head_dict[index]:
                data_head_to_head.at[index, column] = head_to_head_dict[index][column]
            
print(data_head_to_head.head())
data_head_to_head.to_csv('./../data/data_head_to_head.csv', index=True)

PlayerName   Sun T.T.  Myskina A.  Loit E.  Nakamura A.  Bartoli M.  Perry S.  \
PlayerName                                                                      
Sun T.T.            0           0        0            0           0         0   
Myskina A.          0           0        0            0           0         0   
Loit E.             2           0        0            0           1         0   
Nakamura A.         0           0        0            0           0         0   
Bartoli M.          0           0        0            1           0         1   

PlayerName   Daniilidou E.  Tu M.  Razzano V.  Johansson M.  ...  Avdeeva J.  \
PlayerName                                                   ...               
Sun T.T.                 0      0           0             0  ...           0   
Myskina A.               0      0           0             0  ...           0   
Loit E.                  1      0           0             0  ...           0   
Nakamura A.              1      

In [51]:
# matches in last 7 and 30 days
data_raw['Date'] = pd.to_datetime(data_raw['Date'], errors='coerce')

today = pd.Timestamp.today()
data_tmp = data_raw.dropna(subset=['Date'])
data_tmp30 = data_tmp[today - data_tmp['Date'] <= pd.Timedelta(days=30)]
data_tmp7 = data_tmp30[today - data_tmp30['Date'] <= pd.Timedelta(days=7)]

last_30 = {}
last_7 = {}

for index, row in data_tmp30.iterrows():
    player1 = row['Player_1']
    player2 = row['Player_2']
    if player1 not in last_30:
        last_30[player1] = 0
    if player2 not in last_30:
        last_30[player2] = 0
    last_30[player1] += 1
    last_30[player2] += 1
    
for index, row in data_tmp7.iterrows():
    player1 = row['Player_1']
    player2 = row['Player_2']
    if player1 not in last_7:
        last_7[player1] = 0
    if player2 not in last_7:
        last_7[player2] = 0
    last_7[player1] += 1
    last_7[player2] += 1
    
data_last_matches = pd.DataFrame({
    'PlayerName': data_players['PlayerName'],
    'Last30': [last_30[player] if player in last_30 else 0 for player in data_players['PlayerName']],
    'Last7': [last_7[player] if player in last_7 else 0 for player in data_players['PlayerName']]
})

print(data_last_matches.head())
data_last_matches.to_csv('./../data/data_last_matches.csv', index=True)

    PlayerName  Last30  Last7
0     Sun T.T.       0      0
1   Myskina A.       0      0
2      Loit E.       0      0
3  Nakamura A.       0      0
4   Bartoli M.       0      0


In [53]:
# odds

df = data_raw[['Player_1', 'Player_2', 'Winner', 'Odd_1', 'Odd_2']][today - data_raw['Date'] <= pd.Timedelta(days=365)].copy()

df['Odd_1'] = pd.to_numeric(df['Odd_1'], errors='coerce')
df['Odd_2'] = pd.to_numeric(df['Odd_2'], errors='coerce')
df = df.dropna(subset=['Odd_1', 'Odd_2'])

df["Imp_Prob_1"] = 1 / df["Odd_1"]
df["Imp_Prob_2"] = 1 / df["Odd_2"]

df_long = pd.melt(df, id_vars=["Winner", "Imp_Prob_1", "Imp_Prob_2"],
                  value_vars=["Player_1", "Player_2"],
                  var_name="Role", value_name="PlayerName")

df_long["Win"] = df_long["Winner"] == df_long["PlayerName"]
df_long["Prob"] = [row['Imp_Prob_1'] if row['Role'] == 'Player_1' else row['Imp_Prob_2'] for index, row in df_long.iterrows()]

df_long = df_long[['PlayerName', 'Prob', 'Win']]

data_player_odds = df_long.groupby('PlayerName').agg(
    AvgProb=('Prob', 'mean'),
    MatchesPlayed=('Win', 'count'),
).reset_index()

print(data_player_odds)
data_player_odds.to_csv('./../data/data_player_odds.csv', index=True)


         PlayerName   AvgProb  MatchesPlayed
0          Aiava D.  0.277904              3
1    Alexandrova E.  0.558492             49
2      Andreescu B.  0.577515             18
3       Andreeva E.  0.451859             22
4       Andreeva M.  0.677573             58
..              ...       ...            ...
297        Zheng Q.  0.709904             52
298        Zheng S.  0.169602              8
299        Zheng W.  0.380228              1
300          Zhu L.  0.437984             10
301     Zidansek T.  0.382660              9

[302 rows x 3 columns]
