# Clean Data

This notebook will clean and randomize the data extracted from https://github.com/JeffSackmann/tennis_atp, the infosys API, and Odds Data from tennis-abstract.

## Import, Merge and Clean Data

In [10]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
from collections import defaultdict
import re
import warnings

Import Data

In [11]:
#Jeff Sackmann ATP Datasets
jeff_data = pd.DataFrame()
for year in range(1991, 2025):
    file = "./data/all/atp_matches_"+str(year)+".csv"
    
    year_data = pd.read_csv(file)

    jeff_data = pd.concat([jeff_data, year_data], axis=0)
    
jeff_data['tourney_date'] = pd.to_datetime(jeff_data['tourney_date'], format='%Y%m%d')
jeff_data.loc[jeff_data['winner_id'] == 209870, 'winner_id'] = 211326
jeff_data.loc[jeff_data['loser_id'] == 209870, 'loser_id'] = 211326

print(len(jeff_data))
jeff_data.head(5)

108375


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,1991-339,Adelaide,Hard,32,A,1990-12-31,1,101723,,,Magnus Larsson,R,193.0,SWE,20.7,101414,1.0,,Boris Becker,R,190.0,GER,23.1,6-4 3-6 7-6(2),3,R32,130.0,6.0,2.0,96.0,55.0,39.0,25.0,15.0,2.0,4.0,8.0,3.0,95.0,62.0,44.0,23.0,16.0,6.0,8.0,56.0,,2.0,
1,1991-339,Adelaide,Hard,32,A,1990-12-31,2,100946,,Q,Slobodan Zivojinovic,R,198.0,YUG,27.4,101256,,,Mark Kratzmann,L,178.0,AUS,24.6,6-3 3-6 7-6(6),3,R32,119.0,19.0,4.0,101.0,56.0,45.0,25.0,15.0,9.0,10.0,8.0,2.0,84.0,41.0,35.0,27.0,15.0,1.0,2.0,304.0,,75.0,
2,1991-339,Adelaide,Hard,32,A,1990-12-31,3,101234,,,Patrik Kuhnen,R,190.0,GER,24.8,101421,,,Veli Paloheimo,R,183.0,FIN,23.0,6-0 6-4,3,R32,71.0,6.0,1.0,54.0,31.0,24.0,13.0,8.0,1.0,1.0,2.0,2.0,60.0,37.0,22.0,6.0,8.0,4.0,8.0,82.0,,69.0,
3,1991-339,Adelaide,Hard,32,A,1990-12-31,4,101889,8.0,,Todd Woodbridge,R,178.0,AUS,19.7,101703,,,Guillaume Raoux,R,180.0,FRA,20.8,7-6(2) 6-1,3,R32,85.0,2.0,0.0,60.0,40.0,30.0,14.0,9.0,3.0,3.0,3.0,3.0,74.0,45.0,30.0,11.0,10.0,5.0,8.0,50.0,,84.0,
4,1991-339,Adelaide,Hard,32,A,1990-12-31,5,101274,,,Udo Riglewski,R,185.0,GER,24.4,101843,4.0,,Sergi Bruguera,R,188.0,ESP,19.9,7-5 6-3,3,R32,90.0,4.0,2.0,72.0,40.0,33.0,14.0,10.0,7.0,8.0,2.0,2.0,77.0,41.0,28.0,15.0,11.0,4.0,8.0,88.0,,28.0,


In [14]:
#Infosys API Dataset (from 0.5.DataScrape2025)
matches_2025 = pd.read_csv("./data/all/atp_matches_scrape_2025.csv", index_col=0)
print(len(matches_2025))
matches_2025.head()

1989


Unnamed: 0,tourney_name,tourney_id,round,winner_name,winner_ioc,loser_name,loser_ioc,score,tourney_date,draw_size,surface,winner_id,loser_id,winner_ht,loser_ht,winner_age,loser_age,tourney_level,best_of,winner_rank,winner_rank_points,loser_rank,loser_rank_points,w_ace,l_ace,w_df,l_df,w_SvGms,l_SvGms,w_1stIn,l_1stIn,w_1stWon,l_1stWon,w_2ndWon,l_2ndWon,w_svpt,l_svpt,w_bpSaved,l_bpSaved,w_bpFaced,l_bpFaced
0,Brisbane,339,F,Jiri Lehecka,cze,Reilly Opelka,usa,4-1,2024-12-29,32.0,Hard,208103,124187,185.0,211.0,23.1,27.3,A,3,28.0,1660.0,293.0,176.0,,,,,,,,,,,,,,,,,,
1,Brisbane,339,SF,Jiri Lehecka,cze,Grigor Dimitrov,bul,6-4 4-4,2024-12-29,32.0,Hard,208103,105777,185.0,191.0,23.1,33.6,A,3,28.0,1660.0,10.0,3350.0,4.0,6.0,3.0,2.0,9.0,9.0,26.0,32.0,22.0,24.0,17.0,12.0,39.0,36.0,0.0,1.0,0.0,2.0
2,Brisbane,339,SF,Reilly Opelka,usa,Giovanni Mpetshi Perricard,fra,6-3 7-6(4),2024-12-29,32.0,Hard,124187,208659,211.0,203.0,27.3,21.5,A,3,293.0,176.0,31.0,1561.0,12.0,10.0,2.0,5.0,11.0,10.0,45.0,48.0,38.0,38.0,13.0,9.0,51.0,47.0,4.0,1.0,4.0,2.0
3,Brisbane,339,QF,Reilly Opelka,usa,Novak Djokovic,srb,7-6(6) 6-3,2024-12-29,32.0,Hard,124187,104925,211.0,188.0,27.3,37.6,A,3,293.0,176.0,7.0,3910.0,16.0,8.0,1.0,1.0,11.0,10.0,49.0,53.0,38.0,38.0,13.0,12.0,51.0,50.0,1.0,4.0,1.0,5.0
4,Brisbane,339,QF,Grigor Dimitrov,bul,Jordan Thompson,aus,6-1 2-1,2024-12-29,32.0,Hard,105777,111442,191.0,183.0,33.6,30.7,A,3,10.0,3350.0,26.0,1745.0,1.0,1.0,1.0,1.0,5.0,5.0,21.0,12.0,19.0,7.0,5.0,6.0,24.0,13.0,0.0,1.0,0.0,4.0


In [49]:
#Odds Dataset tennis-data.co.uk/alldata.php
odds_data = pd.DataFrame()

warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')

for year in range(2001, 2026):
    for ext, engine in [('.xlsx', 'openpyxl'), ('.xls', 'xlrd')]:
        file = f"./data/odds/{year}{ext}"
        if os.path.exists(file):
            try:
                year_data = pd.read_excel(file, engine=engine)
                odds_data = pd.concat([odds_data, year_data], ignore_index=True)
                break
            except Exception as e:
                print(f"Error reading {file}: {e}")
                continue
    else:
        print(f"Warning: no .xls or .xlsx file found for {year}")

print(len(odds_data))
odds_data.tail(5)

65541


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,CBW,CBL,GBW,GBL,IWW,IWL,SBW,SBL,B365W,B365L,B&WW,B&WL,EXW,EXL,PSW,PSL,WPts,LPts,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
65536,42,Washington,Citi Open,2025-07-26,ATP500,Outdoor,Hard,Quarterfinals,3.0,Shelton B.,Tiafoe F.,8.0,11.0,7.0,6.0,6.0,4.0,,,,,,,2.0,0.0,Completed,,,,,,,,,1.4,3.0,,,,,1.47,2.91,3330.0,2940.0,,,,,,,1.47,3.0,1.43,2.78
65537,42,Washington,Citi Open,2025-07-26,ATP500,Outdoor,Hard,Quarterfinals,3.0,Davidovich Fokina A.,Fritz T.,26.0,4.0,7.0,6.0,3.0,6.0,7.0,5.0,,,,,2.0,1.0,Completed,,,,,,,,,3.75,1.29,,,,,3.77,1.32,1945.0,5035.0,,,,,,,3.77,1.32,3.69,1.28
65538,42,Washington,Citi Open,2025-07-27,ATP500,Outdoor,Hard,Semifinals,3.0,De Minaur A.,Moutet C.,13.0,59.0,6.0,4.0,6.0,3.0,,,,,,,2.0,0.0,Completed,,,,,,,,,1.15,5.5,,,,,1.19,5.44,2885.0,944.0,,,,,,,1.2,5.5,1.17,5.0
65539,42,Washington,Citi Open,2025-07-27,ATP500,Outdoor,Hard,Semifinals,3.0,Davidovich Fokina A.,Shelton B.,26.0,8.0,6.0,2.0,7.0,5.0,,,,,,,2.0,0.0,Completed,,,,,,,,,3.4,1.33,,,,,3.63,1.33,1945.0,3330.0,,,,,,,3.63,1.33,3.4,1.31
65540,42,Washington,Citi Open,2025-07-27,ATP500,Outdoor,Hard,The Final,3.0,De Minaur A.,Davidovich Fokina A.,13.0,26.0,5.0,7.0,6.0,1.0,7.0,6.0,,,,,2.0,1.0,Completed,,,,,,,,,1.4,3.0,,,,,1.44,3.0,2885.0,1945.0,,,,,,,1.45,3.0,1.42,2.8


### Player Elo Rating Calculation (based on ultimatetennisstatistics.com method)

In [50]:
def elo_calculation(df1,
                   df2,
                   id1_col='winner_id',
                   id2_col='loser_id',
                   date_col='tourney_date',
                   round_col='round',
                   level_col='tourney_level',
                   best_of_col='best_of',
                   surface_col='surface',
                   start_elo=1500):
    """
    Processes df1 then df2 in chronological order,
    carrying over player Elo ratings from df1.
    Returns: (df1_out, df2_out), each with the added columns:
      - winner_elo, loser_elo
      - winner_surface_elo, loser_surface_elo
    """
    #Multipliers for tournament/round/best‑of
    Tournament_Coeff = {"G":1.0, "F":0.9, "M":0.85, "A":0.75}
    Round_Coeff      = {"F":1.0,"BR":0.95,"SF":0.9,"QF":0.85,
                        "RR":0.85,"R16":0.8,"R32":0.8,
                        "R64":0.75,"R128":0.75,"ER":0.75}
    Best_Of_Coeff    = {3:0.9, 5:1.0}
    
    round_rank = {
      'RR': 0, 'R128': 1, 'R64': 2, 'R32': 3, 'R16': 4,
      'QF': 5, 'SF':  6, 'BR':  7,
      'F':    8
    }

    elo = defaultdict(lambda: start_elo)
    surface_elo = defaultdict(lambda: defaultdict(lambda: start_elo))
    
    #sort dataframes by tourney_date and then round
    def _annotate(df):
        df = df.assign(
            _round_rank = df[round_col].map(round_rank).fillna(99)
        )
        df = (
          df
          .sort_values([date_col, '_round_rank'])
          .reset_index(drop=True)
        )
        
        p1_elo_b, p2_elo_b = [], []
        p1_srf_elo_b, p2_srf_elo_b = [], []
        
        for _, row in df.iterrows():
            p1, p2 = row[id1_col], row[id2_col]
            lvl, rnd = row[level_col], row[round_col]
            bof, srf = row[best_of_col], row[surface_col]

            r1, r2 = elo[p1], elo[p2]
            s1, s2 = surface_elo[p1][srf], surface_elo[p2][srf]

            # record Elos
            p1_elo_b.append(r1)
            p2_elo_b.append(r2)
            p1_srf_elo_b.append(s1)
            p2_srf_elo_b.append(s2)

            # rating‐factor formula
            rf1 = 1 + 18/(1 + 2**((r1-1500)/63))
            rf2 = 1 + 18/(1 + 2**((r2-1500)/63))
            rs1 = 1 + 18/(1 + 2**((s1-1500)/63))
            rs2 = 1 + 18/(1 + 2**((s2-1500)/63))
            
            def get_k(t_coeff, r_coeff, b_coeff, rf):
                return 32 * t_coeff * r_coeff * b_coeff * rf
            
            k1 = get_k(Tournament_Coeff.get(lvl,0.75),
                       Round_Coeff.get(rnd,0.8),
                       Best_Of_Coeff.get(bof,0.9),
                       rf1)
            k2 = get_k(Tournament_Coeff.get(lvl,0.75),
                       Round_Coeff.get(rnd,0.8),
                       Best_Of_Coeff.get(bof,0.9),
                       rf2)
            ks1 = get_k(Tournament_Coeff.get(lvl,0.75),
                        Round_Coeff.get(rnd,0.8),
                        Best_Of_Coeff.get(bof,0.9),
                        rs1)
            ks2 = get_k(Tournament_Coeff.get(lvl,0.75),
                        Round_Coeff.get(rnd,0.8),
                        Best_Of_Coeff.get(bof,0.9),
                        rs2)

            # expectations
            exp1 = 1.0 / (1.0 + 10**((r2-r1)/400))
            exp2 = 1.0 / (1.0 + 10**((r1-r2)/400))
            exps1 = 1.0 / (1.0 + 10**((s2-s1)/400))
            exps2 = 1.0 / (1.0 + 10**((s1-s2)/400))

            elo[p1] += k1*(1-exp1)
            elo[p2] += k2*(0-exp2)
            surface_elo[p1][srf] += ks1*(1-exps1)
            surface_elo[p2][srf] += ks2*(0-exps2)

        df['winner_elo'] = p1_elo_b
        df['loser_elo']  = p2_elo_b
        df['winner_surface_elo'] = p1_srf_elo_b
        df['loser_surface_elo']  = p2_srf_elo_b
        return df

    out1 = _annotate(df1)
    out2 = _annotate(df2)
    return out1, out2


In [51]:
jeff_data, matches_2025 = elo_calculation(jeff_data, matches_2025)
all_data = pd.concat([jeff_data, matches_2025], ignore_index=True)
all_data = all_data.drop(columns='_round_rank')

### Merge Stats with Odds Data (all_data, odds_data)
1. Clean Odds Dataset

In [52]:
#Odds columns to numeric
cols_to_numeric = [
    'CBW', 'GBW', 'IWW', 'SBW', 'B365W', 'B&WW', 'EXW', 'PSW',
    'CBL', 'GBL', 'IWL', 'SBL', 'B365L', 'B&WL', 'EXL', 'PSL'
]
for col in cols_to_numeric:
    odds_data[col] = pd.to_numeric(odds_data[col], errors='coerce')

#Calculate best odds across markets
odds_data['winner_Max'] = odds_data.loc[:, ['CBW', 'GBW', 'IWW', 'SBW', 'B365W', 'B&WW', 'EXW', 'PSW', 'UBW', 'LBW', 'SJW']].max(axis=1)
odds_data['loser_Max'] = odds_data.loc[:, ['CBL', 'GBL', 'IWL', 'SBL', 'B365L', 'B&WL', 'EXL', 'PSL', 'UBL', 'LBL', 'SJL']].max(axis=1)

#Calculate odds inferred win probability
odds_data['winner_winprob'] = np.where(
    odds_data['PSW'].notna() & odds_data['PSL'].notna(),
    (1 / odds_data['PSW']) / ((1 / odds_data['PSW']) + (1 / odds_data['PSL'])),
    np.where(
        odds_data['B365W'].notna() & odds_data['B365L'].notna(),
        (1 / odds_data['B365W']) / ((1 / odds_data['B365W']) + (1 / odds_data['B365L'])),
        np.nan
    )
)
odds_data['loser_winprob'] = np.where(
    odds_data['PSW'].notna() & odds_data['PSL'].notna(),
    (1 / odds_data['PSL']) / ((1 / odds_data['PSW']) + (1 / odds_data['PSL'])),
    np.where(
        odds_data['B365W'].notna() & odds_data['B365L'].notna(),
        (1 / odds_data['B365L']) / ((1 / odds_data['B365W']) + (1 / odds_data['B365L'])),
        np.nan
    )
)

odds_data.dropna(subset=['winner_winprob','loser_winprob'], inplace=True)
odds_data = odds_data.drop(['MaxW', 'MaxL', 'AvgW', 'AvgL', 'ATP', 'Location', 'Tournament', 'Series', 'Court', 'Surface', 'Round', 'Best of', 'Winner', 'Loser', 'W1', 'W2', 'W3', 'W4', 'W5', 'L1', 'L2', 'L3', 'L4', 'L5', 'Wsets', 'Lsets', 'Comment', 'CBW', 'CBL', 'GBW', 'GBL', 'IWW', 'IWL', 'SBW', 'SBL', 'B365W', 'B365L', 'B&WW', 'B&WL', 'EXW', 'EXL', 'PSW', 'PSL', 'WPts', 'LPts', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL'], axis=1)


2. Clean all_data

In [53]:
all_data['tourney_date'] = pd.to_datetime(all_data['tourney_date'])
all_data = all_data.loc[(all_data['tourney_date'] >= '2000-01-01')]

all_data = all_data.dropna(subset=['winner_id', 'loser_id', 'winner_rank_points', 'loser_rank_points', 'winner_rank', 'loser_rank', "surface", "tourney_date"])

for df, cols in [(all_data, ['winner_rank','loser_rank']),
                 (odds_data,      ['WRank','LRank'])]:
    for c in cols:
        df[c] = df[c].round(1)
        
all_data = all_data.reset_index(drop=True)
odds_data = odds_data.reset_index(drop=True)

3. Merge Dataframes on date and rank

In [54]:
# Prepare both frames: rename, drop NaNs, ensure datetimes
odds_tmp = (
    odds_data
    .rename(columns={'WRank':'winner_rank','LRank':'loser_rank'})
    .dropna(subset=['winner_rank','loser_rank','Date'])
)
all_tmp = all_data.copy()
odds_tmp['Date'] = pd.to_datetime(odds_tmp['Date'])

def merge_group(left_grp, right_grp, days_tol=3):
    left_grp  = left_grp.sort_values('tourney_date')
    right_grp = right_grp.sort_values('Date')
    return pd.merge_asof(
        left_grp, right_grp,
        left_on   = 'tourney_date',
        right_on  = 'Date',
        tolerance = pd.Timedelta(days=days_tol),
        direction = 'nearest',
        suffixes  = ('','_odds')
    )

# Loop over each (winner_rank, loser_rank) pair
merged_pieces = []
for (wr, lr), left_grp in all_tmp.groupby(['winner_rank','loser_rank'], sort=False):
    right_grp = odds_tmp[
        (odds_tmp['winner_rank'] == wr) &
        (odds_tmp['loser_rank']  == lr)
    ]
    if right_grp.empty:
        merged_pieces.append(left_grp.assign(**{c: pd.NA for c in odds_tmp.columns if c not in ['winner_rank','loser_rank','Date']}))
    else:
        merged_pieces.append(merge_group(left_grp, right_grp, days_tol=3))

all_data_odds = pd.concat(merged_pieces, ignore_index=True)
all_data_odds = all_data_odds.drop(columns=['Date'])
all_data_odds.dropna(subset=['winner_winprob','loser_winprob'], inplace=True)
print(len(all_data_odds))
all_data_odds.head(5)

  all_data_odds = pd.concat(merged_pieces, ignore_index=True)


43278


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,winner_elo,loser_elo,winner_surface_elo,loser_surface_elo,winner_rank_odds,loser_rank_odds,winner_Max,loser_Max,winner_winprob,loser_winprob
1,2009-500,Halle,Grass,32.0,A,2009-06-08,24.0,104925,2.0,,Novak Djokovic,R,188.0,SRB,22.0,103722,,,Florent Serra,R,180.0,FRA,28.2,5-7 7-5 6-1,3,R16,125.0,14.0,1.0,87.0,46.0,36.0,24.0,15.0,11.0,13.0,7.0,3.0,107.0,67.0,43.0,20.0,16.0,6.0,11.0,4.0,8020.0,56.0,1056.0,2270.604487,1890.351567,1871.077328,1631.170311,4.0,56.0,1.05,12.0,0.921659,0.078341
2,2013-747,Beijing,Hard,32.0,A,2013-09-30,19.0,103970,3.0,,David Ferrer,R,175.0,ESP,31.4,104594,,,Marinko Matosevic,R,194.0,AUS,28.1,5-7 6-1 6-2,3,R16,119.0,3.0,4.0,90.0,51.0,37.0,20.0,14.0,5.0,8.0,2.0,5.0,74.0,40.0,23.0,12.0,13.0,5.0,11.0,4.0,6710.0,56.0,846.0,2231.646862,1879.416941,2143.254219,1863.985457,4.0,56.0,1.35,3.56,0.726531,0.273469
8,2019-7648,Budapest,Clay,32.0,A,2019-04-22,271.0,104898,,,Robin Haase,R,191.0,NED,32.0,105341,,,Thomas Fabbiano,R,173.0,ITA,29.9,6-7(4) 6-3 6-2,3,R32,145.0,5.0,2.0,88.0,55.0,40.0,20.0,14.0,4.0,5.0,0.0,3.0,119.0,86.0,47.0,20.0,15.0,7.0,12.0,64.0,835.0,91.0,631.0,1840.131052,1773.604025,1746.737532,1611.769889,64.0,91.0,1.5,2.79,0.651869,0.348131
12,2009-421,Canada Masters,Hard,56.0,M,2009-08-10,12.0,103908,,,Paul Henri Mathieu,R,185.0,FRA,27.5,104198,,,Guillermo Garcia Lopez,R,188.0,ESP,26.1,7-5 0-6 6-1,3,R64,115.0,9.0,5.0,81.0,36.0,28.0,23.0,13.0,4.0,7.0,5.0,2.0,65.0,42.0,28.0,14.0,12.0,0.0,3.0,27.0,1495.0,54.0,924.0,1931.653767,1913.546012,1901.817136,1740.656489,27.0,54.0,1.45,2.85,0.662651,0.337349
13,2010-807,Acapulco,Clay,32.0,A,2010-02-22,20.0,104597,6.0,,Nicolas Almagro,R,183.0,ESP,24.5,104755,,,Richard Gasquet,R,185.0,FRA,23.6,3-6 7-5 7-6(7),3,R16,154.0,8.0,1.0,105.0,53.0,36.0,30.0,16.0,2.0,5.0,10.0,4.0,114.0,58.0,44.0,28.0,17.0,4.0,7.0,27.0,1370.0,54.0,810.0,1968.242564,2011.084775,1947.268855,1892.491381,27.0,54.0,1.649,2.42,0.594741,0.405259


### Feature Engineering
1. Calculate set by set scores

In [55]:
def split_match_scores(df, score_col='score', max_sets=5):
    """
    Parses df[score_col] into wide columns:
      – winner_set1…winner_set5, loser_set1…loser_set5
      – tiebreak_occurred (1 if any set had a () or [] tiebreak, else 0)
      – tiebreaks_won_winner, tiebreaks_won_loser (counts, NaN if none)
      – winner_margin1…winner_margin5, loser_margin1…loser_margin5
        (games_diff = player_games − opponent_games)
    Keeps all fully completed sets in RET/DEF/W/O matches, but blanks only
    the final incomplete set.
    """

    if df is None or not hasattr(df, "index"):
        raise ValueError("Input dataframe 'df' must be a valid pandas DataFrame, got None or invalid type.")

    # save & reset index
    orig_idx = df.index
    df2 = df.reset_index(drop=True).copy()

    # extract sets + tiebreak flags
    pat = re.compile(r'(?P<w>\d+)-(?P<l>\d+)(?P<tb>[\(\[]\d+(?:-\d+)?[\)\]])?')
    ext = df2[score_col].str.extractall(pat)
    ext[['w','l']] = ext[['w','l']].astype(int)
    ext['tb_flag'] = ext['tb'].notna().astype(int)

    # unstack into wide form
    winner = ext['w'].unstack(level=1)
    loser  = ext['l'].unstack(level=1)
    tb     = ext['tb_flag'].unstack(level=1)

    # widen 0-based cols to 1..max_sets
    def widen(piv, prefix, fill=np.nan):
        piv = piv.copy()
        piv.columns = [f"{prefix}{i+1}" for i in piv.columns]
        for i in range(1, max_sets+1):
            col = f"{prefix}{i}"
            if col not in piv:
                piv[col] = fill
        return piv[[f"{prefix}{i}" for i in range(1, max_sets+1)]]

    winner_sets = widen(winner, 'w_set')
    loser_sets  = widen(loser,  'l_set')
    tb_flags    = widen(tb,     'tb_flag', fill=0)

    out = pd.concat([df2, winner_sets, loser_sets], axis=1)
    
    # handle match completion flags
    retire = out[score_col].str.contains(r'\bRET\b|\bDEF\b|W/O',
                                         case=False, na=False)
    for i in range(1, max_sets+1):
        wcol, lcol = f"w_set{i}", f"l_set{i}"
        no_tb = tb_flags[f"tb_flag{i}"] == 0
        incomplete = (out[wcol].fillna(0) < 6) & (out[lcol].fillna(0) < 6) & no_tb
        out.loc[retire & incomplete, [wcol, lcol]] = np.nan

    # match-level tiebreak indicator & counts
    out['tiebreak_occurred'] = (tb_flags.sum(axis=1) > 0).astype(int)
    win_tbs  = [(tb_flags[f"tb_flag{i}"]==1) & (out[f"w_set{i}"] > out[f"l_set{i}"])
                 for i in range(1, max_sets+1)]
    lose_tbs = [(tb_flags[f"tb_flag{i}"]==1) & (out[f"l_set{i}"]  > out[f"w_set{i}"])
                 for i in range(1, max_sets+1)]
    out['w_tiebreaks_won'] = pd.concat(win_tbs, axis=1).sum(axis=1)
    out['l_tiebreaks_won']  = pd.concat(lose_tbs,axis=1).sum(axis=1)
    no_tb_match = out['tiebreak_occurred'] == 0
    out.loc[no_tb_match, ['w_tiebreaks_won','l_tiebreaks_won']] = np.nan

    # per-set margins
    for i in range(1, max_sets+1):
        out[f"w_margin{i}"] = out[f"w_set{i}"] - out[f"l_set{i}"]
        out[f"l_margin{i}"]  = out[f"l_set{i}"]  - out[f"w_set{i}"]

    out.index = orig_idx
    return out

if all_data_odds is None or not hasattr(all_data_odds, "index"):
    print("Error: all_data_odds is not a valid DataFrame. Please check the previous steps.")
else:
    all_data_odds = split_match_scores(all_data_odds, score_col='score', max_sets=5)

all_data_odds.head(5)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,winner_elo,loser_elo,winner_surface_elo,loser_surface_elo,winner_rank_odds,loser_rank_odds,winner_Max,loser_Max,winner_winprob,loser_winprob,w_set1,w_set2,w_set3,w_set4,w_set5,l_set1,l_set2,l_set3,l_set4,l_set5,tiebreak_occurred,w_tiebreaks_won,l_tiebreaks_won,w_margin1,l_margin1,w_margin2,l_margin2,w_margin3,l_margin3,w_margin4,l_margin4,w_margin5,l_margin5
1,2009-500,Halle,Grass,32.0,A,2009-06-08,24.0,104925,2.0,,Novak Djokovic,R,188.0,SRB,22.0,103722,,,Florent Serra,R,180.0,FRA,28.2,5-7 7-5 6-1,3,R16,125.0,14.0,1.0,87.0,46.0,36.0,24.0,15.0,11.0,13.0,7.0,3.0,107.0,67.0,43.0,20.0,16.0,6.0,11.0,4.0,8020.0,56.0,1056.0,2270.604487,1890.351567,1871.077328,1631.170311,4.0,56.0,1.05,12.0,0.921659,0.078341,5.0,7.0,6.0,,,7.0,5.0,1.0,,,0.0,,,-2.0,2.0,2.0,-2.0,5.0,-5.0,,,,
2,2013-747,Beijing,Hard,32.0,A,2013-09-30,19.0,103970,3.0,,David Ferrer,R,175.0,ESP,31.4,104594,,,Marinko Matosevic,R,194.0,AUS,28.1,5-7 6-1 6-2,3,R16,119.0,3.0,4.0,90.0,51.0,37.0,20.0,14.0,5.0,8.0,2.0,5.0,74.0,40.0,23.0,12.0,13.0,5.0,11.0,4.0,6710.0,56.0,846.0,2231.646862,1879.416941,2143.254219,1863.985457,4.0,56.0,1.35,3.56,0.726531,0.273469,5.0,6.0,6.0,,,7.0,1.0,2.0,,,0.0,,,-2.0,2.0,5.0,-5.0,4.0,-4.0,,,,
8,2019-7648,Budapest,Clay,32.0,A,2019-04-22,271.0,104898,,,Robin Haase,R,191.0,NED,32.0,105341,,,Thomas Fabbiano,R,173.0,ITA,29.9,6-7(4) 6-3 6-2,3,R32,145.0,5.0,2.0,88.0,55.0,40.0,20.0,14.0,4.0,5.0,0.0,3.0,119.0,86.0,47.0,20.0,15.0,7.0,12.0,64.0,835.0,91.0,631.0,1840.131052,1773.604025,1746.737532,1611.769889,64.0,91.0,1.5,2.79,0.651869,0.348131,6.0,6.0,6.0,,,7.0,3.0,2.0,,,1.0,0.0,1.0,-1.0,1.0,3.0,-3.0,4.0,-4.0,,,,
12,2009-421,Canada Masters,Hard,56.0,M,2009-08-10,12.0,103908,,,Paul Henri Mathieu,R,185.0,FRA,27.5,104198,,,Guillermo Garcia Lopez,R,188.0,ESP,26.1,7-5 0-6 6-1,3,R64,115.0,9.0,5.0,81.0,36.0,28.0,23.0,13.0,4.0,7.0,5.0,2.0,65.0,42.0,28.0,14.0,12.0,0.0,3.0,27.0,1495.0,54.0,924.0,1931.653767,1913.546012,1901.817136,1740.656489,27.0,54.0,1.45,2.85,0.662651,0.337349,7.0,0.0,6.0,,,5.0,6.0,1.0,,,0.0,,,2.0,-2.0,-6.0,6.0,5.0,-5.0,,,,
13,2010-807,Acapulco,Clay,32.0,A,2010-02-22,20.0,104597,6.0,,Nicolas Almagro,R,183.0,ESP,24.5,104755,,,Richard Gasquet,R,185.0,FRA,23.6,3-6 7-5 7-6(7),3,R16,154.0,8.0,1.0,105.0,53.0,36.0,30.0,16.0,2.0,5.0,10.0,4.0,114.0,58.0,44.0,28.0,17.0,4.0,7.0,27.0,1370.0,54.0,810.0,1968.242564,2011.084775,1947.268855,1892.491381,27.0,54.0,1.649,2.42,0.594741,0.405259,3.0,7.0,7.0,,,6.0,5.0,6.0,,,1.0,1.0,0.0,-3.0,3.0,2.0,-2.0,1.0,-1.0,,,,


2. Derive advanced stats

In [56]:
all_data_odds['w_bpconv'] = all_data_odds['l_bpFaced'] - all_data_odds['l_bpSaved']
all_data_odds['l_bpconv'] = all_data_odds['w_bpFaced'] - all_data_odds['w_bpSaved']
all_data_odds['w_bp/rg'] = all_data_odds['l_bpFaced']/all_data_odds['l_SvGms']
all_data_odds['l_bp/rg'] = all_data_odds['w_bpFaced']/all_data_odds['w_SvGms']
all_data_odds['w_tpw'] = all_data_odds['w_1stWon'] + all_data_odds['w_2ndWon'] + (all_data_odds['l_svpt'] - (all_data_odds['l_1stWon'] + all_data_odds['l_2ndWon']))
all_data_odds['l_tpw'] = all_data_odds['l_1stWon'] + all_data_odds['l_2ndWon'] + (all_data_odds['w_svpt'] - (all_data_odds['w_1stWon'] + all_data_odds['w_2ndWon']))
all_data_odds['w_sw'] = all_data_odds[['w_set1', 'w_set2', 'w_set3', 'w_set4', 'w_set5']].sum(axis=1, skipna=True)
all_data_odds['l_sw'] = all_data_odds[['l_set1', 'l_set2', 'l_set3', 'l_set4', 'l_set5']].sum(axis=1, skipna=True)

## Randomize Winner and Loser

Replace winner and loser column tags with Player1 (P1) & Player2 (P2)

In [57]:
columnsWinner = [
    "winner_id","winner_seed", "winner_entry", "winner_name", "winner_hand", "winner_ht", "winner_ioc", "winner_age", "w_ace", "w_df", "w_svpt", "w_1stIn", "w_1stWon", "w_2ndWon", "w_SvGms", "w_bpSaved", "w_bpFaced", "winner_rank","winner_rank_points", 'w_set1', 'w_set2', 'w_set3', 'w_set4', 'w_set5',  'w_tiebreaks_won','w_margin1','w_margin2','w_margin3','w_margin4','w_margin5','w_bpconv','w_bp/rg','w_tpw','w_sw', 'winner_Max', 'winner_winprob', 'winner_elo', 'winner_surface_elo'

]
columnsLoser = [
    "loser_id","loser_seed", "loser_entry", "loser_name", "loser_hand", "loser_ht", "loser_ioc", "loser_age", "l_ace", "l_df", "l_svpt", "l_1stIn", "l_1stWon", "l_2ndWon", "l_SvGms", "l_bpSaved", "l_bpFaced", "loser_rank","loser_rank_points",  'l_set1', 'l_set2', 'l_set3', 'l_set4', 'l_set5',  'l_tiebreaks_won', 'l_margin1', 'l_margin2', 'l_margin3', 'l_margin4', 'l_margin5', 'l_bpconv', 'l_bp/rg', 'l_tpw','l_sw', 'loser_Max', 'loser_winprob', 'loser_elo', 'loser_surface_elo'

]

# Rename mapping
rename_dict = {col: col.replace("winner", "p1").replace("w_", "p1_") for col in columnsWinner}
rename_dict.update({col: col.replace("loser", "p2").replace("l_", "p2_") for col in columnsLoser})

all_data_odds = all_data_odds.rename(columns=rename_dict)
all_data_odds.head(5)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,p1_id,p1_seed,p1_entry,p1_name,p1_hand,p1_ht,p1_ioc,p1_age,p2_id,p2_seed,p2_entry,p2_name,p2_hand,p2_ht,p2_ioc,p2_age,score,best_of,round,minutes,p1_ace,p1_df,p1_svpt,p1_1stIn,p1_1stWon,p1_2ndWon,p1_SvGms,p1_bpSaved,p1_bpFaced,p2_ace,p2_df,p2_svpt,p2_1stIn,p2_1stWon,p2_2ndWon,p2_SvGms,p2_bpSaved,p2_bpFaced,p1_rank,p1_rank_points,p2_rank,p2_rank_points,p1_elo,p2_elo,p1_surface_elo,p2_surface_elo,winner_rank_odds,loser_rank_odds,p1_Max,p2_Max,p1_winprob,p2_winprob,p1_set1,p1_set2,p1_set3,p1_set4,p1_set5,p2_set1,p2_set2,p2_set3,p2_set4,p2_set5,tiebreak_occurred,p1_tiebreaks_won,p2_tiebreaks_won,p1_margin1,p2_margin1,p1_margin2,p2_margin2,p1_margin3,p2_margin3,p1_margin4,p2_margin4,p1_margin5,p2_margin5,p1_bpconv,p2_bpconv,p1_bp/rg,p2_bp/rg,p1_tpw,p2_tpw,p1_sw,p2_sw
1,2009-500,Halle,Grass,32.0,A,2009-06-08,24.0,104925,2.0,,Novak Djokovic,R,188.0,SRB,22.0,103722,,,Florent Serra,R,180.0,FRA,28.2,5-7 7-5 6-1,3,R16,125.0,14.0,1.0,87.0,46.0,36.0,24.0,15.0,11.0,13.0,7.0,3.0,107.0,67.0,43.0,20.0,16.0,6.0,11.0,4.0,8020.0,56.0,1056.0,2270.604487,1890.351567,1871.077328,1631.170311,4.0,56.0,1.05,12.0,0.921659,0.078341,5.0,7.0,6.0,,,7.0,5.0,1.0,,,0.0,,,-2.0,2.0,2.0,-2.0,5.0,-5.0,,,,,5.0,2.0,0.6875,0.866667,104.0,90.0,18.0,13.0
2,2013-747,Beijing,Hard,32.0,A,2013-09-30,19.0,103970,3.0,,David Ferrer,R,175.0,ESP,31.4,104594,,,Marinko Matosevic,R,194.0,AUS,28.1,5-7 6-1 6-2,3,R16,119.0,3.0,4.0,90.0,51.0,37.0,20.0,14.0,5.0,8.0,2.0,5.0,74.0,40.0,23.0,12.0,13.0,5.0,11.0,4.0,6710.0,56.0,846.0,2231.646862,1879.416941,2143.254219,1863.985457,4.0,56.0,1.35,3.56,0.726531,0.273469,5.0,6.0,6.0,,,7.0,1.0,2.0,,,0.0,,,-2.0,2.0,5.0,-5.0,4.0,-4.0,,,,,6.0,3.0,0.846154,0.571429,96.0,68.0,17.0,10.0
8,2019-7648,Budapest,Clay,32.0,A,2019-04-22,271.0,104898,,,Robin Haase,R,191.0,NED,32.0,105341,,,Thomas Fabbiano,R,173.0,ITA,29.9,6-7(4) 6-3 6-2,3,R32,145.0,5.0,2.0,88.0,55.0,40.0,20.0,14.0,4.0,5.0,0.0,3.0,119.0,86.0,47.0,20.0,15.0,7.0,12.0,64.0,835.0,91.0,631.0,1840.131052,1773.604025,1746.737532,1611.769889,64.0,91.0,1.5,2.79,0.651869,0.348131,6.0,6.0,6.0,,,7.0,3.0,2.0,,,1.0,0.0,1.0,-1.0,1.0,3.0,-3.0,4.0,-4.0,,,,,5.0,1.0,0.8,0.357143,112.0,95.0,18.0,12.0
12,2009-421,Canada Masters,Hard,56.0,M,2009-08-10,12.0,103908,,,Paul Henri Mathieu,R,185.0,FRA,27.5,104198,,,Guillermo Garcia Lopez,R,188.0,ESP,26.1,7-5 0-6 6-1,3,R64,115.0,9.0,5.0,81.0,36.0,28.0,23.0,13.0,4.0,7.0,5.0,2.0,65.0,42.0,28.0,14.0,12.0,0.0,3.0,27.0,1495.0,54.0,924.0,1931.653767,1913.546012,1901.817136,1740.656489,27.0,54.0,1.45,2.85,0.662651,0.337349,7.0,0.0,6.0,,,5.0,6.0,1.0,,,0.0,,,2.0,-2.0,-6.0,6.0,5.0,-5.0,,,,,3.0,3.0,0.25,0.538462,74.0,72.0,13.0,12.0
13,2010-807,Acapulco,Clay,32.0,A,2010-02-22,20.0,104597,6.0,,Nicolas Almagro,R,183.0,ESP,24.5,104755,,,Richard Gasquet,R,185.0,FRA,23.6,3-6 7-5 7-6(7),3,R16,154.0,8.0,1.0,105.0,53.0,36.0,30.0,16.0,2.0,5.0,10.0,4.0,114.0,58.0,44.0,28.0,17.0,4.0,7.0,27.0,1370.0,54.0,810.0,1968.242564,2011.084775,1947.268855,1892.491381,27.0,54.0,1.649,2.42,0.594741,0.405259,3.0,7.0,7.0,,,6.0,5.0,6.0,,,1.0,1.0,0.0,-3.0,3.0,2.0,-2.0,1.0,-1.0,,,,,3.0,3.0,0.411765,0.3125,108.0,111.0,17.0,17.0


Resolve P1 always corresponding to winner by randomly swapping p1 and p2

In [58]:
mask = np.random.rand(len(all_data_odds)) < 0.5 

player1_cols = [col for col in all_data_odds.columns if "player1" in col or "p1_" in col]
player2_cols = [col for col in all_data_odds.columns if "player2" in col or "p2_" in col]

# Create the RESULT column (1 = player1 Win, 0 = Player2 Win)
all_data_odds["RESULT"] = np.where(mask, 0, 1)

# Swap values where mask is True
all_data_odds.loc[mask, player1_cols], all_data_odds.loc[mask, player2_cols] = all_data_odds.loc[mask, player2_cols].values, all_data_odds.loc[mask, player1_cols].values

### Standardize Tournament IDs
1. Identify Tournaments with multiple IDs

In [59]:
all_data_odds['tourney_id'] = all_data_odds['tourney_id'].apply(
    lambda x: str(x)[5:] if len(str(x)) > 4 and str(x)[4] == '-' else x
)

tourney_dict = all_data_odds.groupby('tourney_name')['tourney_id'].apply(
    lambda x: list(x.unique())
).to_dict()

multiple_ids = {name: ids for name, ids in tourney_dict.items() if len(ids) >= 2}

df_export = pd.DataFrame.from_dict(multiple_ids, orient='index')
df_export.index.name = 'tourney_name'
df_export = df_export.reset_index()

df_export.to_csv('./data/Tournament/multiple_ids.csv', index=False)

2. Resolve multiple IDs issue

In [60]:
update_df = pd.read_csv('./data/Tournament/tourney_id_update.csv')

all_data_odds = all_data_odds.merge(update_df, on='tourney_name', how='left')

all_data_odds['tourney_id'] = all_data_odds['New'].fillna(all_data_odds['tourney_id'])

all_data_odds = all_data_odds.drop('New', axis=1)

3. Identify remaining issues (irregular format, shared IDs)

In [61]:
# Group by tourney_id and get unique tournament names for each ID
id_to_names = all_data_odds.groupby('tourney_id')['tourney_name'].apply(
    lambda x: list(x.unique())
).to_dict()

# Filter for IDs that have multiple different tournament names
duplicate_ids = {id_val: names for id_val, names in id_to_names.items() if len(names) > 1}

# Print the results
if duplicate_ids:
    print("Found tournament IDs associated with multiple tournament names:")
    for id_val, names in duplicate_ids.items():
        print(f"ID '{id_val}' is used by: {names}")
else:
    print("No duplicate IDs found across different tournament names.")

Found tournament IDs associated with multiple tournament names:
ID '402.0' is used by: ['Miami Masters', 'Memphis']
ID '495.0' is used by: ['Dubai', 'Dubai Duty Free Tennis Championships']
ID '741.0' is used by: ['Eastbourne', 'Nottingham']
ID '1536.0' is used by: ['Madrid Masters', 'Mutua Madrid Open']
ID '8998.0' is used by: ['Adelaide', 'Adelaide 1', 'Adelaide 2', 'Adelaide International']
ID '306' is used by: ['St. Poelten', 'Poertschach']
ID '433' is used by: ['Las Vegas', 'Scottsdale']
ID '475' is used by: ['Sopot', 'Warsaw']
ID '5053' is used by: ['Belgrade ', 'Belgrade']
ID '560' is used by: ['US Open', 'Us Open']
ID '6932' is used by: ['Rio De Janeiro', 'ATP Rio de Janeiro']
ID '890' is used by: ['Shanghai', 'Ho Chi Minh City']
ID '9410' is used by: ['Nur-Sultan', 'Astana', 'Almaty']


In [62]:
unique_dash_ids = all_data_odds['tourney_id'].unique()
dash_ids = [tid for tid in unique_dash_ids if isinstance(tid, str) and '-' in tid]
dash_ids

[]

In [63]:
unique_tourney_ids_with_letters = [tid for tid in all_data_odds['tourney_id'].unique() if isinstance(tid, str) and any(c.isalpha() for c in tid)]
print(unique_tourney_ids_with_letters)

[]


4. Resolve remaining

In [64]:
id_map ={
    'Miami Masters': 403,
    'Sydney': 338,
    'Madrid Masters': 1536,
    'Rio de Janeiro': 6932
    
}
# Map tourney_name to id_map, otherwise keep original tourney_id
all_data_odds['tourney_id'] = all_data_odds.apply(
    lambda row: id_map[row['tourney_name']] if row['tourney_name'] in id_map else row['tourney_id'],
    axis=1
)

Export

In [66]:
all_data_odds.to_csv("./data/0cleanDataset.csv", index=False)
all_data_odds.tail()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,p1_id,p1_seed,p1_entry,p1_name,p1_hand,p1_ht,p1_ioc,p1_age,p2_id,p2_seed,p2_entry,p2_name,p2_hand,p2_ht,p2_ioc,p2_age,score,best_of,round,minutes,p1_ace,p1_df,p1_svpt,p1_1stIn,p1_1stWon,p1_2ndWon,p1_SvGms,p1_bpSaved,p1_bpFaced,p2_ace,p2_df,p2_svpt,p2_1stIn,p2_1stWon,p2_2ndWon,p2_SvGms,p2_bpSaved,p2_bpFaced,p1_rank,p1_rank_points,p2_rank,p2_rank_points,p1_elo,p2_elo,p1_surface_elo,p2_surface_elo,winner_rank_odds,loser_rank_odds,p1_Max,p2_Max,p1_winprob,p2_winprob,p1_set1,p1_set2,p1_set3,p1_set4,p1_set5,p2_set1,p2_set2,p2_set3,p2_set4,p2_set5,tiebreak_occurred,p1_tiebreaks_won,p2_tiebreaks_won,p1_margin1,p2_margin1,p1_margin2,p2_margin2,p1_margin3,p2_margin3,p1_margin4,p2_margin4,p1_margin5,p2_margin5,p1_bpconv,p2_bpconv,p1_bp/rg,p2_bp/rg,p1_tpw,p2_tpw,p1_sw,p2_sw,RESULT,0,1,2
43273,540,Wimbledon,Grass,128.0,G,2025-06-30,,207989,,,Carlos Alcaraz,,183.0,esp,22.2,210472,,,Oliver Tarvet,,,gbr,21.7,6-1 6-4 6-4,5,R64,,,,,,,,,,,,,,,,,,,,2.0,9300.0,733.0,36.0,2366.482786,1682.786124,2036.271771,1620.0,2.0,733.0,1.01,12.0,0.922367,0.077633,6.0,6.0,6.0,,,1.0,4.0,4.0,,,0.0,,,5.0,-5.0,2.0,-2.0,2.0,-2.0,,,,,,,,,,,18.0,9.0,1,,,
43274,540,Wimbledon,Grass,128.0,G,2025-06-30,,126094,,,Andrey Rublev,,188.0,rus,27.7,144750,,,Lloyd Harris,,193.0,rsa,28.3,6-7(1) 6-4 7-6(5) 6-3,5,R64,,,,,,,,,,,,,,,,,,,,14.0,2920.0,320.0,159.0,2007.317754,1821.391829,1819.238647,1710.882884,14.0,320.0,1.31,3.81,0.744141,0.255859,6.0,6.0,7.0,6.0,,7.0,4.0,6.0,3.0,,1.0,1.0,1.0,-1.0,1.0,2.0,-2.0,1.0,-1.0,3.0,-3.0,,,,,,,,,25.0,20.0,1,,,
43275,540,Wimbledon,Grass,128.0,G,2025-06-30,,209259,,,Arthur Fery,,175.0,gbr,23.0,209260,,,Luciano Darderi,,183.0,ita,23.4,6-4 6-3 6-3,5,R64,,,,,,,,,,,,,,,,,,,,461.0,91.0,59.0,989.0,1731.284313,1841.872845,1680.408805,1684.751612,59.0,461.0,1.81,2.13,0.540609,0.459391,4.0,3.0,3.0,,,6.0,6.0,6.0,,,0.0,,,-2.0,2.0,-3.0,3.0,-3.0,3.0,,,,,,,,,,,10.0,18.0,0,,,
43276,540,Wimbledon,Grass,128.0,G,2025-06-30,,210530,,,Learner Tien,,180.0,usa,19.6,111797,,,Nicolas Jarry,,201.0,chi,29.7,6-2 6-2 6-3,5,R64,,,,,,,,,,,,,,,,,,,,62.0,940.0,143.0,418.0,1840.485173,1845.417358,1658.876156,1730.975164,143.0,62.0,2.2,1.76,0.444444,0.555556,2.0,2.0,3.0,,,6.0,6.0,6.0,,,0.0,,,-4.0,4.0,-4.0,4.0,-3.0,3.0,,,,,,,,,,,7.0,18.0,0,,,
43277,540,Wimbledon,Grass,128.0,G,2025-06-30,,208316,,,Valentin Royer,,188.0,fra,24.1,105173,,,Adrian Mannarino,,180.0,fra,37.0,6-4 6-4 5-7 7-6(1),5,R64,,,,,,,,,,,,,,,,,,,,113.0,513.0,123.0,477.0,1738.933668,1814.297562,1698.253682,1785.658807,123.0,113.0,3.03,1.44,0.322148,0.677852,4.0,4.0,7.0,6.0,,6.0,6.0,5.0,7.0,,1.0,0.0,1.0,-2.0,2.0,-2.0,2.0,2.0,-2.0,-1.0,1.0,,,,,,,,,21.0,24.0,0,,,


In [67]:
all_data_odds.loc[all_data_odds['round'] == 'RR']

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,p1_id,p1_seed,p1_entry,p1_name,p1_hand,p1_ht,p1_ioc,p1_age,p2_id,p2_seed,p2_entry,p2_name,p2_hand,p2_ht,p2_ioc,p2_age,score,best_of,round,minutes,p1_ace,p1_df,p1_svpt,p1_1stIn,p1_1stWon,p1_2ndWon,p1_SvGms,p1_bpSaved,p1_bpFaced,p2_ace,p2_df,p2_svpt,p2_1stIn,p2_1stWon,p2_2ndWon,p2_SvGms,p2_bpSaved,p2_bpFaced,p1_rank,p1_rank_points,p2_rank,p2_rank_points,p1_elo,p2_elo,p1_surface_elo,p2_surface_elo,winner_rank_odds,loser_rank_odds,p1_Max,p2_Max,p1_winprob,p2_winprob,p1_set1,p1_set2,p1_set3,p1_set4,p1_set5,p2_set1,p2_set2,p2_set3,p2_set4,p2_set5,tiebreak_occurred,p1_tiebreaks_won,p2_tiebreaks_won,p1_margin1,p2_margin1,p1_margin2,p2_margin2,p1_margin3,p2_margin3,p1_margin4,p2_margin4,p1_margin5,p2_margin5,p1_bpconv,p2_bpconv,p1_bp/rg,p2_bp/rg,p1_tpw,p2_tpw,p1_sw,p2_sw,RESULT,0,1,2
33,433,Las Vegas,Hard,32.0,A,2007-02-26,504.0,103821,,Q,Danai Udomchoke,R,173.0,THA,25.5,104269,5.0,,Fernando Verdasco,L,188.0,ESP,23.2,6-1 5-7 6-2,3,RR,99.0,4.0,7.0,87.0,53.0,30.0,15.0,13.0,6.0,12.0,6.0,6.0,80.0,51.0,40.0,12.0,14.0,0.0,3.0,83.0,488.0,33.0,910.0,1806.053177,1897.784269,1728.009389,1838.294744,33.0,83.0,3.400,1.431,0.314436,0.685564,1.0,7.0,2.0,,,6.0,5.0,6.0,,,0.0,,,-5.0,5.0,2.0,-2.0,-4.0,4.0,,,,,3.0,6.0,0.214286,0.923077,73.0,94.0,10.0,17.0,0,,,
514,605,Masters Cup,Hard,8.0,F,2002-11-11,3.0,103720,,,Lleyton Hewitt,R,180.0,AUS,21.7,103498,,,Marat Safin,R,193.0,RUS,22.7,6-4 2-6 6-4,3,RR,136.0,3.0,4.0,92.0,38.0,28.0,29.0,14.0,4.0,7.0,12.0,2.0,92.0,50.0,36.0,19.0,14.0,3.0,6.0,1.0,3835.0,3.0,2845.0,2202.248048,2106.542666,2146.008788,1991.953488,1.0,3,1.571,2.500,0.588851,0.411149,6.0,2.0,6.0,,,4.0,6.0,4.0,,,0.0,,,2.0,-2.0,-4.0,4.0,2.0,-2.0,,,,,3.0,3.0,0.428571,0.500000,94.0,90.0,14.0,14.0,1,,,
515,605,Masters Cup,Hard,8.0,F,2004-11-15,2.0,103819,,,Roger Federer,R,185.0,SUI,23.2,103720,,,Lleyton Hewitt,R,180.0,AUS,23.7,6-3 6-4,3,RR,76.0,6.0,0.0,63.0,39.0,28.0,17.0,10.0,3.0,3.0,6.0,2.0,60.0,39.0,27.0,9.0,9.0,4.0,6.0,1.0,5585.0,3.0,3190.0,2381.320272,2208.167250,2272.937626,2181.809687,1.0,3.0,1.308,3.750,0.741400,0.258600,6.0,6.0,,,,3.0,4.0,,,,0.0,,,3.0,-3.0,2.0,-2.0,,,,,,,2.0,0.0,0.666667,0.300000,69.0,54.0,12.0,7.0,1,,,
517,605.0,Tour Finals,Hard,8.0,F,2010-11-21,11.0,104925,,,Novak Djokovic,R,188.0,SRB,23.5,104745,,,Rafael Nadal,L,185.0,ESP,24.4,7-5 6-2,3,RR,112.0,7.0,1.0,61.0,39.0,25.0,10.0,10.0,4.0,8.0,6.0,0.0,62.0,33.0,24.0,18.0,10.0,4.0,5.0,3.0,5635.0,1.0,11450.0,2290.106756,2389.314068,2236.932066,2211.566289,1.0,3.0,1.900,2.090,0.530457,0.469543,5.0,2.0,,,,7.0,6.0,,,,0.0,,,-2.0,2.0,-4.0,4.0,,,,,,,1.0,4.0,0.500000,0.800000,55.0,68.0,7.0,13.0,0,605.0,0605,
518,605.0,Tour Finals,Hard,8.0,F,2012-11-05,505.0,104925,1.0,,Novak Djokovic,R,188.0,SRB,25.4,104918,3.0,,Andy Murray,R,191.0,GBR,25.4,4-6 6-3 7-5,3,RR,155.0,4.0,0.0,94.0,57.0,45.0,20.0,16.0,5.0,7.0,9.0,1.0,88.0,52.0,37.0,21.0,15.0,4.0,7.0,1.0,11420.0,3.0,7600.0,2445.684880,2363.299833,2365.354342,2270.077867,1.0,3.0,1.830,2.100,0.533163,0.466837,4.0,6.0,7.0,,,6.0,3.0,5.0,,,0.0,,,-2.0,2.0,3.0,-3.0,2.0,-2.0,,,,,3.0,2.0,0.466667,0.437500,95.0,87.0,17.0,14.0,1,605.0,0605,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27486,433,Las Vegas,Hard,32.0,A,2007-02-26,524.0,103852,,,Feliciano Lopez,L,188.0,ESP,25.4,103917,,,Nicolas Mahut,R,190.0,FRA,25.0,3-6 6-3 7-6(3),3,RR,117.0,5.0,5.0,100.0,60.0,42.0,26.0,15.0,4.0,5.0,4.0,7.0,97.0,51.0,38.0,29.0,15.0,5.0,6.0,96.0,446.0,78.0,500.0,1856.404596,1799.806263,1785.952945,1733.855752,96.0,78.0,1.613,2.530,0.610669,0.389331,3.0,6.0,7.0,,,6.0,3.0,6.0,,,1.0,1.0,0.0,-3.0,3.0,3.0,-3.0,1.0,-1.0,,,,,1.0,1.0,0.400000,0.333333,98.0,99.0,16.0,15.0,1,,,
27487,433,Las Vegas,Hard,32.0,A,2007-02-26,530.0,103401,,,Jan Hernych,R,190.0,CZE,27.6,103794,8.0,,Benjamin Becker,R,178.0,GER,25.6,6-3 7-5,3,RR,62.0,0.0,1.0,56.0,39.0,29.0,9.0,11.0,0.0,2.0,1.0,7.0,72.0,43.0,25.0,13.0,10.0,6.0,10.0,84.0,483.0,40.0,798.0,1779.192414,1857.512198,1709.387318,1816.126701,84.0,40.0,3.190,1.480,0.307726,0.692274,6.0,7.0,,,,3.0,5.0,,,,0.0,,,3.0,-3.0,2.0,-2.0,,,,,,,4.0,2.0,1.000000,0.181818,72.0,56.0,13.0,8.0,1,,,
27490,433,Las Vegas,Hard,32.0,A,2007-02-26,531.0,103794,8.0,,Benjamin Becker,R,178.0,GER,25.6,102856,,WC,Gustavo Kuerten,R,190.0,BRA,30.4,6-4 6-3,3,RR,65.0,9.0,4.0,63.0,32.0,27.0,17.0,10.0,1.0,1.0,4.0,1.0,55.0,30.0,22.0,14.0,9.0,4.0,6.0,40.0,798.0,804.0,16.0,1842.421304,1869.501371,1797.790616,1912.660815,40.0,804.0,1.700,2.370,0.585185,0.414815,6.0,6.0,,,,4.0,3.0,,,,0.0,,,2.0,-2.0,3.0,-3.0,,,,,,,2.0,0.0,0.666667,0.100000,63.0,55.0,12.0,7.0,1,,,
27491,433,Las Vegas,Hard,32.0,A,2007-02-26,532.0,103401,,,Jan Hernych,R,190.0,CZE,27.6,102856,,WC,Gustavo Kuerten,R,190.0,BRA,30.4,6-4 6-4,3,RR,76.0,5.0,4.0,68.0,43.0,34.0,9.0,10.0,7.0,9.0,6.0,2.0,72.0,47.0,33.0,7.0,10.0,9.0,13.0,84.0,483.0,804.0,16.0,1799.347654,1856.603383,1740.778384,1898.250328,84.0,804.0,1.510,2.850,0.650463,0.349537,6.0,6.0,,,,4.0,4.0,,,,0.0,,,2.0,-2.0,2.0,-2.0,,,,,,,4.0,2.0,1.300000,0.900000,75.0,65.0,12.0,8.0,1,,,
