# Tennis Machine Learning Model: 
Training a model on tennis matches to predict future ones.

## Preprocess data

Read data and set dates to datetime format.

In [157]:
import pandas as pd
from glob import glob

# adjust path or filenames to where you saved the CSVs
files = sorted(glob("MatchCSVs/atp_matches_*.csv"))[:5] # Finds all files matching the pattern, sorts them, and takes the first 2 as an example
dfs = [pd.read_csv(f) for f in files] # Read each CSV file into a DataFrame
df = pd.concat(dfs, ignore_index=True) # Concatenate all DataFrames into one, resetting the index

# Quick peek
print("Rows:", len(df))
print("Columns:", df.columns.tolist())
display(df.head())

# Make sure dates are in datetime format (if not already)
if 'tourney_date' in df.columns:
    df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d', errors='coerce')
else:
    # fallback
    df['tourney_date'] = pd.to_datetime(df['date'], errors='coerce')

print(df['tourney_date'].min(), "->", df['tourney_date'].max())

Rows: 13174
Columns: ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level', 'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry', 'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points']


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2020-8888,Atp Cup,Hard,24,A,20200106,300,104925,,,Novak Djokovic,R,188.0,SRB,32.6,104745,,,Rafael Nadal,L,185.0,ESP,33.5,6-2 7-6(4),3,F,115.0,12.0,2.0,64.0,49.0,41.0,9.0,10.0,5.0,5.0,5.0,3.0,70.0,51.0,39.0,6.0,10.0,6.0,8.0,2.0,9055.0,1.0,9985.0
1,2020-8888,Atp Cup,Hard,24,A,20200106,299,105138,,,Roberto Bautista Agut,R,183.0,ESP,31.7,105583,,,Dusan Lajovic,R,183.0,SRB,29.5,7-5 6-1,3,F,97.0,2.0,1.0,59.0,44.0,29.0,10.0,10.0,3.0,5.0,2.0,1.0,57.0,35.0,21.0,6.0,9.0,5.0,10.0,10.0,2335.0,34.0,1251.0
2,2020-8888,Atp Cup,Hard,24,A,20200106,298,104925,,,Novak Djokovic,R,188.0,SRB,32.6,106421,,,Daniil Medvedev,R,198.0,RUS,23.9,6-1 5-7 6-4,3,SF,167.0,4.0,5.0,111.0,75.0,53.0,16.0,15.0,8.0,11.0,6.0,5.0,108.0,57.0,35.0,25.0,14.0,6.0,11.0,2.0,9055.0,5.0,5705.0
3,2020-8888,Atp Cup,Hard,24,A,20200106,297,105583,,,Dusan Lajovic,R,183.0,SRB,29.5,111575,,,Karen Khachanov,R,198.0,RUS,23.6,7-5 7-6(1),3,SF,108.0,1.0,1.0,67.0,48.0,38.0,14.0,12.0,0.0,0.0,9.0,3.0,79.0,54.0,39.0,14.0,12.0,0.0,1.0,34.0,1251.0,17.0,1840.0
4,2020-8888,Atp Cup,Hard,24,A,20200106,296,104745,,,Rafael Nadal,L,185.0,ESP,33.5,200282,,,Alex De Minaur,R,183.0,AUS,20.8,4-6 7-5 6-1,3,SF,133.0,5.0,3.0,84.0,61.0,48.0,10.0,15.0,1.0,3.0,6.0,1.0,75.0,55.0,37.0,10.0,14.0,1.0,5.0,1.0,9985.0,18.0,1775.0


2020-01-06 00:00:00 -> 2024-12-18 00:00:00


Drop walkover matches, rows with missing data, and sort chronologically.

In [158]:
# drop problem matches (RET, walkover)
if 'score' in df.columns:
    df = df[~df['score'].str.contains('RET|Walkover|WO|ret', case=False, na=False)] # Remove rows with RET, Walkover, or similar in the score column

# drop rows with missing key data
df = df.dropna(subset=['winner_id','loser_id','tourney_date'])

# sort chronologically (important for time-based features)
df = df.sort_values('tourney_date').reset_index(drop=True)

Create a player-event table for rolling stats (i.e. gather all information before a certain match to use in training)
Build a long "events" table where each match produces two player-events, then compute rolling aggregates shifted so they use only past matches.

In [159]:
events = []
for idx, r in df.iterrows():
    # Winner event
    d = r['tourney_date']
    s = r.get('surface', None)
    events.append({
        'player_name': r['winner_name'],
        'player_id': r['winner_id'],
        'opponent_id': r['loser_id'],
        'date': d,
        'surface': s,
        'is_win': 1,
        'match_id': idx
        # add other stats as needed, e.g. aces, double faults, etc.
    })
    # Loser event
    events.append({
        'player_name': r['loser_name'],
        'player_id': r['loser_id'],
        'opponent_id': r['winner_id'],
        'date': d,
        'surface': s,
        'is_win': 0,
        'match_id': idx
        # add other stats as needed, e.g. aces, double faults, etc.
    })

events_df = pd.DataFrame(events)

events_df = events_df.sort_values('match_id').reset_index(drop=True) 

display(events_df.head())

events_df = events_df.sort_values(['player_id', 'date']).reset_index(drop=True) 

display(events_df.head())


Unnamed: 0,player_name,player_id,opponent_id,date,surface,is_win,match_id
0,Novak Djokovic,104925,104745,2020-01-06,Hard,1,0
1,Rafael Nadal,104745,104925,2020-01-06,Hard,0,0
2,John Millman,105357,133018,2020-01-06,Hard,1,1
3,Michail Pervolarakis,133018,105357,2020-01-06,Hard,0,1
4,Nick Kyrgios,106401,126774,2020-01-06,Hard,1,2


Unnamed: 0,player_name,player_id,opponent_id,date,surface,is_win,match_id
0,Alexander Zverev,100644,200282,2020-01-06,Hard,0,6
1,Alexander Zverev,100644,133430,2020-01-06,Hard,0,8
2,Alexander Zverev,100644,126774,2020-01-06,Hard,0,10
3,Alexander Zverev,100644,106078,2020-01-20,Hard,1,195
4,Alexander Zverev,100644,106233,2020-01-20,Hard,0,202


Functions to collect data leading up to a match, including:
- Number of wins in previous n matches
- Total matches played
- Total wins
- Total losses

- Total matches on surface
- Total wins on surface
- Total losses on surface

In [160]:
# wins in last n matches before current match
def wins_last_n(events_df, n=5, colname=None):
    if colname is None:
        colname = f'wins_last_{n}'
    # Group by player_id and calculate rolling sum of wins
    events_df[colname] = (events_df.groupby('player_id')['is_win'] # Group by player_id
                          .rolling(window=n, min_periods=1) # Rolling window of size n (i.e. look at last n matches)
                          .sum() # Sum wins in the window
                          .shift(1) # Shift to use only past matches
                          .reset_index(level=0, drop=True) # Reset index to align with original DataFrame
                          )
    return events_df

# total matches played before current match
def total_matches_before(events_df, colname=None):
    if colname is None:
        colname = 'total_matches_before'
    # Group by player_id and calculate rolling count of matches
    events_df[colname] = events_df.groupby('player_id')['match_id'].cumcount()
    return events_df

# total wins before current match
def total_wins_before(events_df, colname=None):
    if colname is None:
        colname = 'total_wins_before'
    # Group by player_id and calculate cumulative sum of wins
    events_df[colname] = (events_df.groupby('player_id')['is_win']
                          .cumsum()
                          .shift(2)  # Shift to use only past matches
                          .fillna(0)  # Fill NaN with 0 for players with no previous wins
                          .astype(int))  # Convert to integer
    return events_df

# total losses before current match
def total_losses_before(events_df, colname=None):
    if colname is None:
        colname = 'total_losses_before'
    # Group by player_id and calculate cumulative sum of losses
    events_df[colname] = (events_df.groupby('player_id')['is_win']
                          .transform(lambda x: (x == 0).cumsum().shift(1))  # Count losses and shift to use only past matches
                          .fillna(0)  # Fill NaN with 0 for players with no previous losses
                          .astype(int))
    return events_df

# total matches on same surface before current match
'''
# ...existing code...
def total_matches_on_surface(events_df, colname=None):
    if colname is None:
        colname = 'total_matches_on_surface'

    # Group by (player_id, surface) and count prior occurrences
    events_df['surface_filled'] = events_df['surface'].fillna('Unknown')
    events_df[colname] = (
        events_df.groupby(['player_id', 'surface_filled']).cumcount()
    ).astype(int)
    events_df.drop('surface_filled', axis=1, inplace=True)

    return events_df
# ...existing code...
'''

def total_matches_on_surface(events_df, colname=None):
    if colname is None:
        colname = 'total_matches_on_surface'
    
    # Create a temporary column to help with grouping
    events_df['player_surface'] = events_df['player_id'].astype(str) + '_' + events_df['surface'].fillna('Unknown')

   
    # Group by player_id and surface, calculate cumulative count of matches
    events_df[colname] = (events_df.groupby('player_surface')['match_id']
                            .cumcount()
                            #.shift(1)  # Shift to use only past matches
                            .fillna(0)  # Fill NaN with 0 for players with no previous matches on that surface
                            .astype(int))
    
    # Drop the temporary column
    events_df.drop('player_surface', axis=1, inplace=True)
    
    return events_df

# wins on same surface before current match
def wins_on_surface(events_df, colname=None):
    if colname is None:
        colname = 'wins_on_surface'
    
    # Create a temporary column to help with grouping
    events_df['player_surface'] = events_df['player_id'].astype(str) + '_' + events_df['surface'].fillna('Unknown')
    
    # Group by player_id and surface, calculate cumulative sum of wins
    events_df[colname] = (events_df.groupby('player_surface')['is_win']
                            .cumsum()
                            #.shift(1)
                            .fillna(0)
                            .astype(int))
    
    # Drop the temporary column
    events_df.drop('player_surface', axis=1, inplace=True)
    
    return events_df

# losses on same surface before current match
def losses_on_surface(events_df, colname=None):
    if colname is None:
        colname = 'losses_on_surface'
    
    # Create a temporary column to help with grouping
    events_df['player_surface'] = events_df['player_id'].astype(str) + '_' + events_df['surface'].fillna('Unknown')
    
    # Group by player_id and surface, calculate cumulative sum of losses
    events_df[colname] = (events_df.groupby('player_surface')['is_win']
                            .transform(lambda x: (x == 0).cumsum().shift(1))
                            .fillna(0)
                            .astype(int))
    
    # Drop the temporary column
    events_df.drop('player_surface', axis=1, inplace=True)
    
    return events_df

Run all the functions to update events_df DataFrame

In [161]:
events_df = wins_last_n(events_df, n=5)
events_df = total_matches_before(events_df)
events_df = total_wins_before(events_df)
events_df = total_losses_before(events_df)
events_df = total_matches_on_surface(events_df)
events_df = wins_on_surface(events_df)
events_df = losses_on_surface(events_df)

Merge pre-match rolling columns from events_df back into original df

In [162]:
# Ensure df has a stable match_id that matches what was used when building events (idx in iterrows)
df['match_id'] = df.index
pd.set_option('display.max_columns', None)

# List of feature columns created in events_df (adjust if you add more functions)
feature_cols = [
    'wins_last_5',
    'total_matches_before',
    'total_wins_before',
    'total_losses_before',
    'total_matches_on_surface',
    'wins_on_surface',
    'losses_on_surface'
]

# Subset DF with only what we need
ev = events_df[['match_id', 'player_id', 'is_win'] + feature_cols]

# Split ev into winners and losers (ev contains both winner and loser rows for each match)
w = ev[ev.is_win == 1].copy() # Boolean mask for winners
l = ev[ev.is_win == 0].copy() # Boolean mask for lasers

# Rename feature columns with side prefixes
w.rename(columns={c: f"winner_{c}" for c in feature_cols}, inplace=True)
l.rename(columns={c: f"loser_{c}" for c in feature_cols}, inplace=True)

# Rename player_id columns wiht side prefixes
w.rename(columns={'player_id': 'winner_player_id'}, inplace=True)
l.rename(columns={'player_id': 'loser_player_id'}, inplace=True)

# Drop helper flag
w.drop(columns=['is_win'], inplace=True)
l.drop(columns=['is_win'], inplace=True)

# Merge onto match-level df
df_with_stats = df.merge(w, on='match_id', how='left')
df_with_stats = df_with_stats.merge(l, on='match_id', how='left')

# Sanity check: winner_player_id should equal winner_id etc.
assert (df_with_stats['winner_player_id'] == df['winner_id']).all()
assert (df_with_stats['loser_player_id'] == df['loser_id']).all()


# Show Djokovic only (104925)
pid = 106401
hist = (events_df[events_df['player_id'] == pid]
        .sort_values('match_id')
        [['match_id','player_name', 'is_win'] + feature_cols])
hist = hist.assign(is_loss=(hist.is_win == 0).astype(int),
                   cum_losses=hist['is_win'].eq(0).cumsum())
display(hist.head())


# Create difference features (winner minus loser) – often useful for modeling
for c in feature_cols:
    df_with_stats[f'diff_{c}'] = df_with_stats[f'winner_{c}'] - df_with_stats[f'loser_{c}']
df_with_stats.sort_values('match_id').reset_index(drop=True)
display(df_with_stats.head(20))

Unnamed: 0,match_id,player_name,is_win,wins_last_5,total_matches_before,total_wins_before,total_losses_before,total_matches_on_surface,wins_on_surface,losses_on_surface,is_loss,cum_losses
9155,2,Nick Kyrgios,1,1.0,0,23,0,0,1,0,0,0
9156,5,Nick Kyrgios,1,1.0,1,23,0,1,2,0,0,0
9157,72,Nick Kyrgios,1,2.0,2,1,0,2,3,0,0,0
9158,78,Nick Kyrgios,0,3.0,3,2,0,3,3,0,1,1
9159,193,Nick Kyrgios,1,3.0,4,3,1,4,4,1,0,1


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,match_id,winner_player_id,winner_wins_last_5,winner_total_matches_before,winner_total_wins_before,winner_total_losses_before,winner_total_matches_on_surface,winner_wins_on_surface,winner_losses_on_surface,loser_player_id,loser_wins_last_5,loser_total_matches_before,loser_total_wins_before,loser_total_losses_before,loser_total_matches_on_surface,loser_wins_on_surface,loser_losses_on_surface,diff_wins_last_5,diff_total_matches_before,diff_total_wins_before,diff_total_losses_before,diff_total_matches_on_surface,diff_wins_on_surface,diff_losses_on_surface
0,2020-8888,Atp Cup,Hard,24,A,2020-01-06,300,104925,,,Novak Djokovic,R,188.0,SRB,32.6,104745,,,Rafael Nadal,L,185.0,ESP,33.5,6-2 7-6(4),3,F,115.0,12.0,2.0,64.0,49.0,41.0,9.0,10.0,5.0,5.0,5.0,3.0,70.0,51.0,39.0,6.0,10.0,6.0,8.0,2.0,9055.0,1.0,9985.0,0,104925,0.0,0,0,0,0,1,0,104745,2.0,0,25,0,0,0,0,-2.0,0,-25,0,0,1,0
1,2020-8888,Atp Cup,Hard,24,A,2020-01-06,217,105357,,,John Millman,R,183.0,AUS,30.5,133018,,,Michail Pervolarakis,R,193.0,GRE,23.5,4-6 6-1 7-6(1),3,RR,147.0,12.0,3.0,86.0,56.0,44.0,21.0,15.0,4.0,5.0,1.0,5.0,98.0,49.0,38.0,23.0,14.0,2.0,4.0,48.0,1026.0,486.0,62.0,1,105357,0.0,0,0,0,0,1,0,133018,0.0,0,38,0,0,0,0,0.0,0,-38,0,0,1,0
2,2020-8888,Atp Cup,Hard,24,A,2020-01-06,218,106401,,,Nick Kyrgios,R,193.0,AUS,24.6,126774,,,Stefanos Tsitsipas,R,193.0,GRE,21.4,7-6(7) 6-7(3) 7-6(5),3,RR,154.0,25.0,3.0,113.0,83.0,67.0,20.0,18.0,2.0,2.0,18.0,2.0,123.0,85.0,75.0,15.0,18.0,1.0,1.0,29.0,1375.0,6.0,5300.0,2,106401,1.0,0,23,0,0,1,0,126774,3.0,0,2,0,0,0,0,-2.0,0,21,0,0,1,0
3,2020-8888,Atp Cup,Hard,24,A,2020-01-06,219,200000,,,Felix Auger Aliassime,R,193.0,CAN,19.4,133018,,,Michail Pervolarakis,R,193.0,GRE,23.5,6-1 6-3,3,RR,69.0,5.0,4.0,42.0,26.0,22.0,10.0,8.0,0.0,0.0,0.0,1.0,46.0,18.0,12.0,12.0,8.0,6.0,10.0,21.0,1636.0,486.0,62.0,3,200000,1.0,0,1,0,0,1,0,133018,0.0,1,0,1,1,0,1,1.0,-1,1,-1,-1,1,-1
4,2020-8888,Atp Cup,Hard,24,A,2020-01-06,220,133430,,,Denis Shapovalov,L,185.0,CAN,20.7,126774,,,Stefanos Tsitsipas,R,193.0,GRE,21.4,7-6(6) 7-6(4),3,RR,123.0,12.0,4.0,92.0,61.0,48.0,17.0,12.0,4.0,4.0,3.0,3.0,75.0,47.0,40.0,17.0,12.0,1.0,1.0,14.0,2050.0,6.0,5300.0,4,133430,2.0,0,14,0,0,1,0,126774,0.0,1,3,1,1,0,1,2.0,-1,11,-1,-1,1,-1
5,2020-8888,Atp Cup,Hard,24,A,2020-01-06,221,106401,,,Nick Kyrgios,R,193.0,AUS,24.6,105526,,,Jan Lennard Struff,R,193.0,GER,29.7,6-4 7-6(4),3,RR,72.0,20.0,1.0,63.0,46.0,40.0,11.0,11.0,0.0,0.0,15.0,3.0,57.0,38.0,36.0,10.0,11.0,0.0,1.0,29.0,1375.0,35.0,1245.0,5,106401,1.0,1,23,0,1,2,0,105526,0.0,0,1,0,0,0,0,1.0,1,22,0,1,2,0
6,2020-8888,Atp Cup,Hard,24,A,2020-01-06,222,200282,,,Alex De Minaur,R,183.0,AUS,20.8,100644,,,Alexander Zverev,R,198.0,GER,22.7,4-6 7-6(3) 6-2,3,RR,164.0,2.0,0.0,101.0,77.0,52.0,14.0,15.0,7.0,10.0,10.0,14.0,106.0,69.0,52.0,12.0,15.0,8.0,12.0,18.0,1775.0,7.0,3345.0,6,200282,1.0,0,2,0,0,1,0,100644,,0,0,0,0,0,0,,0,2,0,0,1,0
7,2020-8888,Atp Cup,Hard,24,A,2020-01-06,223,105526,,,Jan Lennard Struff,R,193.0,GER,29.7,200000,,,Felix Auger Aliassime,R,193.0,CAN,19.4,6-1 6-4,3,RR,74.0,6.0,2.0,48.0,26.0,22.0,16.0,9.0,1.0,1.0,7.0,2.0,53.0,36.0,23.0,8.0,8.0,4.0,7.0,35.0,1245.0,21.0,1636.0,7,105526,0.0,1,0,1,1,1,1,200000,1.0,1,1,0,1,1,0,-1.0,0,-1,1,0,0,1
8,2020-8888,Atp Cup,Hard,24,A,2020-01-06,224,133430,,,Denis Shapovalov,L,185.0,CAN,20.7,100644,,,Alexander Zverev,R,198.0,GER,22.7,6-2 6-2,3,RR,70.0,12.0,5.0,52.0,33.0,30.0,8.0,8.0,2.0,2.0,2.0,7.0,52.0,28.0,19.0,7.0,8.0,1.0,5.0,14.0,2050.0,7.0,3345.0,8,133430,1.0,1,14,0,1,2,0,100644,0.0,1,0,1,1,0,1,1.0,0,14,-1,0,2,-1
9,2020-8888,Atp Cup,Hard,24,A,2020-01-06,225,105526,,,Jan Lennard Struff,R,193.0,GER,29.7,133018,,,Michail Pervolarakis,R,193.0,GRE,23.5,6-4 6-1,3,RR,69.0,8.0,2.0,55.0,37.0,27.0,10.0,9.0,2.0,3.0,3.0,4.0,49.0,24.0,11.0,14.0,8.0,4.0,8.0,35.0,1245.0,486.0,62.0,9,105526,1.0,2,0,1,2,2,1,133018,0.0,2,0,2,2,0,2,1.0,0,0,-1,0,2,-1
