# Data Preprocessing
## Objectives
- label encoding - convert to numerical categories
- symmetry handling - ensure the dataset contains as many rows where player A wins as rows where player B wins - this is achieved by duplicating each match row with the A and B metrics swapped (and the target flipped).
- prevent data leakage - avoid using future statistics as features
## Key derived metrics
- Elo rating - relative skill level calculation
- breakpoint conversion - ability to capitalise on high leverage
- service points win % - ability to consistently win when provided an advantage
- return points win % - represents defensive ability
- head-to-head - 'past track record'
- surface-specific win % - different surfaces play differently - filtered specifically for the current match surface
- win % (over last 30 days) - useful for capturing momentum/tiredness

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [36]:

# Read one CSV per season (2020 through 2024) and stack them into a single frame.
season_frames = [
    pd.read_csv(f'../tennis_atp/atp_matches_{year}.csv')
    for year in range(2020, 2025)
]
matches = pd.concat(season_frames)

# Descriptive column names, assigned by position.
# NOTE(review): assumes every season file has exactly these 49 columns in this order - confirm.

# Per-player descriptive fields; present once for the winner and once for the loser.
player_fields = [
    'id', 'seed', 'entry_type', 'name',
    'hand', 'height_cm', 'country_code', 'age',
]

# Per-player serve statistics for the match.
serve_fields = [
    'aces', 'double_faults', 'service_points_played',
    'first_serves_in', 'first_serve_points_won', 'second_serve_points_won',
    'service_games_played', 'break_points_saved', 'break_points_faced',
]

matches.columns = (
    [
        'tournament_id', 'tournament_name', 'surface', 'draw_size',
        'tournament_level', 'tournament_date', 'match_number',
    ]
    + [f'winner_{field}' for field in player_fields]
    + [f'loser_{field}' for field in player_fields]
    + [
        'score',                   # Final match score
        'best_of_sets',            # Best of how many sets (usually 3/4/5)
        'round',                   # Which round of the tournament? (F, QF, Q1, Q2, R16, R32)
        'match_duration_minutes',  # How long was the match?
    ]
    + [f'winner_{field}' for field in serve_fields]
    + [f'loser_{field}' for field in serve_fields]
    # Rank and rank points
    + [f'{side}_{field}' for side in ('winner', 'loser') for field in ('rank', 'rank_points')]
)

# Dates are parsed with the compact YYYYMMDD format.
raw_dates = matches['tournament_date']
matches['tournament_date'] = pd.to_datetime(raw_dates, format='%Y%m%d')

In [37]:
# A match is identified by its tournament id plus its match number within that tournament.
match_number_text = matches['match_number'].astype(str)
matches['unique_match_id'] = matches['tournament_id'] + '_' + match_number_text

# Break-point conversion: the share of the opponent's break points faced that were NOT saved.
# It is built from the *other* player's serve stats, so it has to be computed here,
# while both players of a match still sit on the same row.
for converter, server in (('winner', 'loser'), ('loser', 'winner')):
    faced = matches[f'{server}_break_points_faced']
    saved = matches[f'{server}_break_points_saved']
    matches[f'{converter}_bp_conversion'] = (faced - saved) / faced

In [38]:
# Elo statistics - this assumes that matches are sorted by date
def calculate_elo_features(matches, k_factor):
    """Add each player's pre-match Elo rating to ``matches`` in place.

    Rows are processed in their existing order and every rating update
    depends on the ones before it, so ``matches`` must already be sorted
    chronologically by the caller.

    Args:
        matches: DataFrame with ``winner_id`` and ``loser_id`` columns.
        k_factor: Maximum rating change a single match can cause.

    Returns:
        None. Two columns are added to ``matches``: ``winner_elo`` and
        ``loser_elo``, holding the ratings *before* the match was played
        (so the match's own result never leaks into its features).
    """
    # Rating given to a player who has not been seen before
    START_ELO = 1500

    # Latest known rating per player id
    current_elos = {}

    # Ratings as they stood before each match
    winner_elos = []
    loser_elos = []

    # Only the two id columns are needed, so iterate over them directly
    # instead of materialising a full Series per row with iterrows().
    for winner_id, loser_id in zip(matches['winner_id'], matches['loser_id']):
        winner_elo = current_elos.get(winner_id, START_ELO)
        loser_elo = current_elos.get(loser_id, START_ELO)
        winner_elos.append(winner_elo)
        loser_elos.append(loser_elo)

        # Sigmoidal (S-shaped) probability that the eventual winner would win,
        # based on the rating gap
        win_prob = 1 / (1 + 10 ** ((loser_elo - winner_elo) / 400))

        # Scale the k-factor (the maximum possible rating change) by how
        # unexpected the result was: the less likely the win, the bigger the change
        rating_change = k_factor * (1 - win_prob)

        # Whatever the winner gains, the loser gives up
        current_elos[winner_id] = winner_elo + rating_change
        current_elos[loser_id] = loser_elo - rating_change

    matches['winner_elo'] = winner_elos
    matches['loser_elo'] = loser_elos

# Elo updates are order-dependent: put the matches in date order (then match-number
# order within a date) before rating them, so no match is rated using later results.
matches = matches.sort_values(by=['tournament_date', 'match_number'])
calculate_elo_features(matches, k_factor=40)

In [39]:
# The 'wide data format' (one row per match) can be converted to the
# 'long data format', where each player's performance in each match is a separate row.
# This allows for efficient computation of rolling averages.
def convert_matches_to_long_data(matches):
    """Reshape per-match rows into per-player rows (two rows per match).

    Every column whose name contains 'winner' (resp. 'loser') is copied with
    the 'winner_' (resp. 'loser_') prefix stripped, the match-level columns
    are appended, and an ``outcome`` column is set to 1 for the winner's row
    and 0 for the loser's row.

    Args:
        matches: Wide DataFrame with 'winner_*' / 'loser_*' columns plus
            'unique_match_id', 'tournament_date', 'surface',
            'tournament_level' and 'best_of_sets'.

    Returns:
        The long DataFrame ordered by tournament date, then tournament, then
        match number, with the winner's row ahead of the loser's row.
    """
    neutral_columns = ['unique_match_id', 'tournament_date', 'surface', 'tournament_level', 'best_of_sets']

    def _player_rows(role, outcome):
        # One row per match for the given role, with the role prefix removed.
        role_columns = [column for column in matches.columns if role in column] + neutral_columns
        rows = matches[role_columns].copy()
        rows.columns = [column.replace(f'{role}_', '') for column in rows.columns]
        rows['outcome'] = outcome
        return rows

    long_data = pd.concat([_player_rows('winner', 1), _player_rows('loser', 0)])

    # unique_match_id is '<tournament id>_<match number>'. Sorting on that string
    # would put match 10 ahead of match 9 inside a tournament, so rolling features
    # built with shift(1) could see a player's later match first. Order by the
    # numeric match number instead; the full id stays as a final tie-breaker for
    # ids whose suffix is not numeric.
    id_parts = long_data['unique_match_id'].astype(str).str.rsplit('_', n=1)
    sort_keys = pd.DataFrame({
        'date': long_data['tournament_date'].to_numpy(),
        'tournament': id_parts.str[0].to_numpy(),
        'match_number': pd.to_numeric(id_parts.str[-1], errors='coerce').to_numpy(),
        'match_id': long_data['unique_match_id'].to_numpy(),
    })
    # sort_keys has a fresh 0..n-1 index, so the sorted index is a positional order.
    order = sort_keys.sort_values(['date', 'tournament', 'match_number', 'match_id']).index

    return long_data.iloc[order]

# Generate long data: one row per player per match (winner rows have outcome 1, loser rows 0)
long_data = convert_matches_to_long_data(matches)

In [40]:
# Share of a player's own service points that they won in this match
# (first-serve points won plus second-serve points won, over service points played).
serve_points_won = long_data['first_serve_points_won'] + long_data['second_serve_points_won']
long_data['serve_win_ratio'] = serve_points_won / long_data['service_points_played']

In [41]:
# Rolling statistics.
# Each series is shifted by one row before averaging, so the value stored on a row
# is built only from rows that come before it within the same group.

# Per player: mean serve-win ratio over the previous 20 rows (at least 5 needed).
serve_ratio_by_player = long_data.groupby('id')['serve_win_ratio']
long_data['rolling_serve_win_ratio'] = serve_ratio_by_player.transform(
    lambda history: history.shift(1).rolling(20, min_periods=5).mean()
)

# Per player and surface: win rate over the previous 20 rows (a single one is enough).
outcome_by_player_surface = long_data.groupby(['id', 'surface'])['outcome']
long_data['surface_win_ratio'] = outcome_by_player_surface.transform(
    lambda history: history.shift(1).rolling(20, min_periods=1).mean()
)



In [42]:
# Numeric columns whose missing values are replaced by that column's mean.
# NOTE(review): the mean is taken over the whole dataset, later matches included -
# confirm this is acceptable given the "prevent data leakage" objective.
numeric_nan_stats = [
    'age',
    'aces',
    'double_faults',
    'service_points_played',
    'first_serves_in',
    'first_serve_points_won',
    'second_serve_points_won',
    'service_games_played',
    'break_points_saved',
    'break_points_faced',
    'rank',
    'rank_points',
    'bp_conversion',
    'serve_win_ratio',
    'rolling_serve_win_ratio',
    'surface_win_ratio',
    'height_cm',
]
column_means = long_data[numeric_nan_stats].mean()
long_data[numeric_nan_stats] = long_data[numeric_nan_stats].fillna(column_means)

# Seed: the seed number itself is not used, only whether the player has one.
# notna().astype(int) gives 1 for seeded and 0 for unseeded directly, with an integer
# dtype, rather than recoding an inverted boolean through replace().
long_data['has_seed'] = long_data['seed'].notna().astype(int)
long_data = long_data.drop('seed', axis=1)

In [43]:
long_data

Unnamed: 0,id,entry_type,name,hand,height_cm,country_code,age,aces,double_faults,service_points_played,...,unique_match_id,tournament_date,surface,tournament_level,best_of_sets,outcome,serve_win_ratio,rolling_serve_win_ratio,surface_win_ratio,has_seed
112,105062,,Mikhail Kukushkin,R,183.0,KAZ,32.0,7.0,0.0,51.0,...,2020-0451_271,2020-01-06,Hard,A,3,1,0.666667,0.641031,0.528298,0
112,104291,WC,Malek Jaziri,R,185.0,TUN,35.9,3.0,2.0,44.0,...,2020-0451_271,2020-01-06,Hard,A,3,0,0.431818,0.641031,0.528298,0
111,105732,,Pierre Hugues Herbert,R,188.0,FRA,28.8,6.0,2.0,51.0,...,2020-0451_272,2020-01-06,Hard,A,3,1,0.745098,0.641031,0.528298,0
111,106065,WC,Marco Cecchinato,R,185.0,ITA,27.2,2.0,1.0,59.0,...,2020-0451_272,2020-01-06,Hard,A,3,0,0.559322,0.641031,0.528298,0
110,111513,,Laslo Djere,R,188.0,SRB,24.5,5.0,2.0,73.0,...,2020-0451_273,2020-01-06,Hard,A,3,1,0.643836,0.641031,0.528298,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2803,210506,,Alex Michelsen,R,193.0,USA,20.3,7.0,0.0,73.0,...,2024-7696_398,2024-12-18,Hard,F,5,0,0.616438,0.640365,0.650000,1
2802,211663,,Joao Fonseca,R,185.0,BRA,18.3,6.0,0.0,38.0,...,2024-7696_399,2024-12-18,Hard,F,5,1,0.842105,0.642088,0.833333,1
2802,209414,,Luca Van Assche,R,178.0,FRA,20.5,7.0,2.0,51.0,...,2024-7696_399,2024-12-18,Hard,F,5,0,0.509804,0.604532,0.400000,1
2801,211663,,Joao Fonseca,R,185.0,BRA,18.3,8.0,0.0,63.0,...,2024-7696_400,2024-12-18,Hard,F,5,1,0.777778,0.652615,0.857143,1


## Label Encoding

In [44]:

# Map each entry-type code to an integer; a missing entry type maps to 0.
# Codes that are not listed here come out of .map() as NaN.
entry_rank_map = {
    np.nan: 0,
    'SE': 1,     # Special Exempt (High Momentum)
    'PR': 2,     # Protected Ranking
    'UP': 2,     # Used Protection (similar to PR)
    'Q': 3,      # Qualifier
    'WC': 4,     # Wild Card
    'LL': 5,     # Lucky Loser
    'Alt': 6,    # Alternate
    'ALT': 6,    # Alternate (capitalisation variant)
    'ITF': 6,    # ITF World Tennis Tour entry
    'W': 6       # Winner placeholder/lower-tier entry
}
long_data['entry_type'] = long_data['entry_type'].map(entry_rank_map)

# Handedness becomes a binary flag: 1 for 'R', 0 for every other value (including missing).
long_data['hand'] = (long_data['hand'] == 'R').astype(int)

# Surface codes; a missing surface maps to 0.
surface_rank_map = {
    np.nan: 0,
    'Hard': 1,
    'Clay': 2,
    'Grass': 3
}
# The original statement ended in a stray line-continuation backslash, which only
# worked because a blank line followed it; it is removed here.
long_data['surface'] = long_data['surface'].map(surface_rank_map)

# Tournament-level codes as they appear in the data, mapped to integers.
tournament_level_rank_map = {
    'G': 0,
    'F': 1,
    'M': 2,
    'A': 3,
    'O': 4,
    'D': 5
}
long_data['tournament_level'] = long_data['tournament_level'].map(tournament_level_rank_map)

In [45]:
long_data.columns

Index(['id', 'entry_type', 'name', 'hand', 'height_cm', 'country_code', 'age',
       'aces', 'double_faults', 'service_points_played', 'first_serves_in',
       'first_serve_points_won', 'second_serve_points_won',
       'service_games_played', 'break_points_saved', 'break_points_faced',
       'rank', 'rank_points', 'bp_conversion', 'elo', 'unique_match_id',
       'tournament_date', 'surface', 'tournament_level', 'best_of_sets',
       'outcome', 'serve_win_ratio', 'rolling_serve_win_ratio',
       'surface_win_ratio', 'has_seed'],
      dtype='str')

In [None]:
def convert_long_to_symmetric_wide(long_data):
    """Turn per-player rows back into per-match rows, once in each orientation.

    Every match yields two output rows: one with the winner as player A
    (``target`` = 1) and one with the loser as player A (``target`` = 0).
    All winner-as-A rows come first, followed by all loser-as-A rows, and
    the index is reset to 0..n-1.

    Args:
        long_data: DataFrame with one row per player per match, holding the
            match-level columns, the per-player columns and ``outcome``
            (1 for the winner's row, 0 for the loser's row).

    Returns:
        DataFrame with the match-level columns, 'A_*' and 'B_*' copies of
        the per-player columns, and ``target``.
    """
    shared = ['unique_match_id', 'tournament_date', 'surface', 'tournament_level', 'best_of_sets']
    per_player = ['name', 'hand', 'height_cm', 'age', 'rank', 'rank_points', 'elo',
                  'rolling_serve_win_ratio', 'surface_win_ratio', 'has_seed']

    a_names = [f'A_{name}' for name in per_player]
    b_names = [f'B_{name}' for name in per_player]

    def _pair(side_a, side_b, label):
        # Join the two players of each match: side_a supplies the A_ columns
        # (plus the match-level ones), side_b supplies the B_ columns.
        left = side_a.rename(columns=dict(zip(per_player, a_names)))[shared + a_names]
        right = side_b.rename(columns=dict(zip(per_player, b_names)))[['unique_match_id'] + b_names]
        paired = left.merge(right, on='unique_match_id')
        paired['target'] = label
        return paired

    won = long_data[long_data['outcome'] == 1]
    lost = long_data[long_data['outcome'] == 0]

    both_orientations = [_pair(won, lost, 1), _pair(lost, won, 0)]
    return pd.concat(both_orientations, axis=0).reset_index(drop=True)

# Two rows per match: one with the winner as player A (target 1), one with the loser as player A (target 0)
wide_data = convert_long_to_symmetric_wide(long_data)

In [47]:
wide_data.columns

Index(['unique_match_id', 'tournament_date', 'surface', 'tournament_level',
       'best_of_sets', 'A_name', 'A_hand', 'A_height_cm', 'A_age', 'A_rank',
       'A_rank_points', 'A_elo', 'A_rolling_serve_win_ratio',
       'A_surface_win_ratio', 'A_has_seed', 'B_name', 'B_hand', 'B_height_cm',
       'B_age', 'B_rank', 'B_rank_points', 'B_elo',
       'B_rolling_serve_win_ratio', 'B_surface_win_ratio', 'B_has_seed',
       'target'],
      dtype='str')

In [50]:
# Elo, rank, rank points, rolling serve-win ratio, surface win ratio, and the physical
# attributes age and height are expressed as A-minus-B differentials rather than absolute
# values: the gap between the two players is what matters, not either value in isolation.
differential_columns = ['elo', 'rank', 'rank_points', 'rolling_serve_win_ratio', 'surface_win_ratio', 'age', 'height_cm']

for column in differential_columns:
    wide_data[f'{column}_diff'] = wide_data[f'A_{column}'] - wide_data[f'B_{column}']

# The per-player originals are no longer needed once the differentials exist.
replaced_columns = [
    f'{side}_{name}'
    for name in differential_columns
    for side in ('A', 'B')
]
wide_data = wide_data.drop(columns=replaced_columns)

wide_data

Unnamed: 0,unique_match_id,tournament_date,surface,tournament_level,best_of_sets,A_name,A_hand,A_has_seed,B_name,B_hand,B_has_seed,target,elo_diff,rank_diff,rank_points_diff,rolling_serve_win_ratio_diff,surface_win_ratio_diff,age_diff,height_cm_diff
0,2020-0451_271,2020-01-06,1,3,3,Mikhail Kukushkin,1,0,Malek Jaziri,1,0,1,0.000000,-163.0,610.0,0.000000,0.000000,-3.9,-2.0
1,2020-0451_272,2020-01-06,1,3,3,Pierre Hugues Herbert,1,0,Marco Cecchinato,1,0,1,0.000000,-10.0,150.0,0.000000,0.000000,1.6,3.0
2,2020-0451_273,2020-01-06,1,3,3,Laslo Djere,1,1,Lorenzo Sonego,1,0,1,0.000000,-12.0,161.0,0.000000,0.000000,-0.1,-3.0
3,2020-0451_275,2020-01-06,1,3,3,Miomir Kecmanovic,1,0,Jordan Thompson,1,0,1,0.000000,-1.0,3.0,0.000000,0.000000,-5.4,0.0
4,2020-0451_276,2020-01-06,1,3,3,Cem Ilkel,1,0,Ricardas Berankis,1,0,1,0.000000,212.0,-660.0,0.000000,0.000000,-5.2,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26343,2024-7696_396,2024-12-18,1,1,5,Arthur Fils,1,1,Learner Tien,0,1,0,215.925657,-102.0,1862.0,0.102469,0.155556,1.5,5.0
26344,2024-7696_397,2024-12-18,1,1,5,Jakub Mensik,1,1,Arthur Fils,1,1,0,-44.763208,28.0,-1219.0,-0.045906,-0.050000,-1.3,8.0
26345,2024-7696_398,2024-12-18,1,1,5,Alex Michelsen,1,1,Learner Tien,0,1,0,135.248187,-81.0,752.0,0.047296,0.150000,1.3,13.0
26346,2024-7696_399,2024-12-18,1,1,5,Luca Van Assche,1,1,Joao Fonseca,1,1,0,-121.043506,-17.0,62.0,-0.037556,-0.433333,2.2,-7.0


In [52]:
wide_data.loc[0]

unique_match_id                       2020-0451_271
tournament_date                 2020-01-06 00:00:00
surface                                           1
tournament_level                                  3
best_of_sets                                      3
A_name                            Mikhail Kukushkin
A_hand                                            1
A_has_seed                                        0
B_name                                 Malek Jaziri
B_hand                                            1
B_has_seed                                        0
target                                            1
elo_diff                                        0.0
rank_diff                                    -163.0
rank_points_diff                              610.0
rolling_serve_win_ratio_diff                    0.0
surface_win_ratio_diff                          0.0
age_diff                                       -3.9
height_cm_diff                                 -2.0
Name: 0, dty

In [53]:
wide_data.columns

Index(['unique_match_id', 'tournament_date', 'surface', 'tournament_level',
       'best_of_sets', 'A_name', 'A_hand', 'A_has_seed', 'B_name', 'B_hand',
       'B_has_seed', 'target', 'elo_diff', 'rank_diff', 'rank_points_diff',
       'rolling_serve_win_ratio_diff', 'surface_win_ratio_diff', 'age_diff',
       'height_cm_diff'],
      dtype='str')

In [58]:
# Columns handed to the model: match context, per-player flags, then A-minus-B differentials.
feature_columns = [
    # Match context
    'surface', 'tournament_level', 'best_of_sets',
    # Per-player flags
    'A_hand', 'A_has_seed',
    'B_hand', 'B_has_seed',
    # Differentials
    'elo_diff', 'rank_diff', 'rank_points_diff',
    'rolling_serve_win_ratio_diff', 'surface_win_ratio_diff',
    'age_diff', 'height_cm_diff',
]

# Feature matrix and label vector (target is 1 when player A won the match).
X = wide_data.loc[:, feature_columns]
y = wide_data.loc[:, 'target']

# Write both to disk; the row index is written as the first CSV column.
X.to_csv('../processed_data/tennis_model_features.csv')
y.to_csv('../processed_data/tennis_model_labels.csv')

````

