# Tennis Machine Learning Model: 
Training a model on tennis matches to predict future ones.

## Preprocess data

Read data and set dates to datetime format.

In [122]:
import pandas as pd
from glob import glob

# adjust path or filenames to where you saved the CSVs
files = sorted(glob("MatchCSVs/atp_matches_*.csv"))[:5] # Finds all files matching the pattern, sorts them, and takes the first 25 as an example
dfs = [pd.read_csv(f) for f in files] # Read each CSV file into a DataFrame
df = pd.concat(dfs, ignore_index=True) # Concatenate all DataFrames into one, resetting the index

# Quick peek
#print("Rows:", len(df))
#print("Columns:", df.columns.tolist())

# Make sure dates are in datetime format (if not already)
if 'tourney_date' in df.columns:
    df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d', errors='coerce')
else:
    # fallback
    df['tourney_date'] = pd.to_datetime(df['date'], errors='coerce')

#print(df['tourney_date'].min(), "->", df['tourney_date'].max())

Drop walkover matches, rows with missing data, and sort chronologically.

In [123]:
# drop problem matches (RET, walkover)
if 'score' in df.columns:
    df = df[~df['score'].str.contains('RET|Walkover|WO|ret', case=False, na=False)] # Remove rows with RET, Walkover, or similar in the score column

# drop rows with missing key data
df = df.dropna(subset=['winner_id','loser_id','tourney_date'])

# sort chronologically (important for time-based features)
# Build a round sort key so finals come last within each tournament
round_order_map = {
    'RR':  5, 
    'R128': 10, 'R64': 20,
    'R32': 30, 'R16': 40,
    'QF': 50, 'SF': 60,        # round robin before knockouts                                          # bronze/3rd place if present
    'F':   70
}
df['round_order'] = df.get('round').map(round_order_map).fillna(999).astype(int)

# Compose sort columns
sort_cols = ['tourney_date']
if 'tourney_id' in df.columns:
    sort_cols.append('tourney_id')
elif 'tourney_name' in df.columns:
    sort_cols.append('tourney_name')

sort_cols.append('round_order')
if 'match_num' in df.columns:
    sort_cols.append('match_num')

# Sort chronologically and within tournament by round
df = df.sort_values(sort_cols).reset_index(drop=True)
df = df.drop(columns=['round_order'])

display(df['round'].iloc[0:27])

0     R32
1     R32
2     R32
3     R32
4     R32
5     R32
6     R32
7     R32
8     R32
9     R32
10    R32
11    R32
12    R16
13    R16
14    R16
15    R16
16    R16
17    R16
18    R16
19    R16
20     QF
21     QF
22     QF
23     QF
24     SF
25     SF
26      F
Name: round, dtype: object

Create a player-event table for rolling stats (i.e. gather all information before a certain match to use in training)
Build a long "events" table where each match produces two player-events, then compute rolling aggregates shifted so they use only past matches.

In [124]:
events = []
for idx, r in df.iterrows():
    # Winner event
    d = r['tourney_date']
    s = r.get('surface', None)
    events.append({
        'player_name': r['winner_name'],
        'player_id': r['winner_id'],
        'opponent_id': r['loser_id'],
        'date': d,
        'surface': s,
        'is_win': 1,
        'match_id': idx
        # add other stats as needed
    })
    # Loser event
    events.append({
        'player_name': r['loser_name'],
        'player_id': r['loser_id'],
        'opponent_id': r['winner_id'],
        'date': d,
        'surface': s,
        'is_win': 0,
        'match_id': idx
        # add other stats as needed
    })

events_df = pd.DataFrame(events)

events_df = events_df.sort_values('match_id').reset_index(drop=True) 

display(events_df.head())

events_df = events_df.sort_values(['player_id', 'date']).reset_index(drop=True) 

display(events_df.head())


Unnamed: 0,player_name,player_id,opponent_id,date,surface,is_win,match_id
0,Mikhail Kukushkin,105062,104291,2020-01-06,Hard,1,0
1,Malek Jaziri,104291,105062,2020-01-06,Hard,0,0
2,Pierre Hugues Herbert,105732,106065,2020-01-06,Hard,1,1
3,Marco Cecchinato,106065,105732,2020-01-06,Hard,0,1
4,Laslo Djere,111513,132283,2020-01-06,Hard,1,2


Unnamed: 0,player_name,player_id,opponent_id,date,surface,is_win,match_id
0,Alexander Zverev,100644,200282,2020-01-06,Hard,0,34
1,Alexander Zverev,100644,133430,2020-01-06,Hard,0,36
2,Alexander Zverev,100644,126774,2020-01-06,Hard,0,38
3,Alexander Zverev,100644,106065,2020-01-20,Hard,1,197
4,Alexander Zverev,100644,106078,2020-01-20,Hard,1,245


Functions to collect data leading up to a match, including:
- Number of wins in previous n matches
- Total matches played
- Total wins
- Total losses

- Total matches on surface
- Total wins on surface
- Total losses on surface

In [125]:
# wins in last n matches before current match
def wins_last_n(events_df, n=5, colname=None):
    if colname is None:
        colname = f'wins_last_{n}'
    # Group by player_id and calculate rolling sum of wins
    events_df[colname] = (events_df.groupby('player_id')['is_win'] # Group by player_id
                            .transform(lambda s: s.rolling(window=n, min_periods=1).sum().shift(1))
                            .fillna(0)
                            .astype(int)
                          )
    return events_df

# total matches played before current match
def total_matches_before(events_df, colname=None):
    if colname is None:
        colname = 'total_matches_before'
    # Group by player_id and calculate rolling count of matches
    events_df[colname] = events_df.groupby('player_id')['match_id'].cumcount()
    return events_df

# total wins before current match
def total_wins_before(events_df, colname=None):
    if colname is None:
        colname = 'total_wins_before'
    # Group by player_id and calculate cumulative sum of wins
    events_df[colname] = (events_df.groupby('player_id')['is_win']
                          .transform(lambda s: s.cumsum().shift(1))
                          .fillna(0)  # Fill NaN with 0 for players with no previous wins
                          .astype(int))  # Convert to integer
    return events_df

# total losses before current match
def total_losses_before(events_df, colname=None):
    if colname is None:
        colname = 'total_losses_before'
    # Group by player_id and calculate cumulative sum of losses
    events_df[colname] = (events_df.groupby('player_id')['is_win']
                          .transform(lambda x: (x == 0).cumsum().shift(1))  # Count losses and shift to use only past matches
                          .fillna(0)  # Fill NaN with 0 for players with no previous losses
                          .astype(int))
    return events_df

# total matches on same surface before current match
def total_matches_on_surface(events_df, colname=None):
    if colname is None:
        colname = 'total_matches_on_surface'
    
    # Create a temporary column to help with grouping
    events_df['player_surface'] = events_df['player_id'].astype(str) + '_' + events_df['surface'].fillna('Unknown')

   
    # Group by player_id and surface, calculate cumulative count of matches
    events_df[colname] = (events_df.groupby('player_surface')['match_id']
                            .cumcount()
                            .fillna(0)  # Fill NaN with 0 for players with no previous matches on that surface
                            .astype(int))
    
    # Drop the temporary column
    events_df.drop('player_surface', axis=1, inplace=True)
    
    return events_df

# wins on same surface before current match
def wins_on_surface(events_df, colname=None):
    if colname is None:
        colname = 'wins_on_surface'
    
    # Create a temporary column to help with grouping
    events_df['player_surface'] = events_df['player_id'].astype(str) + '_' + events_df['surface'].fillna('Unknown')
    
    # Group by player_id and surface, calculate cumulative sum of wins
    events_df[colname] = (events_df.groupby('player_surface')['is_win']
                            .transform(lambda s: s.cumsum().shift(1))
                            .fillna(0)
                            .astype(int))
    
    # Drop the temporary column
    events_df.drop('player_surface', axis=1, inplace=True)
    
    return events_df

# losses on same surface before current match
def losses_on_surface(events_df, colname=None):
    if colname is None:
        colname = 'losses_on_surface'
    
    # Create a temporary column to help with grouping
    events_df['player_surface'] = events_df['player_id'].astype(str) + '_' + events_df['surface'].fillna('Unknown')
    
    # Group by player_id and surface, calculate cumulative sum of losses
    events_df[colname] = (events_df.groupby('player_surface')['is_win']
                            .transform(lambda x: (x == 0).cumsum().shift(1))
                            .fillna(0)
                            .astype(int))
    
    # Drop the temporary column
    events_df.drop('player_surface', axis=1, inplace=True)
    
    return events_df

Run all the functions to update events_df DataFrame

In [126]:
events_df = wins_last_n(events_df, n=5)
events_df = total_matches_before(events_df)
events_df = total_wins_before(events_df)
events_df = total_losses_before(events_df)
events_df = total_matches_on_surface(events_df)
events_df = wins_on_surface(events_df)
events_df = losses_on_surface(events_df)

Merge pre-match rolling columns from events_df back into original df

In [127]:
# Ensure df has a stable match_id that matches what was used when building events (idx in iterrows)
df['match_id'] = df.index
pd.set_option('display.max_columns', None)

# List of feature columns created in events_df (adjust if you add more functions)
feature_cols = [
    'wins_last_5',
    'total_matches_before',
    'total_wins_before',
    'total_losses_before',
    'total_matches_on_surface',
    'wins_on_surface',
    'losses_on_surface'
]

# Subset DF with only what we need
ev = events_df[['match_id', 'player_id', 'is_win'] + feature_cols]

# Split ev into winners and losers (ev contains both winner and loser rows for each match)
w = ev[ev.is_win == 1].copy() # Boolean mask for winners
l = ev[ev.is_win == 0].copy() # Boolean mask for lasers

# Rename feature columns with side prefixes
w.rename(columns={c: f"winner_{c}" for c in feature_cols}, inplace=True)
l.rename(columns={c: f"loser_{c}" for c in feature_cols}, inplace=True)

# Rename player_id columns wiht side prefixes
w.rename(columns={'player_id': 'winner_player_id'}, inplace=True)
l.rename(columns={'player_id': 'loser_player_id'}, inplace=True)

# Drop helper flag
w.drop(columns=['is_win'], inplace=True)
l.drop(columns=['is_win'], inplace=True)

# Merge onto match-level df
df_with_stats = df.merge(w, on='match_id', how='left')
df_with_stats = df_with_stats.merge(l, on='match_id', how='left')

# Sanity check: winner_player_id should equal winner_id etc.
assert (df_with_stats['winner_player_id'] == df['winner_id']).all()
assert (df_with_stats['loser_player_id'] == df['loser_id']).all()


# Show Djokovic only (104925)
pid = 104925
hist = (events_df[events_df['player_id'] == pid]
        [['match_id','player_name', 'is_win'] + feature_cols])
#display(hist.head(10))

# Create difference features (winner minus loser) – often useful for modeling
diff_cols = []
for c in feature_cols:
    diff_cols.append(f'diff_{c}')
    df_with_stats[f'diff_{c}'] = df_with_stats[f'winner_{c}'] - df_with_stats[f'loser_{c}']

# Find matches between Novak Djokovic (104925) and Rafael Nadal (104745)
duo = df_with_stats[(df_with_stats['winner_id'] == 104925) & (df_with_stats['loser_id'] == 104745) | 
                    (df_with_stats['winner_id'] == 104745) & (df_with_stats['loser_id'] == 104925)][['winner_id', 'loser_id',] + diff_cols]
#display(duo.head(20))

bad = df_with_stats[diff_cols]
#display(bad.head(30))

display(df_with_stats.head())


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,match_id,winner_player_id,winner_wins_last_5,winner_total_matches_before,winner_total_wins_before,winner_total_losses_before,winner_total_matches_on_surface,winner_wins_on_surface,winner_losses_on_surface,loser_player_id,loser_wins_last_5,loser_total_matches_before,loser_total_wins_before,loser_total_losses_before,loser_total_matches_on_surface,loser_wins_on_surface,loser_losses_on_surface,diff_wins_last_5,diff_total_matches_before,diff_total_wins_before,diff_total_losses_before,diff_total_matches_on_surface,diff_wins_on_surface,diff_losses_on_surface
0,2020-0451,Doha,Hard,32,A,2020-01-06,271,105062,,,Mikhail Kukushkin,R,183.0,KAZ,32.0,104291,,WC,Malek Jaziri,R,185.0,TUN,35.9,6-0 6-3,3,R32,68.0,7.0,0.0,51.0,33.0,22.0,12.0,7.0,3.0,3.0,3.0,2.0,44.0,23.0,9.0,10.0,8.0,4.0,9.0,66.0,816.0,229.0,206.0,0,105062,0,0,0,0,0,0,0,104291,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2020-0451,Doha,Hard,32,A,2020-01-06,272,105732,,,Pierre Hugues Herbert,R,188.0,FRA,28.8,106065,,WC,Marco Cecchinato,R,185.0,ITA,27.2,6-3 6-4,3,R32,71.0,6.0,2.0,51.0,30.0,26.0,12.0,10.0,0.0,1.0,2.0,1.0,59.0,32.0,20.0,13.0,9.0,4.0,7.0,65.0,840.0,75.0,690.0,1,105732,0,0,0,0,0,0,0,106065,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2020-0451,Doha,Hard,32,A,2020-01-06,273,111513,5.0,,Laslo Djere,R,188.0,SRB,24.5,132283,,,Lorenzo Sonego,R,191.0,ITA,24.6,6-1 3-6 6-2,3,R32,114.0,5.0,2.0,73.0,48.0,33.0,14.0,12.0,4.0,7.0,5.0,3.0,75.0,47.0,28.0,10.0,12.0,4.0,10.0,39.0,1151.0,51.0,990.0,2,111513,0,0,0,0,0,0,0,132283,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2020-0451,Doha,Hard,32,A,2020-01-06,275,200175,,,Miomir Kecmanovic,R,183.0,SRB,20.3,111442,,,Jordan Thompson,R,183.0,AUS,25.7,6-4 6-2,3,R32,84.0,0.0,1.0,57.0,31.0,23.0,17.0,9.0,0.0,0.0,4.0,1.0,66.0,40.0,26.0,9.0,9.0,7.0,10.0,62.0,881.0,63.0,878.0,3,200175,0,0,0,0,0,0,0,111442,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2020-0451,Doha,Hard,32,A,2020-01-06,276,117356,,WC,Cem Ilkel,R,185.0,TUR,24.3,105575,,,Ricardas Berankis,R,175.0,LTU,29.5,6-2 4-6 6-2,3,R32,112.0,11.0,1.0,92.0,59.0,42.0,17.0,13.0,4.0,5.0,3.0,3.0,84.0,53.0,35.0,16.0,13.0,5.0,9.0,279.0,144.0,67.0,804.0,4,117356,0,0,0,0,0,0,0,105575,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Machine learning part (scikit learn)

In [128]:
# === Simplified Machine Learning with Random Forest Only ===
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Fix the time-based split issue - sort by date first
df_with_stats = df_with_stats.sort_values('tourney_date').reset_index(drop=True)

# Add more meaningful features
def create_enhanced_features(df):
    # Win percentage features
    df['winner_win_pct'] = df['winner_total_wins_before'] / (df['winner_total_matches_before'] + 1)
    df['loser_win_pct'] = df['loser_total_wins_before'] / (df['loser_total_matches_before'] + 1)
    
    # Surface-specific win percentages
    df['winner_surface_win_pct'] = df['winner_wins_on_surface'] / (df['winner_total_matches_on_surface'] + 1)
    df['loser_surface_win_pct'] = df['loser_wins_on_surface'] / (df['loser_total_matches_on_surface'] + 1)
    
    # Recent form (last 5 matches win rate)
    df['winner_recent_form'] = df['winner_wins_last_5'] / 5.0
    df['loser_recent_form'] = df['loser_wins_last_5'] / 5.0
    
    return df

df_with_stats = create_enhanced_features(df_with_stats)

# Create columns for enhanced features
enhanced_features = [
    'winner_win_pct', 'loser_win_pct',
    'winner_surface_win_pct', 'loser_surface_win_pct', 
    'winner_recent_form', 'loser_recent_form',
    'winner_total_matches_before', 'loser_total_matches_before'
]

# Create difference features for enhanced metrics
for feature in enhanced_features:
    if feature.startswith('winner_'):
        loser_feature = feature.replace('winner_', 'loser_')
        if loser_feature in df_with_stats.columns:
            diff_name = f"diff_{feature[7:]}"  # Remove 'winner_' prefix
            df_with_stats[diff_name] = df_with_stats[feature] - df_with_stats[loser_feature]

# Use both original and enhanced difference features
all_diff_cols = [c for c in df_with_stats.columns if c.startswith('diff_')]

#display(df_with_stats.head())
# Build dataset with proper time-based approach
base = df_with_stats[['match_id', 'tourney_date', 'winner_id', 'loser_id'] + all_diff_cols].copy()
#display(base.head())

# Create symmetric dataset but maintain chronological order
pos = base.copy()
pos['player_a_id'] = pos['winner_id'] 
pos['player_b_id'] = pos['loser_id']
pos['y'] = 1

neg = base.copy() 
neg[all_diff_cols] = -neg[all_diff_cols]  # Flip perspective
neg['player_a_id'] = neg['loser_id']
neg['player_b_id'] = neg['winner_id'] 
neg['y'] = 0

model_df = pd.concat([pos, neg], ignore_index=True).sort_values('match_id').reset_index(drop=True)
display(model_df)

# Proper time-based split
split_date = model_df['tourney_date'].quantile(0.8)
train_mask = model_df['tourney_date'] < split_date

X_train = model_df.loc[train_mask, all_diff_cols]
y_train = model_df.loc[train_mask, 'y']
X_test = model_df.loc[~train_mask, all_diff_cols]
y_test = model_df.loc[~train_mask, 'y']

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
print(f"Training period: {model_df.loc[train_mask, 'tourney_date'].min()} to {model_df.loc[train_mask, 'tourney_date'].max()}")
print(f"Test period: {model_df.loc[~train_mask, 'tourney_date'].min()} to {model_df.loc[~train_mask, 'tourney_date'].max()}")

# Random Forest model with pipeline
rf_model = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42,
        class_weight='balanced'
    ))
])

print("\n=== Random Forest ===")
rf_model.fit(X_train, y_train)
pred = rf_model.predict(X_test)
proba = rf_model.predict_proba(X_test)[:,1]

accuracy = accuracy_score(y_test, pred)
if y_test.nunique() == 2:
    roc_auc = roc_auc_score(y_test, proba)
    print(f"ROC AUC: {roc_auc:.3f}")
else:
    print("ROC AUC: skipped (only one class in test)")

print(f"Accuracy: {accuracy:.3f}")

# Feature importance
importances = rf_model.named_steps['rf'].feature_importances_
fi = pd.Series(importances, index=all_diff_cols).sort_values(ascending=False)
print("\nTop 10 Most Important Features:")
display(fi.head(10))

# Baseline comparison
baseline_accuracy = max(y_test.mean(), 1 - y_test.mean())
print(f"\nBaseline accuracy (predict majority class): {baseline_accuracy:.3f}")
print(f"Model improvement over baseline: {accuracy - baseline_accuracy:.3f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, pred))

Unnamed: 0,match_id,tourney_date,winner_id,loser_id,diff_wins_last_5,diff_total_matches_before,diff_total_wins_before,diff_total_losses_before,diff_total_matches_on_surface,diff_wins_on_surface,diff_losses_on_surface,diff_win_pct,diff_surface_win_pct,diff_recent_form,player_a_id,player_b_id,y
0,0,2020-01-06,105062,104291,0,0,0,0,0,0,0,0.000000,0.000000,0.0,105062,104291,1
1,0,2020-01-06,105062,104291,0,0,0,0,0,0,0,-0.000000,-0.000000,-0.0,104291,105062,0
2,1,2020-01-06,105732,106065,0,0,0,0,0,0,0,-0.000000,-0.000000,-0.0,106065,105732,0
3,1,2020-01-06,105732,106065,0,0,0,0,0,0,0,0.000000,0.000000,0.0,105732,106065,1
4,2,2020-01-06,111513,132283,0,0,0,0,0,0,0,-0.000000,-0.000000,-0.0,132283,111513,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25713,12856,2024-12-18,210530,210506,-2,-66,-34,-32,-41,-22,-19,-0.051948,-0.064685,-0.4,210530,210506,1
25714,12857,2024-12-18,211663,209414,-3,40,11,29,28,7,21,-0.170384,-0.371429,-0.6,209414,211663,0
25715,12857,2024-12-18,211663,209414,3,-40,-11,-29,-28,-7,-21,0.170384,0.371429,0.6,211663,209414,1
25716,12858,2024-12-18,211663,210530,2,8,5,3,-4,0,-4,0.050000,0.250000,0.4,211663,210530,1


Training samples: 20524, Test samples: 5194
Training period: 2020-01-06 00:00:00 to 2024-02-03 00:00:00
Test period: 2024-02-05 00:00:00 to 2024-12-18 00:00:00

=== Random Forest ===
ROC AUC: 0.694
Accuracy: 0.630

Top 10 Most Important Features:


diff_win_pct                     0.231678
diff_surface_win_pct             0.170843
diff_total_wins_before           0.152573
diff_wins_on_surface             0.110387
diff_total_matches_before        0.084170
diff_total_matches_on_surface    0.074496
diff_total_losses_before         0.065480
diff_losses_on_surface           0.048476
diff_wins_last_5                 0.032347
diff_recent_form                 0.029549
dtype: float64


Baseline accuracy (predict majority class): 0.500
Model improvement over baseline: 0.130

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.63      0.63      2597
           1       0.63      0.63      0.63      2597

    accuracy                           0.63      5194
   macro avg       0.63      0.63      0.63      5194
weighted avg       0.63      0.63      0.63      5194



In [140]:
# === Tournament Prediction Pipeline ===
# === Tournament Prediction Pipeline ===
def predict_tournament_matches(model, df_with_stats, tournament_matches):
    """
    Predict winners for upcoming tournament matches
    
    Parameters:
    - model: trained sklearn pipeline
    - df_with_stats: historical data with player stats
    - tournament_matches: list of dicts with 'player_a_id', 'player_b_id', 'surface'
    
    Returns:
    - predictions DataFrame with match details and predictions
    """
    predictions = []
    
    # Get latest stats for each player (most recent match before tournament)
    winner_cols = ['winner_id'] + [c for c in df_with_stats.columns if c.startswith('winner_') and c != 'winner_id']
    loser_cols = ['loser_id'] + [c for c in df_with_stats.columns if c.startswith('loser_') and c != 'loser_id']
    
    latest_stats_winner = df_with_stats.groupby('winner_id').tail(1)[winner_cols].copy()
    latest_stats_loser = df_with_stats.groupby('loser_id').tail(1)[loser_cols].copy()
    
    # Rename winner columns to have consistent player_id column
    winner_rename_dict = {'winner_id': 'player_id'}
    winner_renamed = latest_stats_winner.rename(columns=winner_rename_dict)
    
    # Rename loser columns to winner format for easier lookup
    loser_rename_dict = {'loser_id': 'player_id'}
    # Only rename loser_ columns to winner_ format, excluding the ID column
    for c in loser_cols:
        if c.startswith('loser_') and c != 'loser_id':
            winner_col = c.replace('loser_', 'winner_')
            loser_rename_dict[c] = winner_col
    
    loser_renamed = latest_stats_loser.rename(columns=loser_rename_dict)
    
    # Combine both datasets (player stats from when they were winner or loser)
    all_player_stats = pd.concat([winner_renamed, loser_renamed], ignore_index=True)
    # Take the most recent stats for each player
    all_player_stats = all_player_stats.drop_duplicates(subset=['player_id'], keep='last')
    
    feature_cols = [c for c in df_with_stats.columns if c.startswith('diff_')]
    
    for match in tournament_matches:
        player_a_id = match['player_a_id']
        player_b_id = match['player_b_id']
        surface = match.get('surface', 'Clay')  # Default to Hard court
        
        # Get stats for both players
        player_a_stats = all_player_stats[all_player_stats['player_id'] == player_a_id]
        player_b_stats = all_player_stats[all_player_stats['player_id'] == player_b_id]
        
        if player_a_stats.empty or player_b_stats.empty:
            print(f"Warning: Missing stats for match {player_a_id} vs {player_b_id}")
            continue
            
        # Calculate difference features (player_a - player_b)
        match_features = {}
        
        # Map enhanced feature names to difference feature names
        enhanced_mapping = {
            'winner_win_pct': 'diff_win_pct',
            'winner_surface_win_pct': 'diff_surface_win_pct',
            'winner_recent_form': 'diff_recent_form',
            'winner_total_matches_before': 'diff_total_matches_before'
        }
        
        # Original difference features
        original_mapping = {
            'winner_wins_last_5': 'diff_wins_last_5',
            'winner_total_wins_before': 'diff_total_wins_before',
            'winner_total_losses_before': 'diff_total_losses_before',
            'winner_total_matches_on_surface': 'diff_total_matches_on_surface',
            'winner_wins_on_surface': 'diff_wins_on_surface',
            'winner_losses_on_surface': 'diff_losses_on_surface'
        }
        
        all_mapping = {**enhanced_mapping, **original_mapping}
        
        for winner_col, diff_col in all_mapping.items():
            if winner_col in player_a_stats.columns and winner_col in player_b_stats.columns:
                player_a_val = player_a_stats[winner_col].iloc[0]
                player_b_val = player_b_stats[winner_col].iloc[0]
                match_features[diff_col] = player_a_val - player_b_val
        
        # Create feature vector for prediction
        X_match = pd.DataFrame([match_features])
        
        # Ensure all required features are present
        for col in feature_cols:
            if col not in X_match.columns:
                X_match[col] = 0
        
        # Reorder columns to match training data
        X_match = X_match[feature_cols]
        
        # Make prediction
        prob = model.predict_proba(X_match)[0]
        prediction = model.predict(X_match)[0]
        
        predictions.append({
            'player_a_id': player_a_id,
            'player_b_id': player_b_id,
            'surface': surface,
            'predicted_winner': player_a_id if prediction == 1 else player_b_id,
            'player_a_win_prob': prob[1],
            'player_b_win_prob': prob[0],
            'confidence': max(prob)
        })
    
    return pd.DataFrame(predictions)

# Example usage for upcoming tournament
tournament_matches = [
    {'player_a_id': 206173, 'player_b_id': 200240, 'surface': 'Hard'},
    {'player_a_id': 104925, 'player_b_id': 210530, 'surface': 'Hard'},
    {'player_a_id': 207989, 'player_b_id': 206173, 'surface': 'Clay'},
    # Add more matches...
]

# Make predictions
predictions_df = predict_tournament_matches(rf_model, df_with_stats, tournament_matches)
print("Tournament Predictions:")
display(predictions_df)

# Function to get player names (if you have a player lookup table)
def add_player_names(predictions_df, player_lookup):
    """
    Add player names to predictions DataFrame
    player_lookup should be a dict: {player_id: player_name}
    """
    predictions_df['player_a_name'] = predictions_df['player_a_id'].map(player_lookup)
    predictions_df['player_b_name'] = predictions_df['player_b_id'].map(player_lookup)
    predictions_df['predicted_winner_name'] = predictions_df['predicted_winner'].map(player_lookup)
    return predictions_df

# Create player lookup from your data
player_lookup = dict(zip(df['winner_id'], df['winner_name']))
player_lookup.update(dict(zip(df['loser_id'], df['loser_name'])))

# Add names to predictions
predictions_with_names = add_player_names(predictions_df.copy(), player_lookup)
print("\nPredictions with Player Names:")
display(predictions_with_names[['player_a_name', 'player_b_name', 'predicted_winner_name', 'confidence']])

Tournament Predictions:


Unnamed: 0,player_a_id,player_b_id,surface,predicted_winner,player_a_win_prob,player_b_win_prob,confidence
0,206173,200240,Hard,206173,0.89267,0.10733,0.89267
1,104925,210530,Hard,104925,0.926206,0.073794,0.926206
2,207989,206173,Clay,206173,0.350087,0.649913,0.649913



Predictions with Player Names:


Unnamed: 0,player_a_name,player_b_name,predicted_winner_name,confidence
0,Jannik Sinner,Vit Kopriva,Jannik Sinner,0.89267
1,Novak Djokovic,Learner Tien,Novak Djokovic,0.926206
2,Carlos Alcaraz,Jannik Sinner,Jannik Sinner,0.649913
