# Tennis Machine Learning Model: 
Training a model on tennis matches to predict future ones.

## Preprocess data

Read data and set dates to datetime format.

In [101]:
import pandas as pd
from glob import glob

# adjust path or filenames to where you saved the CSVs
files = sorted(glob("MatchCSVs/atp_matches_*.csv"))[:5] # Finds all files matching the pattern, sorts them, and takes the first 25 as an example
dfs = [pd.read_csv(f) for f in files] # Read each CSV file into a DataFrame
df = pd.concat(dfs, ignore_index=True) # Concatenate all DataFrames into one, resetting the index

# Quick peek
#print("Rows:", len(df))
#print("Columns:", df.columns.tolist())

# Make sure dates are in datetime format (if not already)
if 'tourney_date' in df.columns:
    df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d', errors='coerce')
else:
    # fallback
    df['tourney_date'] = pd.to_datetime(df['date'], errors='coerce')

#print(df['tourney_date'].min(), "->", df['tourney_date'].max())

Drop walkover matches, rows with missing data, and sort chronologically.

In [102]:
# drop problem matches (RET, walkover)
if 'score' in df.columns:
    df = df[~df['score'].str.contains('RET|Walkover|WO|ret', case=False, na=False)] # Remove rows with RET, Walkover, or similar in the score column

# drop rows with missing key data
df = df.dropna(subset=['winner_id','loser_id','tourney_date'])

# sort chronologically (important for time-based features)
# Build a round sort key so finals come last within each tournament
round_order_map = {
    'RR':  5, 
    'R128': 10, 'R64': 20,
    'R32': 30, 'R16': 40,
    'QF': 50, 'SF': 60,        # round robin before knockouts                                          # bronze/3rd place if present
    'F':   70
}
df['round_order'] = df.get('round').map(round_order_map).fillna(999).astype(int)

# Compose sort columns
sort_cols = ['tourney_date']
if 'tourney_id' in df.columns:
    sort_cols.append('tourney_id')
elif 'tourney_name' in df.columns:
    sort_cols.append('tourney_name')

sort_cols.append('round_order')
if 'match_num' in df.columns:
    sort_cols.append('match_num')

# Sort chronologically and within tournament by round
df = df.sort_values(sort_cols).reset_index(drop=True)
df = df.drop(columns=['round_order'])

display(df['round'].iloc[0:27])

0     R32
1     R32
2     R32
3     R32
4     R32
5     R32
6     R32
7     R32
8     R32
9     R32
10    R32
11    R32
12    R16
13    R16
14    R16
15    R16
16    R16
17    R16
18    R16
19    R16
20     QF
21     QF
22     QF
23     QF
24     SF
25     SF
26      F
Name: round, dtype: object

Create a player-event table for rolling stats (i.e. gather all information before a certain match to use in training)
Build a long "events" table where each match produces two player-events, then compute rolling aggregates shifted so they use only past matches.

In [103]:
events = []
for idx, r in df.iterrows():
    # Winner event
    d = r['tourney_date']
    s = r.get('surface', None)
    events.append({
        'player_name': r['winner_name'],
        'player_id': r['winner_id'],
        'opponent_id': r['loser_id'],
        'date': d,
        'surface': s,
        'is_win': 1,
        'match_id': idx
        # add other stats as needed
    })
    # Loser event
    events.append({
        'player_name': r['loser_name'],
        'player_id': r['loser_id'],
        'opponent_id': r['winner_id'],
        'date': d,
        'surface': s,
        'is_win': 0,
        'match_id': idx
        # add other stats as needed
    })

events_df = pd.DataFrame(events)

events_df = events_df.sort_values('match_id').reset_index(drop=True) 

display(events_df.head())

events_df = events_df.sort_values(['player_id', 'date']).reset_index(drop=True) 

display(events_df.head())


Unnamed: 0,player_name,player_id,opponent_id,date,surface,is_win,match_id
0,Mikhail Kukushkin,105062,104291,2020-01-06,Hard,1,0
1,Malek Jaziri,104291,105062,2020-01-06,Hard,0,0
2,Pierre Hugues Herbert,105732,106065,2020-01-06,Hard,1,1
3,Marco Cecchinato,106065,105732,2020-01-06,Hard,0,1
4,Laslo Djere,111513,132283,2020-01-06,Hard,1,2


Unnamed: 0,player_name,player_id,opponent_id,date,surface,is_win,match_id
0,Alexander Zverev,100644,200282,2020-01-06,Hard,0,34
1,Alexander Zverev,100644,133430,2020-01-06,Hard,0,36
2,Alexander Zverev,100644,126774,2020-01-06,Hard,0,38
3,Alexander Zverev,100644,106065,2020-01-20,Hard,1,197
4,Alexander Zverev,100644,106078,2020-01-20,Hard,1,245


Functions to collect data leading up to a match, including:
- Number of wins in previous n matches
- Total matches played
- Total wins
- Total losses

- Total matches on surface
- Total wins on surface
- Total losses on surface

In [104]:
# wins in last n matches before current match
def wins_last_n(events_df, n=5, colname=None):
    if colname is None:
        colname = f'wins_last_{n}'
    # Group by player_id and calculate rolling sum of wins
    events_df[colname] = (events_df.groupby('player_id')['is_win'] # Group by player_id
                            .transform(lambda s: s.rolling(window=n, min_periods=1).sum().shift(1))
                            .fillna(0)
                            .astype(int)
                          )
    return events_df

# total matches played before current match
def total_matches_before(events_df, colname=None):
    if colname is None:
        colname = 'total_matches_before'
    # Group by player_id and calculate rolling count of matches
    events_df[colname] = events_df.groupby('player_id')['match_id'].cumcount()
    return events_df

# total wins before current match
def total_wins_before(events_df, colname=None):
    if colname is None:
        colname = 'total_wins_before'
    # Group by player_id and calculate cumulative sum of wins
    events_df[colname] = (events_df.groupby('player_id')['is_win']
                          .transform(lambda s: s.cumsum().shift(1))
                          .fillna(0)  # Fill NaN with 0 for players with no previous wins
                          .astype(int))  # Convert to integer
    return events_df

# total losses before current match
def total_losses_before(events_df, colname=None):
    if colname is None:
        colname = 'total_losses_before'
    # Group by player_id and calculate cumulative sum of losses
    events_df[colname] = (events_df.groupby('player_id')['is_win']
                          .transform(lambda x: (x == 0).cumsum().shift(1))  # Count losses and shift to use only past matches
                          .fillna(0)  # Fill NaN with 0 for players with no previous losses
                          .astype(int))
    return events_df

# total matches on same surface before current match
def total_matches_on_surface(events_df, colname=None):
    if colname is None:
        colname = 'total_matches_on_surface'
    
    # Create a temporary column to help with grouping
    events_df['player_surface'] = events_df['player_id'].astype(str) + '_' + events_df['surface'].fillna('Unknown')

   
    # Group by player_id and surface, calculate cumulative count of matches
    events_df[colname] = (events_df.groupby('player_surface')['match_id']
                            .cumcount()
                            .fillna(0)  # Fill NaN with 0 for players with no previous matches on that surface
                            .astype(int))
    
    # Drop the temporary column
    events_df.drop('player_surface', axis=1, inplace=True)
    
    return events_df

# wins on same surface before current match
def wins_on_surface(events_df, colname=None):
    if colname is None:
        colname = 'wins_on_surface'
    
    # Create a temporary column to help with grouping
    events_df['player_surface'] = events_df['player_id'].astype(str) + '_' + events_df['surface'].fillna('Unknown')
    
    # Group by player_id and surface, calculate cumulative sum of wins
    events_df[colname] = (events_df.groupby('player_surface')['is_win']
                            .transform(lambda s: s.cumsum().shift(1))
                            .fillna(0)
                            .astype(int))
    
    # Drop the temporary column
    events_df.drop('player_surface', axis=1, inplace=True)
    
    return events_df

# losses on same surface before current match
def losses_on_surface(events_df, colname=None):
    if colname is None:
        colname = 'losses_on_surface'
    
    # Create a temporary column to help with grouping
    events_df['player_surface'] = events_df['player_id'].astype(str) + '_' + events_df['surface'].fillna('Unknown')
    
    # Group by player_id and surface, calculate cumulative sum of losses
    events_df[colname] = (events_df.groupby('player_surface')['is_win']
                            .transform(lambda x: (x == 0).cumsum().shift(1))
                            .fillna(0)
                            .astype(int))
    
    # Drop the temporary column
    events_df.drop('player_surface', axis=1, inplace=True)
    
    return events_df

Run all the functions to update events_df DataFrame

In [105]:
events_df = wins_last_n(events_df, n=5)
events_df = total_matches_before(events_df)
events_df = total_wins_before(events_df)
events_df = total_losses_before(events_df)
events_df = total_matches_on_surface(events_df)
events_df = wins_on_surface(events_df)
events_df = losses_on_surface(events_df)

Merge pre-match rolling columns from events_df back into original df

In [106]:
# Ensure df has a stable match_id that matches what was used when building events (idx in iterrows)
df['match_id'] = df.index
pd.set_option('display.max_columns', None)

# List of feature columns created in events_df (adjust if you add more functions)
feature_cols = [
    'wins_last_5',
    'total_matches_before',
    'total_wins_before',
    'total_losses_before',
    'total_matches_on_surface',
    'wins_on_surface',
    'losses_on_surface'
]

# Subset DF with only what we need
ev = events_df[['match_id', 'player_id', 'is_win'] + feature_cols]

# Split ev into winners and losers (ev contains both winner and loser rows for each match)
w = ev[ev.is_win == 1].copy() # Boolean mask for winners
l = ev[ev.is_win == 0].copy() # Boolean mask for lasers

# Rename feature columns with side prefixes
w.rename(columns={c: f"winner_{c}" for c in feature_cols}, inplace=True)
l.rename(columns={c: f"loser_{c}" for c in feature_cols}, inplace=True)

# Rename player_id columns wiht side prefixes
w.rename(columns={'player_id': 'winner_player_id'}, inplace=True)
l.rename(columns={'player_id': 'loser_player_id'}, inplace=True)

# Drop helper flag
w.drop(columns=['is_win'], inplace=True)
l.drop(columns=['is_win'], inplace=True)

# Merge onto match-level df
df_with_stats = df.merge(w, on='match_id', how='left')
df_with_stats = df_with_stats.merge(l, on='match_id', how='left')

# Sanity check: winner_player_id should equal winner_id etc.
assert (df_with_stats['winner_player_id'] == df['winner_id']).all()
assert (df_with_stats['loser_player_id'] == df['loser_id']).all()


# Show Djokovic only (104925)
pid = 104925
hist = (events_df[events_df['player_id'] == pid]
        [['match_id','player_name', 'is_win'] + feature_cols])
#display(hist.head(10))

# Create difference features (winner minus loser) – often useful for modeling
diff_cols = []
for c in feature_cols:
    diff_cols.append(f'diff_{c}')
    df_with_stats[f'diff_{c}'] = df_with_stats[f'winner_{c}'] - df_with_stats[f'loser_{c}']

# Find matches between Novak Djokovic (104925) and Rafael Nadal (104745)
duo = df_with_stats[(df_with_stats['winner_id'] == 104925) & (df_with_stats['loser_id'] == 104745) | 
                    (df_with_stats['winner_id'] == 104745) & (df_with_stats['loser_id'] == 104925)][['winner_id', 'loser_id',] + diff_cols]
#display(duo.head(20))

bad = df_with_stats[diff_cols]
#display(bad.head(30))

display(df_with_stats.head())


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,match_id,winner_player_id,winner_wins_last_5,winner_total_matches_before,winner_total_wins_before,winner_total_losses_before,winner_total_matches_on_surface,winner_wins_on_surface,winner_losses_on_surface,loser_player_id,loser_wins_last_5,loser_total_matches_before,loser_total_wins_before,loser_total_losses_before,loser_total_matches_on_surface,loser_wins_on_surface,loser_losses_on_surface,diff_wins_last_5,diff_total_matches_before,diff_total_wins_before,diff_total_losses_before,diff_total_matches_on_surface,diff_wins_on_surface,diff_losses_on_surface
0,2020-0451,Doha,Hard,32,A,2020-01-06,271,105062,,,Mikhail Kukushkin,R,183.0,KAZ,32.0,104291,,WC,Malek Jaziri,R,185.0,TUN,35.9,6-0 6-3,3,R32,68.0,7.0,0.0,51.0,33.0,22.0,12.0,7.0,3.0,3.0,3.0,2.0,44.0,23.0,9.0,10.0,8.0,4.0,9.0,66.0,816.0,229.0,206.0,0,105062,0,0,0,0,0,0,0,104291,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2020-0451,Doha,Hard,32,A,2020-01-06,272,105732,,,Pierre Hugues Herbert,R,188.0,FRA,28.8,106065,,WC,Marco Cecchinato,R,185.0,ITA,27.2,6-3 6-4,3,R32,71.0,6.0,2.0,51.0,30.0,26.0,12.0,10.0,0.0,1.0,2.0,1.0,59.0,32.0,20.0,13.0,9.0,4.0,7.0,65.0,840.0,75.0,690.0,1,105732,0,0,0,0,0,0,0,106065,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2020-0451,Doha,Hard,32,A,2020-01-06,273,111513,5.0,,Laslo Djere,R,188.0,SRB,24.5,132283,,,Lorenzo Sonego,R,191.0,ITA,24.6,6-1 3-6 6-2,3,R32,114.0,5.0,2.0,73.0,48.0,33.0,14.0,12.0,4.0,7.0,5.0,3.0,75.0,47.0,28.0,10.0,12.0,4.0,10.0,39.0,1151.0,51.0,990.0,2,111513,0,0,0,0,0,0,0,132283,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2020-0451,Doha,Hard,32,A,2020-01-06,275,200175,,,Miomir Kecmanovic,R,183.0,SRB,20.3,111442,,,Jordan Thompson,R,183.0,AUS,25.7,6-4 6-2,3,R32,84.0,0.0,1.0,57.0,31.0,23.0,17.0,9.0,0.0,0.0,4.0,1.0,66.0,40.0,26.0,9.0,9.0,7.0,10.0,62.0,881.0,63.0,878.0,3,200175,0,0,0,0,0,0,0,111442,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2020-0451,Doha,Hard,32,A,2020-01-06,276,117356,,WC,Cem Ilkel,R,185.0,TUR,24.3,105575,,,Ricardas Berankis,R,175.0,LTU,29.5,6-2 4-6 6-2,3,R32,112.0,11.0,1.0,92.0,59.0,42.0,17.0,13.0,4.0,5.0,3.0,3.0,84.0,53.0,35.0,16.0,13.0,5.0,9.0,279.0,144.0,67.0,804.0,4,117356,0,0,0,0,0,0,0,105575,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Machine learning part (scikit learn)

In [107]:
# === Simplified Machine Learning with Random Forest Only ===
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Fix the time-based split issue - sort by date first
df_with_stats = df_with_stats.sort_values('tourney_date').reset_index(drop=True)

# Add more meaningful features
def create_enhanced_features(df):
    # Win percentage features
    df['winner_win_pct'] = df['winner_total_wins_before'] / (df['winner_total_matches_before'] + 1)
    df['loser_win_pct'] = df['loser_total_wins_before'] / (df['loser_total_matches_before'] + 1)
    
    # Surface-specific win percentages
    df['winner_surface_win_pct'] = df['winner_wins_on_surface'] / (df['winner_total_matches_on_surface'] + 1)
    df['loser_surface_win_pct'] = df['loser_wins_on_surface'] / (df['loser_total_matches_on_surface'] + 1)
    
    # Recent form (last 5 matches win rate)
    df['winner_recent_form'] = df['winner_wins_last_5'] / 5.0
    df['loser_recent_form'] = df['loser_wins_last_5'] / 5.0

    # Add surface encoding
    #surface_dummies = pd.get_dummies(df['surface'], prefix='surface')
    #df = pd.concat([df, surface_dummies], axis=1)

    return df

df_with_stats = create_enhanced_features(df_with_stats)

# Create columns for enhanced features
enhanced_features = [
    'winner_win_pct', 'loser_win_pct',
    'winner_surface_win_pct', 'loser_surface_win_pct', 
    'winner_recent_form', 'loser_recent_form',
    'winner_total_matches_before', 'loser_total_matches_before'
]

# Create difference features for enhanced metrics
for feature in enhanced_features:
    if feature.startswith('winner_'):
        loser_feature = feature.replace('winner_', 'loser_')
        if loser_feature in df_with_stats.columns:
            diff_name = f"diff_{feature[7:]}"  # Remove 'winner_' prefix
            df_with_stats[diff_name] = df_with_stats[feature] - df_with_stats[loser_feature]

# Add surface columns to your feature list
#surface_cols = [col for col in df_with_stats.columns if col.startswith('surface_')]

all_diff_cols = [c for c in df_with_stats.columns if c.startswith('diff_')]

#display(df_with_stats.head())
# Build dataset with proper time-based approach
base = df_with_stats[['match_id', 'tourney_date', 'winner_id', 'loser_id'] + all_diff_cols].copy()
#display(base.head())

# Create symmetric dataset but maintain chronological order
pos = base.copy()
pos['player_a_id'] = pos['winner_id'] 
pos['player_b_id'] = pos['loser_id']
pos['y'] = 1

neg = base.copy() 
neg[all_diff_cols] = -neg[all_diff_cols]  # Flip perspective
neg['player_a_id'] = neg['loser_id']
neg['player_b_id'] = neg['winner_id'] 
neg['y'] = 0

model_df = pd.concat([pos, neg], ignore_index=True).sort_values('match_id').reset_index(drop=True)
#display(model_df)

# Proper time-based split
split_date = model_df['tourney_date'].quantile(0.8)
train_mask = model_df['tourney_date'] < split_date

X_train = model_df.loc[train_mask, all_diff_cols]
y_train = model_df.loc[train_mask, 'y']
X_test = model_df.loc[~train_mask, all_diff_cols]
y_test = model_df.loc[~train_mask, 'y']

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
print(f"Training period: {model_df.loc[train_mask, 'tourney_date'].min()} to {model_df.loc[train_mask, 'tourney_date'].max()}")
print(f"Test period: {model_df.loc[~train_mask, 'tourney_date'].min()} to {model_df.loc[~train_mask, 'tourney_date'].max()}")

# Random Forest model with pipeline
rf_model = Pipeline([
    ('impute', SimpleImputer(strategy='median')), # Fills missing values with the median
    ('scaler', StandardScaler()), # Normalize all features to have zero mean and unit variance
    ('rf', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42,
        class_weight='balanced'
    ))
])

print("\n=== Random Forest ===")
rf_model.fit(X_train, y_train)
pred = rf_model.predict(X_test)
proba = rf_model.predict_proba(X_test)[:,1]

accuracy = accuracy_score(y_test, pred)
if y_test.nunique() == 2:
    roc_auc = roc_auc_score(y_test, proba)
    print(f"ROC AUC: {roc_auc:.3f}")
else:
    print("ROC AUC: skipped (only one class in test)")

print(f"Accuracy: {accuracy:.3f}")

# Feature importance
importances = rf_model.named_steps['rf'].feature_importances_
fi = pd.Series(importances, index=all_diff_cols).sort_values(ascending=False)
print("\nTop 10 Most Important Features:")
display(fi.head(10))

# Baseline comparison
baseline_accuracy = max(y_test.mean(), 1 - y_test.mean())
print(f"\nBaseline accuracy (predict majority class): {baseline_accuracy:.3f}")
print(f"Model improvement over baseline: {accuracy - baseline_accuracy:.3f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, pred))

Training samples: 20524, Test samples: 5194
Training period: 2020-01-06 00:00:00 to 2024-02-03 00:00:00
Test period: 2024-02-05 00:00:00 to 2024-12-18 00:00:00

=== Random Forest ===
ROC AUC: 0.693
Accuracy: 0.629

Top 10 Most Important Features:


diff_win_pct                     0.231678
diff_surface_win_pct             0.170843
diff_total_wins_before           0.152573
diff_wins_on_surface             0.110387
diff_total_matches_before        0.084170
diff_total_matches_on_surface    0.074496
diff_total_losses_before         0.065480
diff_losses_on_surface           0.048476
diff_wins_last_5                 0.032347
diff_recent_form                 0.029549
dtype: float64


Baseline accuracy (predict majority class): 0.500
Model improvement over baseline: 0.129

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.63      0.63      2597
           1       0.63      0.63      0.63      2597

    accuracy                           0.63      5194
   macro avg       0.63      0.63      0.63      5194
weighted avg       0.63      0.63      0.63      5194



In [108]:
# === Tournament Prediction Pipeline ===
def predict_match(model, df_with_stats, tournament_matches):
    """
    Predict winners for upcoming tournament matches
    
    Parameters:
    - model: trained sklearn pipeline
    - df_with_stats: historical data with player stats
    - tournament_matches: list of dicts with 'player_a_id', 'player_b_id', 'surface'
    
    Returns:
    - predictions DataFrame with match details and predictions
    """
    predictions = []
    
    # Get latest stats for each player (most recent match before tournament)
    winner_cols = ['winner_id'] + [c for c in df_with_stats.columns if c.startswith('winner_') and c != 'winner_id']
    loser_cols = ['loser_id'] + [c for c in df_with_stats.columns if c.startswith('loser_') and c != 'loser_id']
    
    latest_stats_winner = df_with_stats.groupby('winner_id').tail(1)[winner_cols].copy()
    latest_stats_loser = df_with_stats.groupby('loser_id').tail(1)[loser_cols].copy()
    
    # Rename winner columns to have consistent player_id column
    winner_rename_dict = {'winner_id': 'player_id'}
    winner_renamed = latest_stats_winner.rename(columns=winner_rename_dict)
    
    # Rename loser columns to winner format for easier lookup
    loser_rename_dict = {'loser_id': 'player_id'}
    # Only rename loser_ columns to winner_ format, excluding the ID column
    for c in loser_cols:
        if c.startswith('loser_') and c != 'loser_id':
            winner_col = c.replace('loser_', 'winner_')
            loser_rename_dict[c] = winner_col
    
    loser_renamed = latest_stats_loser.rename(columns=loser_rename_dict)
    
    # Combine both datasets (player stats from when they were winner or loser)
    all_player_stats = pd.concat([winner_renamed, loser_renamed], ignore_index=True)
    # Take the most recent stats for each player
    all_player_stats = all_player_stats.drop_duplicates(subset=['player_id'], keep='last')
    
    feature_cols = [c for c in df_with_stats.columns if c.startswith('diff_')]
    #surface_cols = [col for col in df_with_stats.columns if col.startswith('surface_')]
    all_feature_cols = feature_cols  # Match the training features exactly
    
    for match in tournament_matches:
        player_a_id = match['player_a_id']
        player_b_id = match['player_b_id']
        surface = match.get('surface', 'Hard')  # Default to Hard court
        
        # Get stats for both players

        '''
        player_a_surface_stats = df_with_stats[
            (df_with_stats['winner_id'] == player_a_id) & (df_with_stats['surface'] == surface) |
            (df_with_stats['loser_id'] == player_a_id) & (df_with_stats['surface'] == surface)
        ].tail(1)
        
        player_b_surface_stats = df_with_stats[
            (df_with_stats['winner_id'] == player_b_id) & (df_with_stats['surface'] == surface) |
            (df_with_stats['loser_id'] == player_b_id) & (df_with_stats['surface'] == surface)
        ].tail(1)
        
        # If no surface-specific stats, fall back to overall stats
        if player_a_surface_stats.empty:
            player_a_stats = all_player_stats[all_player_stats['player_id'] == player_a_id]
        else:
            # Extract player A stats from the surface-specific match
            if player_a_surface_stats['winner_id'].iloc[0] == player_a_id:
                player_a_stats = player_a_surface_stats[['winner_' + col for col in ['win_pct', 'surface_win_pct', 'recent_form', 'total_matches_before']]].rename(
                    columns=lambda x: x.replace('winner_', 'winner_')
                )
            else:
                player_a_stats = player_a_surface_stats[['loser_' + col for col in ['win_pct', 'surface_win_pct', 'recent_form', 'total_matches_before']]].rename(
                    columns=lambda x: x.replace('loser_', 'winner_')
                )
        
        if player_b_surface_stats.empty:
            player_b_stats = all_player_stats[all_player_stats['player_id'] == player_b_id]
        else:
            # Extract player A stats from the surface-specific match
            if player_b_surface_stats['winner_id'].iloc[0] == player_b_id:
                player_b_stats = player_b_surface_stats[['winner_' + col for col in ['win_pct', 'surface_win_pct', 'recent_form', 'total_matches_before']]].rename(
                    columns=lambda x: x.replace('winner_', 'winner_')
                )
            else:
                player_b_stats = player_b_surface_stats[['loser_' + col for col in ['win_pct', 'surface_win_pct', 'recent_form', 'total_matches_before']]].rename(
                    columns=lambda x: x.replace('loser_', 'winner_')
                )
        '''


        player_a_stats = all_player_stats[all_player_stats['player_id'] == player_a_id]
        player_b_stats = all_player_stats[all_player_stats['player_id'] == player_b_id]
        
        if player_a_stats.empty or player_b_stats.empty:
            print(f"Warning: Missing stats for match {player_a_id} vs {player_b_id}")
            continue
            
        # Calculate difference features (player_a - player_b)
        match_features = {}
        
        # Map enhanced feature names to difference feature names
        enhanced_mapping = {
            'winner_win_pct': 'diff_win_pct',
            'winner_surface_win_pct': 'diff_surface_win_pct',
            'winner_recent_form': 'diff_recent_form',
            'winner_total_matches_before': 'diff_total_matches_before'
        }
        
        # Original difference features
        original_mapping = {
            'winner_wins_last_5': 'diff_wins_last_5',
            'winner_total_wins_before': 'diff_total_wins_before',
            'winner_total_losses_before': 'diff_total_losses_before',
            'winner_total_matches_on_surface': 'diff_total_matches_on_surface',
            'winner_wins_on_surface': 'diff_wins_on_surface',
            'winner_losses_on_surface': 'diff_losses_on_surface'
        }
        
        all_mapping = {**enhanced_mapping, **original_mapping}
        
        for winner_col, diff_col in all_mapping.items():
            if winner_col in player_a_stats.columns and winner_col in player_b_stats.columns:
                player_a_val = player_a_stats[winner_col].iloc[0]
                player_b_val = player_b_stats[winner_col].iloc[0]
                match_features[diff_col] = player_a_val - player_b_val
        
        # Create feature vector for prediction
        X_match = pd.DataFrame([match_features])
        
        # Ensure all required features are present
        for col in all_feature_cols:
            if col not in X_match.columns:
                X_match[col] = 0
        
        # Reorder columns to match training data
        X_match = X_match[all_feature_cols]
        
        # Make prediction
        prob = model.predict_proba(X_match)[0]
        prediction = model.predict(X_match)[0]
        
        predictions.append({
            'player_a_id': player_a_id,
            'player_b_id': player_b_id,
            'surface': surface,
            'predicted_winner': player_a_id if prediction == 1 else player_b_id,
            'player_a_win_prob': prob[1],
            'player_b_win_prob': prob[0],
            'confidence': max(prob)
        })
    
    return pd.DataFrame(predictions)

# Example usage for upcoming tournament
tournament_matches = [
    {'player_a_id': 206173, 'player_b_id': 200240, 'surface': 'Hard'},
    {'player_a_id': 104925, 'player_b_id': 210530, 'surface': 'Hard'},
    {'player_a_id': 207989, 'player_b_id': 206173, 'surface': 'Clay'},
    # Add more matches...
]

# Make predictions
predictions_df = predict_match(rf_model, df_with_stats, tournament_matches)
print("Tournament Predictions:")
display(predictions_df)

# Function to get player names (if you have a player lookup table)
def add_player_names(predictions_df, player_lookup):
    """
    Add player names to predictions DataFrame
    player_lookup should be a dict: {player_id: player_name}
    """
    predictions_df['player_a_name'] = predictions_df['player_a_id'].map(player_lookup)
    predictions_df['player_b_name'] = predictions_df['player_b_id'].map(player_lookup)
    predictions_df['predicted_winner_name'] = predictions_df['predicted_winner'].map(player_lookup)
    return predictions_df

# Create player lookup from your data
player_lookup = dict(zip(df['winner_id'], df['winner_name']))
player_lookup.update(dict(zip(df['loser_id'], df['loser_name'])))

# Add names to predictions
predictions_with_names = add_player_names(predictions_df.copy(), player_lookup)
print("\nPredictions with Player Names:")
display(predictions_with_names[['player_a_name', 'player_b_name', 'predicted_winner_name', 'confidence']])

Tournament Predictions:


Unnamed: 0,player_a_id,player_b_id,surface,predicted_winner,player_a_win_prob,player_b_win_prob,confidence
0,206173,200240,Hard,206173,0.89267,0.10733,0.89267
1,104925,210530,Hard,104925,0.902378,0.097622,0.902378
2,207989,206173,Clay,206173,0.360494,0.639506,0.639506



Predictions with Player Names:


Unnamed: 0,player_a_name,player_b_name,predicted_winner_name,confidence
0,Jannik Sinner,Vit Kopriva,Jannik Sinner,0.89267
1,Novak Djokovic,Learner Tien,Novak Djokovic,0.902378
2,Carlos Alcaraz,Jannik Sinner,Jannik Sinner,0.639506


In [131]:
# Create id lookup
id_lookup = dict(zip(df['winner_name'], df['winner_id']))
id_lookup.update(dict(zip(df['loser_name'], df['loser_id'])))

def reverse_name(name: str) -> str:
    """
    Reverses a name in format (Last, First) to (First Last)
    Also ensures that the first letters of both names are uppercase and the rest lowercase.
    reverse_name("Alcaraz, Carlos") --> Carlos Alcaraz
    reverse_name("NADAL, RAFAEL") --> Rafael Nadal
    reverse_name("Federer, roger") --> Roger Federer
    """
    parts = [part.strip() for part in name.split(",")]
    if len(parts) == 2:
        first, last = parts[1].title(), parts[0].title()
        return f"{first} {last}"
    return name.title()  # fallback if not in "Last, First" format

A_bracket = [
    ["Jannik Sinner", "Vit Kopriva"],
    ["Alexei Popyrin", "Emil Ruusuvuori"],
    ["Stan Wawrinka", "Bu Yunchaokete"], # TEMPORARY
    #["Valentin Royer", "Bu Yunchaokete"],
    ["Marton Fucsovics", "Denis Shapovalov"],
    ["Alexander Bublik", "Marin Cilic"],
    ["Lorenzo Sonego", "Tristan Schoolkate"],
    ["Nuno Borges", "Brandon Holt"],
    ["Elmer Moller", "Tommy Paul"],
    ["Lorenzo Musetti", "Giovanni Mpetshi Perricard"],
    ["Quentin Halys", "David Goffin"],
    ["Jenson Brooksby", "Aleksandar Vukic"],
    ["Francesco Passaro", "Flavio Cobolli"],
    ["Gabriel Diallo", "Damir Dzumhur"],
    ["Jaume Munar", "Jaime Faria"],
    ["Zizou Bergs", "Chun Hsin Tseng"],
    ["Federico Agustin Gomez", "Jack Draper"],
    ["Alexander Zverev", "Alejandro Tabilo"],
    ["Roberto Bautista Agut", "Jacob Fearnley"],
    ["Gael Monfils", "Roman Safiullin"],
    ["Billy Harris", "Felix Auger Aliassime"],
    ["Ugo Humbert", "Adam Walton"],
    ["Aleksandar Kovacevic", "Coleman Wong"],
    ["James Duckworth", "Dominik Koepfer"], # TEMPORARY
    #["James Duckworth", "Tristan Boyer"],
    ["Dino Prizmic", "Andrey Rublev"],
    ["Karen Khachanov", "Nishesh Basavareddy"],
    ["Hugo Dellien", "Kamil Majchrzak"],
    ["Leandro Riedi", "Pedro Martinez"],
    ["Matteo Arnaldi", "Francisco Cerundolo"],
    ["Stefanos Tsitsipas", "Alexandre Muller"],
    ["Daniel Altmaier", "Hamad Medjedovic"],
    ["Hugo Gaston", "Shintaro Mochizuki"],
    ["Christopher Oconnell", "Alex De Minaur"],
]
    
B_bracket = [
    ["Novak Djokovic", "Learner Tien"],
    ["Zachary Svajda", "Zsombor Piros"],
    ["Cameron Norrie", "Sebastian Korda"],
    ["Francisco Comesana", "Alex Michelsen"],
    ["Frances Tiafoe", "Yoshihito Nishioka"],
    ["Martin Damm", "Darwin Blanch"],
    ["Jan Lennard Struff", "Mackenzie Mcdonald"],
    ["Botic Van De Zandschulp", "Holger Rune"],
    ["Jakub Mensik", "Nicolas Jarry"],
    ["Ugo Blanchet", "Fabian Marozsan"],
    ["Joao Fonseca", "Miomir Kecmanovic"],
    ["Luca Nardi", "Tomas Machac"],
    ["Brandon Nakashima", "Jesper De Jong"],
    ["Jerome Kym", "Ethan Quinn"],
    ["Sebastian Baez", "Lloyd Harris"],
    ["Emilio Nava", "Taylor Fritz"],
    ["Ben Shelton", "Ignacio Buse"],
    ["Pablo Carreno Busta", "Pablo Llamas Ruiz"],
    ["Jordan Thompson", "Corentin Moutet"],
    ["Adrian Mannarino", "Tallon Griekspoor"],
    ["Jiri Lehecka", "Borna Coric"],
    ["Camilo Ugo Carabelli", "Tomas Martin Etcheverry"],
    ["Daniel Elahi Galan", "Raphael Collignon"],
    ["Sebastian Ofner", "Casper Ruud"],
    ["Daniil Medvedev", "Benjamin Bonzi"],
    ["Mariano Navone", "Marcos Giron"],
    ["Roberto Carballes Baena", "Arthur Rinderknech"],
    ["Alexander Shevchenko", "Alejandro Davidovich Fokina"],
    ["Luciano Darderi", "Rinky Hijikata"],
    ["Pavel Kotov", "Eliot Spizzirri"], # TEMPORARY
    #["Stefan Dostanic", "Eliot Spizzirri"],
    ["Mattia Bellucci", "Juncheng Shang"],
    ["Reilly Opelka", "Carlos Alcaraz"],
]

def convert_brackets_to_ids(name_brackets, id_lookup):
    """
    Convert name brackets to ID brackets using the id_lookup table
    
    Parameters:
    - name_brackets: list of lists containing player names
    - id_lookup: dictionary mapping player names to player IDs
    
    Returns:
    - list of lists containing player IDs
    """
    bracket_ids = []
    for match in name_brackets:
        match_ids = []
        for player_name in match:
            player_id = id_lookup.get(player_name)
            if player_id is None:
                print(f"Warning: Player '{player_name}' not found in lookup table")
            match_ids.append(player_id)
        bracket_ids.append(match_ids)
    return bracket_ids

# Convert both brackets using the function
A_bracket_ids = convert_brackets_to_ids(A_bracket, id_lookup)
B_bracket_ids = convert_brackets_to_ids(B_bracket, id_lookup)
full_bracket_ids = A_bracket_ids + B_bracket_ids

print("A_bracket_ids:", A_bracket_ids)
print("B_bracket_ids:", B_bracket_ids)
print(full_bracket_ids)

A_bracket_ids: [[206173, 200240], [200615, 200325], [104527, 207352], [105916, 133430], [122330, 105227], [132283, 209262], [132686, 200587], [209284, 126205], [207518, 208659], [111460, 105676], [202385, 126846], [208859, 207925], [209113, 106000], [144719, 210262], [200267, 202358], [144870, 207733], [100644, 126214], [105138, 207985], [104792, 126128], [126129, 200000], [200005, 200443], [206499, 209409], [105902, 136440], [209976, 126094], [111575, 210460], [106198, 111794], [209857, 124079], [208286, 202103], [126774, 124186], [127157, 209098], [200384, 208278], [106331, 200282]]
B_bracket_ids: [[104925, 210530], [208260, 200436], [111815, 200624], [207681, 210506], [126207, 106415], [102093, 210464], [105526, 111456], [122298, 208029], [210150, 111797], [200259, 206681], [211663, 200175], [208134, 207830], [206909, 207411], [208843, 210319], [202104, 144750], [207182, 126203], [210097, 209860], [105807, 208010], [111442, 144895], [105173, 134868], [208103, 106432], [200116, 14486

In [132]:
def simulate_full_tournament(model, df_with_stats, tournament_draw, surface='Hard', tournament_name="Tournament"):
    """
    Simulate a complete tournament from first round to final
    
    Parameters:
    - model: trained sklearn pipeline
    - df_with_stats: historical data with player stats
    - tournament_draw: list of player_ids in bracket order (must be power of 2)
    - surface: tournament surface
    - tournament_name: name for display
    
    Returns:
    - dict with results for each round and overall winner
    """
    import math
    
    # Validate draw size is power of 2
    if len(tournament_draw) & (len(tournament_draw) - 1) != 0:
        raise ValueError(f"Tournament draw must be power of 2. Got {len(tournament_draw)} players.")
    
    num_rounds = int(math.log2(len(tournament_draw)))
    print(f"=== {tournament_name} Simulation ===")
    print(f"Players: {len(tournament_draw)}, Rounds: {num_rounds}")
    print(f"Surface: {surface}\n")
    
    tournament_results = {}
    current_players = tournament_draw.copy()
    
    # Create player lookup for names
    player_lookup = dict(zip(df['winner_id'], df['winner_name']))
    player_lookup.update(dict(zip(df['loser_id'], df['loser_name'])))
    
    for round_num in range(1, num_rounds + 1):
        # Determine round name
        remaining_players = len(current_players)
        if remaining_players == 2:
            round_name = "Final"
        elif remaining_players == 4:
            round_name = "Semifinals"
        elif remaining_players == 8:
            round_name = "Quarterfinals"
        elif remaining_players == 16:
            round_name = "Round of 16"
        elif remaining_players == 32:
            round_name = "Round of 32"
        elif remaining_players == 64:
            round_name = "Round of 64"
        elif remaining_players == 128:
            round_name = "Round of 128"
        else:
            round_name = f"Round {round_num}"
        
        print(f"=== {round_name} ({remaining_players} players) ===")
        
        # Create matches for this round
        round_matches = []
        for i in range(0, len(current_players), 2):
            round_matches.append({
                'player_a_id': current_players[i],
                'player_b_id': current_players[i + 1],
                'surface': surface
            })
        
        # Predict matches for this round
        round_predictions = predict_match(model, df_with_stats, round_matches)
        
        # Display results with player names
        winners = []
        for _, match in round_predictions.iterrows():
            player_a_name = player_lookup.get(match['player_a_id'], f"Player {match['player_a_id']}")
            player_b_name = player_lookup.get(match['player_b_id'], f"Player {match['player_b_id']}")
            winner_name = player_lookup.get(match['predicted_winner'], f"Player {match['predicted_winner']}")
            
            print(f"{player_a_name} vs {player_b_name}")
            print(f"  → Winner: {winner_name} (confidence: {match['confidence']:.3f})")
            print(f"  → Probabilities: {player_a_name} {match['player_a_win_prob']:.3f}, {player_b_name} {match['player_b_win_prob']:.3f}")
            print()
            
            winners.append(match['predicted_winner'])
        
        # Store round results
        tournament_results[round_name] = {
            'predictions': round_predictions,
            'winners': winners,
            'round_number': round_num
        }
        
        # Winners advance to next round
        current_players = winners
        
        # If final, break
        if len(current_players) == 1:
            break
    
    # Determine overall winner
    champion = current_players[0]
    champion_name = player_lookup.get(champion, f"Player {champion}")
    
    print(f"🏆 TOURNAMENT WINNER: {champion_name} (ID: {champion}) 🏆")
    
    tournament_results['champion'] = {
        'player_id': champion,
        'player_name': champion_name
    }
    
    return tournament_results

def create_manual_tournament_draw(first_round_matches):
    """
    Create a tournament draw from manually specified first round matches
    
    Parameters:
    - first_round_matches: list of tuples/lists with (player_a_id, player_b_id) for each first round match
    
    Returns:
    - list of player IDs in bracket order for tournament simulation
    """
    
    # Validate that we have a power of 2 number of players
    total_players = len(first_round_matches) * 2
    if total_players & (total_players - 1) != 0:
        raise ValueError(f"Total players ({total_players}) must be a power of 2. Got {len(first_round_matches)} matches.")
    
    print(f"Creating manual tournament draw with {len(first_round_matches)} first round matches ({total_players} players)")
    
    # Flatten the matches into bracket order
    tournament_draw = []
    for match in first_round_matches:
        tournament_draw.extend([match[0], match[1]])
    
    return tournament_draw

def input_manual_tournament():
    """
    Interactive function to input tournament pairings manually
    
    Returns:
    - tournament draw list
    """
    print("=== Manual Tournament Draw Input ===")
    print("Enter first round matches. Type 'done' when finished.")
    print("Format: player_a_id,player_b_id")
    print("Example: 104925,104745")
    print()
    
    matches = []
    match_num = 1
    
    while True:
        user_input = input(f"Match {match_num}: ").strip()
        
        if user_input.lower() == 'done':
            break
            
        try:
            player_a, player_b = map(int, user_input.split(','))
            matches.append((player_a, player_b))
            print(f"  Added: {player_a} vs {player_b}")
            match_num += 1
        except ValueError:
            print("  Invalid format. Use: player_a_id,player_b_id")
    
    if not matches:
        print("No matches entered.")
        return []
    
    # Check if power of 2
    total_players = len(matches) * 2
    if total_players & (total_players - 1) != 0:
        print(f"Warning: {total_players} players is not a power of 2. Tournament simulation may fail.")
    
    return create_manual_tournament_draw(matches)

# Example usage with predefined matches
def example_manual_tournament(draw):
    """
    Example of creating a manual tournament with specific first round pairings
    """
    
    # Create tournament draw
    tournament_draw = create_manual_tournament_draw(full_bracket_ids)
    
    # Display the matchups with player names
    player_lookup = dict(zip(df['winner_id'], df['winner_name']))
    player_lookup.update(dict(zip(df['loser_id'], df['loser_name'])))
    
    print("\nRound 1 Matchups:")
    print("=" * 50)
    for i, (p1, p2) in enumerate(draw, 1):
        name1 = player_lookup.get(p1, f"Player {p1}")
        name2 = player_lookup.get(p2, f"Player {p2}")
        print(f"Match {i}: {name1} vs {name2}")
    
    return tournament_draw

# Run example
manual_draw = example_manual_tournament(full_bracket_ids)

# Simulate the manually created tournament
if manual_draw:
    results = simulate_full_tournament(
        model=rf_model,
        df_with_stats=df_with_stats,
        tournament_draw=manual_draw,
        surface='Hard',
        tournament_name="Custom Manual Tournament"
    )


'''
def create_tournament_bracket(player_ids, seeded=True):
    """
    Create a tournament bracket from player IDs
    
    Parameters:
    - player_ids: list of player IDs
    - seeded: if True, assumes list is in seeding order and creates proper bracket
    
    Returns:
    - list of player IDs in bracket order
    """
    
    n = len(player_ids)
    
    # Pad to next power of 2 if needed
    if n & (n - 1) != 0:
        next_power = 2 ** math.ceil(math.log2(n))
        print(f"Padding from {n} to {next_power} players with byes")
        player_ids.extend([None] * (next_power - n))  # None represents a bye
    
    if not seeded:
        return player_ids
    
    # Create proper seeded bracket (1 vs lowest, 2 vs second-lowest, etc.)
    def create_bracket_recursive(seeds):
        if len(seeds) <= 2:
            return seeds
        
        # Split into top and bottom halves
        mid = len(seeds) // 2
        top_half = seeds[:mid]
        bottom_half = seeds[mid:]
        bottom_half.reverse()  # Reverse bottom half for proper seeding
        
        # Recursively create brackets for each half
        top_bracket = create_bracket_recursive(top_half)
        bottom_bracket = create_bracket_recursive(bottom_half)
        
        return top_bracket + bottom_bracket
    
    return create_bracket_recursive(player_ids)


def get_player_rankings(df_with_stats, top_n=64):
    """
    Get top N players by recent performance
    
    Parameters:
    - df_with_stats: historical data
    - top_n: number of top players to return
    
    Returns:
    - list of player IDs in ranking order
    """
    # Get recent performance for all players
    recent_stats = []
    
    # Get latest stats for each player
    for player_id in df_with_stats['winner_id'].unique():
        player_matches = df_with_stats[
            (df_with_stats['winner_id'] == player_id) | 
            (df_with_stats['loser_id'] == player_id)
        ].tail(20)  # Last 20 matches
        
        if len(player_matches) >= 10:  # Minimum matches for ranking
            wins = len(player_matches[player_matches['winner_id'] == player_id])
            win_pct = wins / len(player_matches)
            recent_stats.append({
                'player_id': player_id,
                'matches': len(player_matches),
                'wins': wins,
                'win_pct': win_pct
            })
    
    # Sort by win percentage and total matches
    rankings = sorted(recent_stats, key=lambda x: (x['win_pct'], x['matches']), reverse=True)
    
    return [p['player_id'] for p in rankings[:top_n]]

# Example usage - Create and simulate a 32-player tournament
print("Creating tournament with top 32 players...")
top_players = get_player_rankings(df_with_stats, top_n=32)

# Create proper seeded bracket
tournament_bracket = create_tournament_bracket(top_players, seeded=True)

# Remove any None values (byes) for this example
tournament_bracket = [p for p in tournament_bracket if p is not None]

# Simulate the tournament

results = simulate_full_tournament(
    model=rf_model,
    df_with_stats=df_with_stats,
    tournament_draw=tournament_bracket,
    surface='Hard',
    tournament_name="ATP Masters 1000"
)

# Display summary
print("\n" + "="*50)
print("TOURNAMENT SUMMARY")
print("="*50)
for round_name, round_data in results.items():
    if round_name != 'champion':
        print(f"{round_name}: {len(round_data['winners'])} winners")

print(f"\n🏆 : {results['champion']['player_name']}")
'''

Creating manual tournament draw with 64 first round matches (128 players)

Round 1 Matchups:
Match 1: Jannik Sinner vs Vit Kopriva
Match 2: Alexei Popyrin vs Emil Ruusuvuori
Match 3: Stan Wawrinka vs Bu Yunchaokete
Match 4: Marton Fucsovics vs Denis Shapovalov
Match 5: Alexander Bublik vs Marin Cilic
Match 6: Lorenzo Sonego vs Tristan Schoolkate
Match 7: Nuno Borges vs Brandon Holt
Match 8: Elmer Moller vs Tommy Paul
Match 9: Lorenzo Musetti vs Giovanni Mpetshi Perricard
Match 10: Quentin Halys vs David Goffin
Match 11: Jenson Brooksby vs Aleksandar Vukic
Match 12: Francesco Passaro vs Flavio Cobolli
Match 13: Gabriel Diallo vs Damir Dzumhur
Match 14: Jaume Munar vs Jaime Faria
Match 15: Zizou Bergs vs Chun Hsin Tseng
Match 16: Federico Agustin Gomez vs Jack Draper
Match 17: Alexander Zverev vs Alejandro Tabilo
Match 18: Roberto Bautista Agut vs Jacob Fearnley
Match 19: Gael Monfils vs Roman Safiullin
Match 20: Billy Harris vs Felix Auger Aliassime
Match 21: Ugo Humbert vs Adam Walton


'\ndef create_tournament_bracket(player_ids, seeded=True):\n    """\n    Create a tournament bracket from player IDs\n    \n    Parameters:\n    - player_ids: list of player IDs\n    - seeded: if True, assumes list is in seeding order and creates proper bracket\n    \n    Returns:\n    - list of player IDs in bracket order\n    """\n    \n    n = len(player_ids)\n    \n    # Pad to next power of 2 if needed\n    if n & (n - 1) != 0:\n        next_power = 2 ** math.ceil(math.log2(n))\n        print(f"Padding from {n} to {next_power} players with byes")\n        player_ids.extend([None] * (next_power - n))  # None represents a bye\n    \n    if not seeded:\n        return player_ids\n    \n    # Create proper seeded bracket (1 vs lowest, 2 vs second-lowest, etc.)\n    def create_bracket_recursive(seeds):\n        if len(seeds) <= 2:\n            return seeds\n        \n        # Split into top and bottom halves\n        mid = len(seeds) // 2\n        top_half = seeds[:mid]\n        botto