In [44]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Display settings for Jupyter notebooks (if applicable)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')

# Statistical tests and modeling
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Optional: Machine learning setup
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score

# Optional: Working with SQL
import sqlite3


In [2]:
atp = pd.read_csv('atp_tennis.csv')
atp.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
0,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,63,77,-1,-1,-1.0,-1.0,6-4 6-2
1,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,56,5,-1,-1,-1.0,-1.0,3-6 3-6
2,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,40,655,-1,-1,-1.0,-1.0,6-7 7-5 6-3
3,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,87,65,-1,-1,-1.0,-1.0,1-6 4-6
4,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,81,198,-1,-1,-1.0,-1.0,7-6 5-7 6-4


In [12]:
# 1. Calculate head-to-head difference for all matches (pre-match)

def get_h2h_stats(row, history_df):
    p1 = row['Player_1']
    p2 = row['Player_2']

    h2h = history_df[
        ((history_df['Player_1'] == p1) & (history_df['Player_2'] == p2)) |
        ((history_df['Player_1'] == p2) & (history_df['Player_2'] == p1))
    ]

    total_matches = len(h2h)
    if total_matches == 0:
        return pd.Series([0, 0, 0, 0.0])

    p1_wins = sum(h2h['Winner'] == p1)
    avg_sets = h2h['Score'].apply(lambda s: len(str(s).split()) if pd.notnull(s) else 0).mean()
    last_match = h2h.sort_values('Date').iloc[-1]
    p1_last_win = 1 if last_match['Winner'] == p1 else 0

    return pd.Series([total_matches, p1_wins, p1_last_win, avg_sets],
                     index=['h2h_matches', 'h2h_wins_p1', 'h2h_last_win', 'h2h_sets_avg'])

# 2. Compute Grand Slam finals won per player (all time)

gs_finals = atp[(atp['Round'] == 'F') & (atp['Series'] == 'Grand Slam')]
gs_final_wins = gs_finals['Winner'].value_counts().to_dict()

atp['gs_finals_won_1'] = atp['Player_1'].map(gs_final_wins).fillna(0)
atp['gs_finals_won_2'] = atp['Player_2'].map(gs_final_wins).fillna(0)
atp['gs_final_diff'] = atp['gs_finals_won_1'] - atp['gs_finals_won_2']

In [19]:
# Filter to recent matches and best-of-5 only
atp_bo5 = atp[(atp['Date'] >= '2024-01-01') & (atp['Best of'] == 5)].copy()

# Target: Number of sets played
atp_bo5['num_sets'] = atp_bo5['Score'].str.count(r'\d+-\d+')

# Target 2: Winner is Player_1 (1) or Player_2 (0)
atp_bo5['target_win'] = (atp_bo5['Winner'] == atp_bo5['Player_1']).astype(int)
# Add H2H features
h2h_features = atp_bo5.apply(lambda row: get_h2h_stats(row, atp[atp['Date'] < row['Date']]), axis=1)
atp_bo5 = pd.concat([atp_bo5, h2h_features], axis=1)
# After adding H2H features to your dataset
atp_bo5[['h2h_matches', 'h2h_wins_p1', 'h2h_last_win', 'h2h_sets_avg']] = \
    atp_bo5[['h2h_matches', 'h2h_wins_p1', 'h2h_last_win', 'h2h_sets_avg']].fillna(0)

# Drop matches without a valid score
atp_bo5 = atp_bo5[atp_bo5['num_sets'].isin([3, 4, 5])]

In [20]:
# Feature engineering
atp_bo5['rank_diff'] = atp_bo5['Rank_2'] - atp_bo5['Rank_1']
atp_bo5['pts_diff'] = atp_bo5['Pts_1'] - atp_bo5['Pts_2']
atp_bo5['log_odds_ratio'] = np.log(atp_bo5['Odd_2'] / atp_bo5['Odd_1'])

In [36]:
# Select features
features = [
    'rank_diff', 'pts_diff', 'log_odds_ratio',
    'Surface', 'Round', 'Series',
    'h2h_matches', 'h2h_wins_p1', 'h2h_last_win', 'h2h_sets_avg'
]
X = atp_bo5[features]

# 6. One-hot encode categoricals

X = pd.get_dummies(X, columns=['Surface', 'Round', 'Series'], drop_first=True)
model_features = X.columns.tolist()  # <-- This defines model_features for future use
# 7. Prepare targets

y_win = atp_bo5['target_win']
y_sets = atp_bo5['num_sets']

In [37]:
atp_bo5.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score,h2h_diff,gs_finals_won_1,gs_finals_won_2,gs_final_diff,num_sets,target_win,0,1,2,3,h2h_last_win,h2h_matches,h2h_sets_avg,h2h_wins_p1,rank_diff,pts_diff,log_odds_ratio
61668,Australian Open,2024-01-14,Grand Slam,Outdoor,Hard,1st Round,5,Arnaldi M.,Walton A.,Arnaldi M.,41,174,1077,354,1.2,4.5,7-6 6-2 6-4,0,0.0,0.0,0.0,3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,133,723,1.32
61669,Australian Open,2024-01-14,Grand Slam,Outdoor,Hard,1st Round,5,Shevchenko A.,Munar J.,Munar J.,48,82,975,706,1.36,3.2,3-6 3-6 1-6,0,0.0,0.0,0.0,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34,269,0.86
61670,Australian Open,2024-01-14,Grand Slam,Outdoor,Hard,1st Round,5,Sinner J.,Van De Zandschulp B.,Sinner J.,4,59,6490,890,1.06,10.0,6-4 7-5 6-3,0,0.0,0.0,0.0,3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55,5600,2.24
61671,Australian Open,2024-01-14,Grand Slam,Outdoor,Hard,1st Round,5,Rinderknech A.,Kotov P.,Kotov P.,94,65,629,835,2.3,1.62,5-7 1-6 7-6 7-6 3-6,0,0.0,0.0,0.0,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-29,-206,-0.35
61672,Australian Open,2024-01-14,Grand Slam,Outdoor,Hard,1st Round,5,Cerundolo F.,Sweeny D.,Cerundolo F.,21,257,1760,228,1.36,3.2,3-6 6-3 6-4 2-6 6-2,0,0.0,0.0,0.0,5,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,236,1532,0.86


In [45]:
# 8. Train-test split

from sklearn.model_selection import train_test_split

X_train_win, X_test_win, y_train_win, y_test_win = train_test_split(X, y_win, test_size=0.2, random_state=42)
X_train_sets, X_test_sets, y_train_sets, y_test_sets = train_test_split(X, y_sets, test_size=0.2, random_state=42)

# 9. Train models

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

model_win = LogisticRegression(max_iter=1000)
model_win.fit(X_train_win, y_train_win)

# model_sets = LogisticRegression(multi_class='multinomial', max_iter=1000)
# model_sets.fit(X_train_sets, y_train_sets)


# Set count prediction model
model_sets = RandomForestClassifier(n_estimators=200, random_state=42)
model_sets.fit(X_train_sets, y_train_sets)
y_pred_sets = model_sets.predict(X_test_sets)

print("Set Count Prediction")
print("Confusion Matrix:\n", confusion_matrix(y_test_sets, y_pred_sets))
print("\nClassification Report:\n", classification_report(y_test_sets, y_pred_sets))
# 10. Evaluate

print("Match Winner Prediction Report:")
print(confusion_matrix(y_test_win, model_win.predict(X_test_win)))
print(classification_report(y_test_win, model_win.predict(X_test_win)))

print("\nNumber of Sets Prediction Report:")
print(confusion_matrix(y_test_sets, model_sets.predict(X_test_sets)))
print(classification_report(y_test_sets, model_sets.predict(X_test_sets)))

Set Count Prediction
Confusion Matrix:
 [[33 13 13]
 [26 22  5]
 [16 10  6]]

Classification Report:
               precision    recall  f1-score   support

           3       0.44      0.56      0.49        59
           4       0.49      0.42      0.45        53
           5       0.25      0.19      0.21        32

    accuracy                           0.42       144
   macro avg       0.39      0.39      0.39       144
weighted avg       0.42      0.42      0.41       144

Match Winner Prediction Report:
[[49 14]
 [25 56]]
              precision    recall  f1-score   support

           0       0.66      0.78      0.72        63
           1       0.80      0.69      0.74        81

    accuracy                           0.73       144
   macro avg       0.73      0.73      0.73       144
weighted avg       0.74      0.73      0.73       144


Number of Sets Prediction Report:
[[33 13 13]
 [26 22  5]
 [16 10  6]]
              precision    recall  f1-score   support

           3

In [46]:
today_matches = pd.DataFrame([
    {
        'Date': pd.Timestamp('2025-07-11'),  # Match date, important for H2H calculation
        'Player_1': 'Carlos Alcaraz',
        'Player_2': 'Taylor Fritz',
        'Rank_1': 2,
        'Rank_2': 5,
        'Pts_1': 9300,
        'Pts_2': 4635,
        'Odd_1': 1 + 1/6,    # Decimal odds conversion from fractional 1/6 ≈ 1.1667
        'Odd_2': 1 + 4/1,    # 4/1 fractional → 5.0 decimal
        'Surface': 'Grass',
        'Round': 'Semifinal',
        'Series': 'Grand Slam'
    },
    {
        'Date': pd.Timestamp('2025-07-11'),
        'Player_1': 'Jannik Sinner',
        'Player_2': 'Novak Djokovic',
        'Rank_1': 1,
        'Rank_2': 6,
        'Pts_1': 10430,
        'Pts_2': 4630,
        'Odd_1': 1 + 2/5,    # 2/5 fractional → 1.4 decimal
        'Odd_2': 1 + 2/1,    # 2/1 fractional → 3.0 decimal
        'Surface': 'Grass',
        'Round': 'Semifinal',
        'Series': 'Grand Slam'
    },
    {
        'Date': pd.Timestamp('2024-07-13'),
        'Player_1': 'Jannik Sinner',
        'Player_2': 'Carlos Alcaraz',
        'Rank_1': 1,
        'Rank_2': 2,
        'Pts_1': 10430,
        'Pts_2': 9300,
        'Odd_1': 1.9,  # Example decimal odds for Sinner
        'Odd_2': 2.0,  # Example decimal odds for Alcaraz
        'Surface': 'Grass',
        'Round': 'Final',
        'Series': 'Grand Slam'
}
])

In [47]:
def prepare_today_matches(df, full_data, model_features):
    df = df.copy()

    df['rank_diff'] = df['Rank_2'] - df['Rank_1']
    df['pts_diff'] = df['Pts_1'] - df['Pts_2']
    df['log_odds_ratio'] = np.log(df['Odd_2'] / df['Odd_1'])

    h2h_feats = df.apply(lambda row: get_h2h_stats(row, full_data), axis=1)
    df = pd.concat([df, h2h_feats], axis=1)

    df = pd.get_dummies(df, columns=['Surface', 'Round', 'Series'], drop_first=True)

    # Align with training features
    missing_cols = [col for col in model_features if col not in df.columns]
    for col in missing_cols:
        df[col] = 0

    return df[model_features]

    
    # 2. Calculate GS finals won per player (aggregate once outside this function ideally)
    gs_finals = atp_history[(atp_history['Round'] == 'F') & (atp_history['Series'] == 'Grand Slam')]
    gs_final_wins = gs_finals['Winner'].value_counts().to_dict()
    
    today_matches['gs_finals_won_1'] = today_matches['Player_1'].map(gs_final_wins).fillna(0)
    today_matches['gs_finals_won_2'] = today_matches['Player_2'].map(gs_final_wins).fillna(0)
    today_matches['gs_final_diff'] = today_matches['gs_finals_won_1'] - today_matches['gs_finals_won_2']
    
    # 3. Calculate other numeric features
    today_matches['rank_diff'] = today_matches['Rank_2'] - today_matches['Rank_1']
    today_matches['pts_diff'] = today_matches['Pts_1'] - today_matches['Pts_2']
    today_matches['log_odds_ratio'] = np.log(today_matches['Odd_2'] / today_matches['Odd_1'])
    
    # 4. Select feature columns before encoding
    feature_cols = ['rank_diff', 'pts_diff', 'log_odds_ratio', 'h2h_diff', 'gs_final_diff',
                    'Surface', 'Round', 'Series']
    X = today_matches[feature_cols].copy()
    
    # 5. One-hot encode categoricals (must match training)
    X_encoded = pd.get_dummies(X, columns=['Surface', 'Round', 'Series'], drop_first=True)
    
    # 6. Add missing columns with 0 (to match model's features exactly)
    for col in model_features:
        if col not in X_encoded.columns:
            X_encoded[col] = 0
            
    # 7. Remove any extra columns not in model_features
    X_ready = X_encoded[model_features].copy()
    
    return X_ready


In [48]:
atp['Date'] = pd.to_datetime(atp['Date'])
today_matches['Date'] = pd.to_datetime(today_matches['Date'])

In [49]:
# Prepare features for today's matches
X_today = prepare_today_matches(today_matches, atp, model_features)

# Fill any missing values in H2H features with 0
h2h_cols = ['h2h_matches', 'h2h_wins_p1', 'h2h_last_win', 'h2h_sets_avg']
for col in h2h_cols:
    if col in X_today.columns:
        X_today[col] = X_today[col].fillna(0)

# Predict winner probabilities (probability Player_1 wins)
win_probs = model_win.predict_proba(X_today)[:, 1]

# Predict number of sets
sets_pred = model_sets.predict(X_today)

# Nicely print results
for i, row in today_matches.iterrows():
    p1 = row['Player_1']
    p2 = row['Player_2']
    prob_p1_win = win_probs[i] * 100  # convert to %
    sets = sets_pred[i]

    # Predicted winner and probability
    if prob_p1_win > 50:
        predicted_winner = p1
        prob_winner = prob_p1_win
    else:
        predicted_winner = p2
        prob_winner = 100 - prob_p1_win

    print(f"Match: {p1} vs {p2}")
    print(f"Predicted winner: {predicted_winner} with probability {prob_winner:.1f}%")
    print(f"Predicted number of sets: {sets}")
    print("-" * 40)


Match: Carlos Alcaraz vs Taylor Fritz
Predicted winner: Carlos Alcaraz with probability 89.5%
Predicted number of sets: 3
----------------------------------------
Match: Jannik Sinner vs Novak Djokovic
Predicted winner: Jannik Sinner with probability 81.7%
Predicted number of sets: 3
----------------------------------------
Match: Jannik Sinner vs Carlos Alcaraz
Predicted winner: Jannik Sinner with probability 51.1%
Predicted number of sets: 4
----------------------------------------


In [43]:
# # Same feature transformations as in training
# today_matches['rank_diff'] = today_matches['Rank_2'] - today_matches['Rank_1']
# today_matches['pts_diff'] = today_matches['Pts_1'] - today_matches['Pts_2']
# today_matches['log_odds_ratio'] = np.log(today_matches['Odd_2'] / today_matches['Odd_1'])


In [None]:
# # Must match training data structure
# today_matches_encoded = pd.get_dummies(today_matches[['rank_diff', 'pts_diff', 'log_odds_ratio', 'Surface', 'Round', 'Series']], drop_first=True)

# # Align columns with training data
# missing_cols = set(X.columns) - set(today_matches_encoded.columns)
# for col in missing_cols:
#     today_matches_encoded[col] = 0
# today_matches_encoded = today_matches_encoded[X.columns]  # Ensure correct order


In [None]:
# today_matches['Player_1'] = ['Carlos Alcaraz', 'Jannik Sinner']
# today_matches['Player_2'] = ['Taylor Fritz', 'Novak Djokovic']
# # Predict match winner (1 = Player_1 wins)
# today_matches['pred_win'] = model_win.predict(today_matches_encoded)
# today_matches['win_proba'] = model_win.predict_proba(today_matches_encoded)[:, 1]  # P(Player_1 wins)

# # Predict number of sets
# today_matches['pred_num_sets'] = model_sets.predict(today_matches_encoded)

In [None]:
# print(today_matches[['Player_1', 'Player_2', 'pred_win', 'win_proba', 'pred_num_sets']])