In [85]:
import glob
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [86]:
# Load the data

# Collect every Excel workbook in the data directory
excel_files = glob.glob('../data/*.xlsx')

# Fail early with a clear message; otherwise pd.concat would raise the much
# less helpful "No objects to concatenate" when nothing matched the pattern
if not excel_files:
    raise FileNotFoundError("No Excel files found matching '../data/*.xlsx'")

# Read the workbooks in sorted path order so the combined row order is deterministic
dfs = [pd.read_excel(file) for file in sorted(excel_files)]

# Stack all workbooks into one frame with a fresh 0..n-1 index
tennis_df = pd.concat(dfs, ignore_index=True)

tennis_df.head()


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,LBW,LBL,PSW,PSL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Brisbane,Brisbane International,2012-12-31,ATP250,Outdoor,Hard,1st Round,3.0,Mayer F.,...,1.44,2.62,1.47,2.85,1.44,2.63,1.47,3.2,1.42,2.78
1,1,Brisbane,Brisbane International,2012-12-31,ATP250,Outdoor,Hard,1st Round,3.0,Nieminen J.,...,1.8,1.91,1.8,2.1,1.73,2.0,1.8,2.26,1.73,2.05
2,1,Brisbane,Brisbane International,2012-12-31,ATP250,Outdoor,Hard,1st Round,3.0,Nishikori K.,...,1.29,3.5,1.3,3.85,1.3,3.2,1.3,4.2,1.28,3.58
3,1,Brisbane,Brisbane International,2012-12-31,ATP250,Outdoor,Hard,1st Round,3.0,Baghdatis M.,...,1.08,7.0,1.08,9.43,1.07,7.0,1.1,9.5,1.08,7.76
4,1,Brisbane,Brisbane International,2013-01-01,ATP250,Outdoor,Hard,1st Round,3.0,Istomin D.,...,1.91,1.8,1.88,2.0,1.91,1.8,2.05,2.0,1.88,1.85


In [87]:
# Check the size of the combined frame as (rows, columns)
tennis_df.shape

(30161, 42)

In [88]:
# Show each column's dtype and non-null count to spot columns with missing values
tennis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 42 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         30161 non-null  int64         
 1   Location    30161 non-null  object        
 2   Tournament  30161 non-null  object        
 3   Date        30161 non-null  datetime64[ns]
 4   Series      30161 non-null  object        
 5   Court       30161 non-null  object        
 6   Surface     30161 non-null  object        
 7   Round       30161 non-null  object        
 8   Best of     30146 non-null  float64       
 9   Winner      30161 non-null  object        
 10  Loser       30161 non-null  object        
 11  WRank       30149 non-null  float64       
 12  LRank       30098 non-null  float64       
 13  WPts        30150 non-null  float64       
 14  LPts        30098 non-null  float64       
 15  W1          29958 non-null  float64       
 16  L1          29961 non-

In [89]:
# List every available column name before choosing which ones to keep
tennis_df.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'EXW', 'EXL', 'LBW', 'LBL', 'PSW',
       'PSL', 'SJW', 'SJL', 'MaxW', 'MaxL', 'AvgW', 'AvgL'],
      dtype='object')

In [90]:
# Preprocessing
# Drop every column that is not used as a model input, leaving only
# Surface, WRank, LRank, WPts, LPts, B365W and B365L.
tennis_df = tennis_df.drop(columns=['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court',
       'Round', 'Best of', 'Winner', 'Loser',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'EXW', 'EXL', 'LBW', 'LBL', 'PSW',
       'PSL', 'SJW', 'SJL', 'MaxW', 'MaxL', 'AvgW', 'AvgL'])

print(f"Remaining columns: {tennis_df.columns}")
print(f"Number of rows: {tennis_df.shape[0]}")

# Encode categorical variables
surface_codes = {'Hard': 0, 'Clay': 1, 'Grass': 2}

# Series.map converts any value that is not a key of the mapping into NaN,
# which a later dropna() would silently remove. Report such values so the
# data loss is visible instead of hidden.
unmapped_surfaces = sorted(
    set(tennis_df['Surface'].dropna().unique()) - set(surface_codes),
    key=str,
)
if unmapped_surfaces:
    print(f"Warning: unmapped Surface values will become NaN: {unmapped_surfaces}")

tennis_df['Surface'] = tennis_df['Surface'].map(surface_codes)

Remaining columns: Index(['Surface', 'WRank', 'LRank', 'WPts', 'LPts', 'B365W', 'B365L'], dtype='object')
Number of rows: 30161


In [91]:
# Discard every row that has at least one missing value in any column
tennis_df = tennis_df.dropna(axis=0, how='any')
print(f"Number of rows after removing missing values: {len(tennis_df)}")

Number of rows after removing missing values: 29982


In [92]:
# Preview the first 10 rows of the frame after cleaning
tennis_df.head(10)

Unnamed: 0,Surface,WRank,LRank,WPts,LPts,B365W,B365L
0,0,28.0,57.0,1215.0,778.0,1.36,3.0
1,0,41.0,35.0,927.0,1075.0,1.61,2.2
2,0,19.0,49.0,1830.0,845.0,1.25,3.75
3,0,36.0,326.0,1070.0,137.0,1.07,9.0
4,0,43.0,30.0,897.0,1175.0,1.9,1.8
5,0,199.0,79.0,239.0,655.0,1.61,2.2
6,0,54.0,104.0,809.0,530.0,2.2,1.61
7,0,29.0,137.0,1177.0,402.0,1.44,2.62
8,0,114.0,69.0,495.0,710.0,3.0,1.36
9,0,48.0,61.0,866.0,756.0,1.36,3.0


Currently, each row is structured as a winner and a loser, but we want to predict the outcome of a match between an arbitrary player1 and player2.

We'll create columns for player1 and player2 stats and randomly swap the two players in roughly half of the rows, so that player1 is not always the winner; otherwise, the model would simply learn that player1 always wins.

In [93]:
# Re-express the winner/loser columns as player1/player2 columns.
# Draw one uniform number per row; rows below 0.5 get their two players swapped.
np.random.seed(42)  # fixed seed so the same rows are swapped on every run
flip_mask = np.random.rand(tennis_df.shape[0]) < 0.5

# (new column suffix, winner column, loser column)
stat_columns = [
    ('Rank', 'WRank', 'LRank'),
    ('Pts', 'WPts', 'LPts'),
    ('B365', 'B365W', 'B365L'),
]

for suffix, winner_col, loser_col in stat_columns:
    # Swapped rows: player1 takes the loser's value and player2 the winner's
    tennis_df[f'P1{suffix}'] = np.where(flip_mask, tennis_df[loser_col], tennis_df[winner_col])
    tennis_df[f'P2{suffix}'] = np.where(flip_mask, tennis_df[winner_col], tennis_df[loser_col])

# Label is 1 on swapped rows (player2 is the winner) and 0 otherwise (player1 is the winner)
tennis_df['label'] = flip_mask.astype(int)

# The winner/loser columns are now redundant
tennis_df = tennis_df.drop(columns=['WRank', 'LRank', 'WPts', 'LPts', 'B365W', 'B365L'])

tennis_df.head()

Unnamed: 0,Surface,P1Rank,P2Rank,P1Pts,P2Pts,P1B365,P2B365,label
0,0,57.0,28.0,778.0,1215.0,3.0,1.36,1
1,0,41.0,35.0,927.0,1075.0,1.61,2.2,0
2,0,19.0,49.0,1830.0,845.0,1.25,3.75,0
3,0,36.0,326.0,1070.0,137.0,1.07,9.0,0
4,0,30.0,43.0,1175.0,897.0,1.8,1.9,1


In [94]:
# Separate features from the target, then hold out 20% of the rows for testing
feature_cols = [col for col in tennis_df.columns if col != 'label']
X = tennis_df[feature_cols]
y = tennis_df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [95]:
# Primary model: trained on the X_train / y_train split prepared earlier

# Make sure the output directory exists before starting the long grid search,
# so a missing ../models folder cannot make joblib.dump fail after all fits are done
os.makedirs('../models', exist_ok=True)

# Hyperparameter search space (3 * 3 * 2 * 2 * 2 * 2 = 144 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 3]
}

# Exhaustive search with 5-fold cross-validation, scored on accuracy,
# running on all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Get the best model found by the search
best_model = grid_search.best_estimator_

# Predict on the held-out test split
y_pred = best_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print('XGBoost Accuracy: ', accuracy_score(y_test, y_pred))

# Save the model
joblib.dump(best_model, '../models/tennis_model.joblib')

print("Model 1 saved to ../models/ directory")


Fitting 5 folds for each of 144 candidates, totalling 720 fits



Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.8}
Best cross-validation score: 0.6901
              precision    recall  f1-score   support

           0       0.68      0.68      0.68      2992
           1       0.68      0.69      0.69      3005

    accuracy                           0.68      5997
   macro avg       0.68      0.68      0.68      5997
weighted avg       0.68      0.68      0.68      5997

XGBoost Accuracy:  0.6838419209604802
Model 1 saved to ../models/ directory


In [96]:
# Make a secondary model that doesn't use the rank or points
# (the remaining features are Surface and the two B365 odds columns)
X = tennis_df.drop(columns=['label', 'P1Rank', 'P2Rank', 'P1Pts', 'P2Pts'])
y = tennis_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Make sure the output directory exists before starting the long grid search,
# so a missing ../models folder cannot make joblib.dump fail after all fits are done
os.makedirs('../models', exist_ok=True)

# Hyperparameter search space (3 * 3 * 2 * 2 * 2 * 2 = 144 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 3]
}

# Exhaustive search with 5-fold cross-validation, scored on accuracy,
# running on all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Get the best model found by the search and predict on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print('XGBoost2 Accuracy: ', accuracy_score(y_test, y_pred))

# Save the model
joblib.dump(best_model, '../models/tennis_model2.joblib')

print("Model 2 saved to ../models/ directory")


Fitting 5 folds for each of 144 candidates, totalling 720 fits

Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.8}
Best cross-validation score: 0.6895
              precision    recall  f1-score   support

           0       0.69      0.68      0.68      2992
           1       0.68      0.69      0.69      3005

    accuracy                           0.68      5997
   macro avg       0.68      0.68      0.68      5997
weighted avg       0.68      0.68      0.68      5997

XGBoost2 Accuracy:  0.6848424212106053
Model 2 saved to ../models/ directory


In [97]:
# Make a tertiary model that uses the rank and points
# (the B365 odds columns are excluded; Surface is kept)
X = tennis_df.drop(columns=['label', 'P1B365', 'P2B365'])
y = tennis_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Make sure the output directory exists before starting the long grid search,
# so a missing ../models folder cannot make joblib.dump fail after all fits are done
os.makedirs('../models', exist_ok=True)

# Hyperparameter search space (3 * 3 * 2 * 2 * 2 * 2 = 144 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 3]
}

# Exhaustive search with 5-fold cross-validation, scored on accuracy,
# running on all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Get the best model found by the search and predict on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print('XGBoost3 Accuracy: ', accuracy_score(y_test, y_pred))

# Save the model
joblib.dump(best_model, '../models/tennis_model3.joblib')

print("Model 3 saved to ../models/ directory")

Fitting 5 folds for each of 144 candidates, totalling 720 fits

Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 50, 'subsample': 0.8}
Best cross-validation score: 0.6444
              precision    recall  f1-score   support

           0       0.65      0.65      0.65      2992
           1       0.65      0.65      0.65      3005

    accuracy                           0.65      5997
   macro avg       0.65      0.65      0.65      5997
weighted avg       0.65      0.65      0.65      5997

XGBoost3 Accuracy:  0.6513256628314157
Model 3 saved to ../models/ directory
