In [6]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd



matches_with_features = pd.read_csv('tennis_matches_enriched.csv', parse_dates=['date'])


predictors = [
    "playerCode", "opponentCode",
    "Rank_1", "Rank_2", "Pts_1", "Pts_2",
    "h2h_matches", "h2h_win_pct", "h2h_surface_matches", "h2h_surface_win_pct",
    "recent_matches_p1", "recent_win_pct_p1", "recent_avg_opp_rank_p1",
    "recent_matches_p2", "recent_win_pct_p2", "recent_avg_opp_rank_p2",
    "win_pct_surface_p1", "win_pct_surface_p2",
    "series_level", "round_num", "best_of",
    "implied_prob_p1", "implied_prob_p2",
]

# Split data - Updated version
train = matches_with_features[
    (matches_with_features['date'] >= '2000-01-03') & 
    (matches_with_features['date'] <= '2022-12-31')  # Train on first 23 years
]

test = matches_with_features[
    (matches_with_features['date'] >= '2023-01-01') & 
    (matches_with_features['date'] <= '2024-06-29')  # Test on most recent 1.5 years
]

# Optionally split train further to create a proper validation stack
X_train, X_valid, y_train, y_valid = train_test_split(
    train[predictors], train["target"], test_size=0.2, random_state=42
)

# Base models
xgb_model = XGBClassifier(
    subsample=1.0,
    n_estimators=700,
    min_child_weight=10,
    max_depth=4,
    learning_rate=0.1,
    gamma=5,
    colsample_bytree=0.6,
    random_state=1,
    n_jobs=-1
)

rf_model = RandomForestClassifier(
    n_estimators=400,
    min_samples_split=40,
    min_samples_leaf=20,
    max_features=5,
    random_state=1,
    n_jobs=-1
)

# Meta-model: Logistic Regression
meta_model = LogisticRegression()

# Stacking
stacked_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model)
    ],
    final_estimator=meta_model,
    passthrough=False,   # set to True if you want raw features also fed to meta model
    cv=5,
    n_jobs=-1
)

# Fit stacking model
stacked_model.fit(X_train, y_train)

# Evaluate on validation
valid_accuracy = stacked_model.score(X_valid, y_valid)
print(f"Validation accuracy (stacked): {valid_accuracy:.3f}")

# Final evaluation on test
test_accuracy = stacked_model.score(test[predictors], test["target"])
print(f"Test accuracy (stacked): {test_accuracy:.3f}")


Validation accuracy (stacked): 0.691
Test accuracy (stacked): 0.680


In this first attempt we run a logistic regresion to give importance to both methods xbg and rf, we get the best accuracy for the moment which is 0.68

Same with neural network

In [7]:
from sklearn.neural_network import MLPClassifier

mlp_meta = MLPClassifier(
    hidden_layer_sizes=(32, 16),  # Add another layer
    activation='relu',
    solver='adam',
    alpha=0.001,  # Add L2 regularization
    learning_rate='adaptive',  # Helps with convergence
    early_stopping=True,  # Prevents overfitting
    validation_fraction=0.1,
    max_iter=1000,
    random_state=1
)
# Stacking classifier
stacked_nn_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model)
    ],
    final_estimator=mlp_meta,
    passthrough=False,
    cv=5,
    n_jobs=-1
)

# Fit model
stacked_nn_model.fit(X_train, y_train)

# Evaluate
valid_acc = stacked_nn_model.score(X_valid, y_valid)
test_acc = stacked_nn_model.score(test[predictors], test["target"])

print(f"Validation accuracy (stacked + NN): {valid_acc:.3f}")
print(f"Test accuracy (stacked + NN): {test_acc:.3f}")

Validation accuracy (stacked + NN): 0.692
Test accuracy (stacked + NN): 0.679


With neural networks we try puting 2 layers but the accuracy doesn't increase and it stays at 0.679


This section trains and evaluates a gradient boosting model using CatBoost.

In [8]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Define categorical features for CatBoost
cat_features = ["playerCode", "opponentCode", "series_level", "round_num", "best_of"]

# Convert categorical features to string (recommended for CatBoost)
for col in cat_features:
    train[col] = train[col].astype(str)
    test[col] = test[col].astype(str)

# Define and train CatBoost model with early stopping and validation set
model_cat = CatBoostClassifier(
    iterations=8000,          # More iterations with early stopping
    depth=10,                # Medium depth (between 6-8)
    learning_rate=0.02,      # Lower learning rate for better accuracy
    l2_leaf_reg=10,          # Stronger regularization
    border_count=254,        # More splits for continuous features (default optimal)
    random_strength=1,       # Controls randomness in splits
    bagging_temperature=0.5, # Smooths bootstrap (useful for small data)
    loss_function='Logloss',
    eval_metric='Accuracy',  # Optimize accuracy directly
    cat_features=cat_features,
    early_stopping_rounds=50,# More patience for convergence
    verbose=200,
    random_state=42
)

model_cat.fit(
    train[predictors], train["target"],
    eval_set=(test[predictors], test["target"])
)

# Evaluate
acc = model_cat.score(test[predictors], test['target'])
print(f"\nCatBoost Accuracy: {acc:.3f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = train[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = train[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

0:	learn: 0.6865915	test: 0.6767701	best: 0.6767701 (0)	total: 265ms	remaining: 35m 18s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6818622696
bestIteration = 23

Shrink model to first 24 iterations.

CatBoost Accuracy: 0.682


In this version, we trained a CatBoost model using its built-in support for categorical variables and robust boosting strategy. The accuracy achieved was comparable to previous tree-based methods, reaching up to 0.684.


Here we use neural nets but applied to tabular data; TabNET

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from pytorch_tabnet.tab_model import TabNetClassifier
import torch


# Secure label encoding (mixes train+test)
for col in cat_features:
    le = LabelEncoder()
    all_vals = pd.concat([train[col], test[col]]).astype(str)
    le.fit(all_vals)
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# Array to numpy conversion
X_train = train[predictors].values
y_train = train['target'].values
X_test = test[predictors].values
y_test = test['target'].values

# correct indexs and cathegorical dimensions
cat_idxs = [predictors.index(c) for c in cat_features]

cat_dims = []
for c in cat_features:
    max_val = max(train[c].max(), test[c].max())
    cat_dims.append(int(max_val) + 1)

# TabNet model
clf = TabNetClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=8,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    verbose=10,
    seed=42,
)

# Training
clf.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=['accuracy'],
    patience=20,
    max_epochs=200,
    batch_size=1024,
    virtual_batch_size=128
)

# Prediction and results
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\n TabNet Accuracy: {acc:.4f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = le.transform(train[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = le.transform(train[col].astype(str))
A value is trying to be set on a copy of a slice from a D

epoch 0  | loss: 0.65379 | val_0_accuracy: 0.62003 |  0:00:03s
epoch 10 | loss: 0.58062 | val_0_accuracy: 0.68404 |  0:00:33s
epoch 20 | loss: 0.57627 | val_0_accuracy: 0.67507 |  0:01:05s
epoch 30 | loss: 0.55778 | val_0_accuracy: 0.67386 |  0:01:36s

Early stopping occurred at epoch 30 with best_epoch = 10 and best_val_0_accuracy = 0.68404





 TabNet Accuracy: 0.6840


When we applied TabNet, a deep learning architecture designed for tabular data, we initially tried tuning it with Optuna. However, this did not lead to accuracy improvements and significantly increased the training time. On its own, TabNet achieved an accuracy of around 0.682, which is competitive with gradient boosting models and shows potential for further improvement with additional features or longer training.

