In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd



# Load the enriched match table; 'date' is parsed as datetime so the
# chronological windows below can be written as plain date ranges.
matches_with_features = pd.read_csv('tennis_matches_enriched.csv', parse_dates=['date'])

# Feature columns handed to every model (order kept exactly as before).
predictors = [
    "playerCode",
    "opponentCode",
    "Rank_1",
    "Rank_2",
    "Pts_1",
    "Pts_2",
    "h2h_matches",
    "h2h_win_pct",
    "h2h_surface_matches",
    "h2h_surface_win_pct",
    "recent_matches_p1",
    "recent_win_pct_p1",
    "recent_avg_opp_rank_p1",
    "recent_matches_p2",
    "recent_win_pct_p2",
    "recent_avg_opp_rank_p2",
    "win_pct_surface_p1",
    "win_pct_surface_p2",
    "series_level",
    "round_num",
    "best_of",
    "implied_prob_p1",
    "implied_prob_p2",
]

# Chronological train/test cut; both date windows are inclusive at each end.
match_date = matches_with_features['date']
train = matches_with_features[match_date.between('2000-01-03', '2022-12-31')]  # first ~23 years
test = matches_with_features[match_date.between('2023-01-01', '2024-06-29')]   # most recent ~1.5 years

# Carve a 20% validation hold-out from the training window.
# NOTE(review): this hold-out is drawn with a seeded random split, unlike the
# date-based train/test cut above, so validation rows can pre-date training
# rows — confirm that is the intended validation scheme.
train_features = train[predictors]
train_labels = train["target"]
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features,
    train_labels,
    test_size=0.2,
    random_state=42,
)

# --- Base learners ---------------------------------------------------------
# Gradient-boosted trees.
xgb_model = XGBClassifier(
    n_estimators=700,
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=10,
    gamma=5,
    subsample=1.0,
    colsample_bytree=0.6,
    n_jobs=-1,
    random_state=1,
)

# Random forest.
rf_model = RandomForestClassifier(
    n_estimators=400,
    max_features=5,
    min_samples_split=40,
    min_samples_leaf=20,
    n_jobs=-1,
    random_state=1,
)

# --- Meta-learner ----------------------------------------------------------
# Logistic regression (library defaults) combines the base learners' outputs.
meta_model = LogisticRegression()

# --- Stacked ensemble ------------------------------------------------------
# passthrough=False: the meta-model sees only the base learners' predictions;
# switch it to True to feed the raw features to the meta-model as well.
stacked_model = StackingClassifier(
    estimators=[('xgb', xgb_model), ('rf', rf_model)],
    final_estimator=meta_model,
    cv=5,
    passthrough=False,
    n_jobs=-1,
)

# Train the stacked ensemble on the training portion only.
stacked_model.fit(X_train, y_train)

# Score (mean accuracy) on the validation hold-out and on the later-dated test window.
valid_accuracy = stacked_model.score(X_valid, y_valid)
test_accuracy = stacked_model.score(test[predictors], test["target"])

# Report both figures, validation first.
print(f"Validation accuracy (stacked): {valid_accuracy:.3f}")
print(f"Test accuracy (stacked): {test_accuracy:.3f}")


Validation accuracy (stacked): 0.691
Test accuracy (stacked): 0.680


In this first attempt we use a logistic regression as the meta-model to weight the predictions of both base methods (XGBoost and random forest). This gives the best test accuracy so far: 0.68.

Same approach, but with a neural network as the meta-model

In [5]:
from sklearn.neural_network import MLPClassifier

# Meta-model: a small MLP with two hidden layers (32 and 16 units).
mlp_meta = MLPClassifier(
    hidden_layer_sizes=(32, 16),  # two hidden layers
    activation='relu',
    solver='adam',
    # learning_rate='adaptive' was removed: scikit-learn only uses that
    # schedule when solver='sgd', so with 'adam' it had no effect.
    alpha=0.001,  # L2 regularization strength
    early_stopping=True,  # stop when the internal validation score stops improving
    validation_fraction=0.1,  # share of the training data held out for early stopping
    max_iter=1000,
    random_state=1
)

# Stacking classifier: same base learners as the logistic-regression stack
# (xgb_model and rf_model come from the earlier cell), MLP as final estimator.
stacked_nn_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model)
    ],
    final_estimator=mlp_meta,
    passthrough=False,  # meta-model sees only base-learner predictions
    cv=5,
    n_jobs=-1
)

# Fit model on the training portion only
stacked_nn_model.fit(X_train, y_train)

# Evaluate (mean accuracy) on the validation hold-out and on the test window
valid_acc = stacked_nn_model.score(X_valid, y_valid)
test_acc = stacked_nn_model.score(test[predictors], test["target"])

print(f"Validation accuracy (stacked + NN): {valid_acc:.3f}")
print(f"Test accuracy (stacked + NN): {test_acc:.3f}")

Validation accuracy (stacked + NN): 0.692
Test accuracy (stacked + NN): 0.679


With a neural-network meta-model we try two hidden layers, but the test accuracy does not improve and stays at 0.679.